package strings

  1. class BreakIteratorSegmenter extends StringSegmenter

  2. class RegexSegmenter extends StringSegmenter

  3. class SetBasedStopwords extends StringSet

  4. class Stopwords extends StringSet

  5. trait StringSegmentIterator extends Iterator[String]

  6. trait StringSegmenter extends AnyRef

  7. trait StringSet extends AnyRef

    A collection of standard English "stop words" — common words often left out of processing.

  1. object EmptyStringSet extends StringSet

  2. object PorterStemmer

    Rewritten from the Porter stemmer reference implementation published at http://tartarus.org/martin/PorterStemmer/.

  3. object Stopwords extends Stopwords

  4. object alphaSegmenter extends RegexSegmenter

  5. def charNGrams(word: String, min: Int, max: Int): Seq[String]

    Return Strings representing all possible character sub-sequences of length between "min" and "max", with prepended "<" and appended ">" to indicate start and end of the input string.

  6. def collapseDigits(word: String): String

  7. val containsDigitRegex: Regex

  8. object csvSegmenter extends RegexSegmenter

    For segmenting fields of a comma-separated-value file.

  9. val digitsRegex: Regex

  10. def editDistance(s: String, s2: String, substCost: Int = 1, deleteCost: Int = 1, insertCost: Int = 1): Int

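    Implements Levenshtein distance: the minimum total cost of the substitutions, deletions, and insertions needed to turn String s into String s2, with the cost of each operation given by substCost, deleteCost, and insertCost.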

  11. object foreignWordSegmenter extends RegexSegmenter

  12. def inputStreamToString(is: InputStream, encoding: String = "UTF-8"): String

    Read the entire contents of the InputStream with the given encoding, and return them as a String.

  13. object nonWhitespaceClassesSegmenter extends RegexSegmenter

  14. object nonWhitespaceSegmenter extends RegexSegmenter

  15. def porterStem(s: String): String

  16. def prefix(word: String, length: Int): String

  17. def readerToString(reader: Reader): String

    Read the entire contents of the Reader and return them as a String.

  18. val recentYearRegex: Regex

  19. def replaceDigits(word: String): String

  20. def simplifyDigits(word: String): String

    Return the input string with its digits replaced: either the whole string is replaced with "<YEAR>" or "<NUM>", or only the digits are replaced with "#".

  21. def stringShape(word: String, maxRepetitions: Int): String

    Return a string that captures the generic "shape" of the original word, mapping lowercase alphabetics to 'a', uppercase to 'A', digits to '1', whitespace to ' '.

    Return a string that captures the generic "shape" of the original word, mapping lowercase alphabetics to 'a', uppercase to 'A', digits to '1', whitespace to ' '. Within a run of the same character class, characters beyond the first 'maxRepetitions' are skipped.

  22. def suffix(word: String, length: Int): String

  23. object urlSegmenter extends RegexSegmenter

  24. object wordClassesSegmenter extends RegexSegmenter

  25. object wordSegmenter extends RegexSegmenter

