class
DeterministicTokenizer extends DocumentAnnotator
Instance Constructors
-
new
DeterministicTokenizer(caseSensitive: Boolean = false, tokenizeSgml: Boolean = false, tokenizeNewline: Boolean = false, tokenizeAllDashedWords: Boolean = false, abbrevPreceedsLowercase: Boolean = false)
Value Members
-
final
def
!=(arg0: AnyRef): Boolean
-
final
def
!=(arg0: Any): Boolean
-
final
def
##(): Int
-
final
def
==(arg0: AnyRef): Boolean
-
final
def
==(arg0: Any): Boolean
-
val
abbrev: String
-
val
abbrevs: String
-
val
ap: String
-
val
ap2: String
-
def
apply(s: String): Seq[String]
-
val
apword: String
-
final
def
asInstanceOf[T0]: T0
-
val
atuser: String
-
val
caps: String
-
val
catchAll: String
-
def
clone(): AnyRef
-
val
consonantNonAbbrevs: String
-
val
contractedWord: String
-
val
contraction: String
-
val
contraction2: String
-
val
currency: String
-
val
dash: String
-
val
dashedPrefixWord: String
-
val
dashedPrefixes: String
-
val
dashedSuffixWord: String
-
val
dashedSuffixes: String
-
val
date: String
-
val
day: String
-
def
documentAnnotationString(document: Document): String
-
val
ellipsis: String
-
val
email: String
-
val
emoticon: String
-
final
def
eq(arg0: AnyRef): Boolean
-
def
equals(arg0: Any): Boolean
-
val
filename: String
-
def
finalize(): Unit
-
val
fraction: String
-
val
frphone: String
-
final
def
getClass(): Class[_]
-
def
hashCode(): Int
-
val
hashtag: String
-
val
honorific: String
-
val
html: String
-
val
htmlAccentedLetter: String
-
val
htmlChar: String
-
val
htmlComment: String
-
val
htmlSymbol: String
-
val
initials: String
-
val
initials2: String
-
final
def
isInstanceOf[T0]: Boolean
-
val
latin: String
-
val
latin2: String
-
val
letter: String
-
val
mdash: String
-
def
mentionAnnotationString(mention: Mention): String
-
val
month: String
-
final
def
ne(arg0: AnyRef): Boolean
-
val
newline: String
-
val
noAbbrev: String
-
final
def
notify(): Unit
-
final
def
notifyAll(): Unit
-
val
number: String
-
val
number2: String
-
val
ordinals: String
-
val
org: String
-
val
patterns: ArrayBuffer[String]
-
def
phraseAnnotationString(phrase: Phrase): String
-
val
place: String
-
def
postAttrs: Iterable[Class[_]]
-
def
prereqAttrs: Iterable[Class[_]]
-
val
punc: String
-
val
quote: String
-
val
repeatedPunc: String
-
val
sgml: String
-
val
sgml2: String
-
val
space: String
-
val
state: String
-
val
state2: String
-
val
suffix: String
-
val
symbol: String
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
-
def
toString(): String
-
def
tokenAnnotationString(token: Token): String
-
val
tokenRegex: Regex
-
val
tokenRegexString: String
-
val
units: String
-
val
url: String
-
val
url2: String
-
val
url3: String
-
val
usphone: String
-
final
def
wait(): Unit
-
final
def
wait(arg0: Long, arg1: Int): Unit
-
final
def
wait(arg0: Long): Unit
-
val
word: String
Split a String into a sequence of Tokens. Aims to adhere to tokenization rules used in Ontonotes and Penn Treebank. Note that CoNLL tokenization would use tokenizeAllDashedWords=true. Punctuation that ends a sentence should be placed alone in its own Token, hence this segmentation implicitly defines sentence segmentation also. (Although the DeterministicSentenceSegmenter does make a few adjustments beyond this tokenizer.)