Update Thai (th) language support

This commit is contained in:
Wannaphong Phatthiyaphaibun 2017-09-21 00:36:02 +07:00
parent 44291f6697
commit 39bb5690f0
2 changed files with 2 additions and 39 deletions

View File

@@ -25,6 +25,4 @@ class Thai(Language):
raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
"https://github.com/wannaphongcom/pythainlp/") "https://github.com/wannaphongcom/pythainlp/")
words = [x for x in list(word_tokenize(text,"newmm"))] words = [x for x in list(word_tokenize(text,"newmm"))]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))
__all__ = ['Thai']

View File

@@ -42,39 +42,4 @@ TOKENIZER_EXCEPTIONS = {
"ธ.ค.": [ "ธ.ค.": [
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
] ]
} }
# Exceptions mapped to a single token containing only the ORTH property.
# Example: {"string": [{ORTH: "string"}]}
# Converted using the strings_to_exc() util.
'''
ORTH_ONLY = [
"a.",
"b.",
"c.",
"d.",
"e.",
"f.",
"g.",
"h.",
"i.",
"j.",
"k.",
"l.",
"m.",
"n.",
"o.",
"p.",
"q.",
"r.",
"s.",
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z."
]
'''