Reorganise Japanese language data

This commit is contained in:
ines 2017-05-08 15:50:46 +02:00
parent 51a389d3bb
commit 5edbc725d8
3 changed files with 2 additions and 38 deletions

View File

@@ -1,14 +1,10 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from os import path
from ..language import Language from ..language import Language
from ..attrs import LANG from ..attrs import LANG
from ..tokens import Doc from ..tokens import Doc
from .language_data import *
class Japanese(Language): class Japanese(Language):
lang = 'ja' lang = 'ja'
@@ -22,4 +18,5 @@ class Japanese(Language):
words = [x.surface for x in Tokenizer().tokenize(text)] words = [x.surface for x in Tokenizer().tokenize(text)]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))
EXPORT = Japanese
__all__ = ['Japanese']

View File

@@ -1,9 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
# stop words as whitespace-separated list
STOP_WORDS = set("""
""".split())

View File

@@ -1,24 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB}
}