Reorganise Japanese language data

This commit is contained in:
ines 2017-05-08 15:50:46 +02:00
parent 51a389d3bb
commit 5edbc725d8
3 changed files with 2 additions and 38 deletions

View File

@ -1,14 +1,10 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
from ..tokens import Doc
from .language_data import *
class Japanese(Language):
lang = 'ja'
@ -22,4 +18,5 @@ class Japanese(Language):
words = [x.surface for x in Tokenizer().tokenize(text)]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
# Module-level alias bound to the Japanese class.
# NOTE(review): presumably read by whatever loads language modules - confirm.
EXPORT = Japanese
# Limit `from <this module> import *` to the Japanese class.
__all__ = ['Japanese']

View File

@ -1,9 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
# Stop words are written as a single whitespace-separated string. The string
# currently holds no words, so the resulting set is empty.
_STOP_WORDS_TEXT = """
"""
STOP_WORDS = set(_STOP_WORDS_TEXT.split())

View File

@ -1,24 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
# Tag map: each tag string is mapped to an attribute dict whose POS entry is
# the symbol of the same name.
TAG_MAP = {
    "ADV":   {POS: ADV},
    "NOUN":  {POS: NOUN},
    "ADP":   {POS: ADP},
    "PRON":  {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET":   {POS: DET},
    "SYM":   {POS: SYM},
    "INTJ":  {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM":   {POS: NUM},
    "AUX":   {POS: AUX},
    "X":     {POS: X},
    "CONJ":  {POS: CONJ},
    "ADJ":   {POS: ADJ},
    "VERB":  {POS: VERB},
}