mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Reorganise Japanese language data
This commit is contained in:
parent
51a389d3bb
commit
5edbc725d8
|
@ -1,14 +1,10 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..attrs import LANG
|
from ..attrs import LANG
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
|
||||||
from .language_data import *
|
|
||||||
|
|
||||||
|
|
||||||
class Japanese(Language):
|
class Japanese(Language):
|
||||||
lang = 'ja'
|
lang = 'ja'
|
||||||
|
@ -22,4 +18,5 @@ class Japanese(Language):
|
||||||
words = [x.surface for x in Tokenizer().tokenize(text)]
|
words = [x.surface for x in Tokenizer().tokenize(text)]
|
||||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||||
|
|
||||||
EXPORT = Japanese
|
|
||||||
|
__all__ = ['Japanese']
|
||||||
|
|
|
@ -1,9 +0,0 @@
|
||||||
# encoding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# stop words as whitespace-separated list
|
|
||||||
STOP_WORDS = set("""
|
|
||||||
。
|
|
||||||
、
|
|
||||||
""".split())
|
|
|
@ -1,24 +0,0 @@
|
||||||
# encoding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ..symbols import *
|
|
||||||
|
|
||||||
|
|
||||||
TAG_MAP = {
|
|
||||||
"ADV": {POS: ADV},
|
|
||||||
"NOUN": {POS: NOUN},
|
|
||||||
"ADP": {POS: ADP},
|
|
||||||
"PRON": {POS: PRON},
|
|
||||||
"SCONJ": {POS: SCONJ},
|
|
||||||
"PROPN": {POS: PROPN},
|
|
||||||
"DET": {POS: DET},
|
|
||||||
"SYM": {POS: SYM},
|
|
||||||
"INTJ": {POS: INTJ},
|
|
||||||
"PUNCT": {POS: PUNCT},
|
|
||||||
"NUM": {POS: NUM},
|
|
||||||
"AUX": {POS: AUX},
|
|
||||||
"X": {POS: X},
|
|
||||||
"CONJ": {POS: CONJ},
|
|
||||||
"ADJ": {POS: ADJ},
|
|
||||||
"VERB": {POS: VERB}
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user