diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 22590043f..09546467e 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -5,97 +5,148 @@ import re
 from collections import namedtuple
 
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
+from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...attrs import LANG
-from ...language import Language
-from ...tokens import Doc
 from ...compat import copy_reg
-from ...util import DummyTokenizer
+from ...language import Language
+from ...symbols import POS
+from ...tokens import Doc
+from ...util import DummyTokenizer, get_words_and_spaces
+
+# Hold the attributes we need with convenient names
+DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
 
 # Handling for multiple spaces in a row is somewhat awkward, this simplifies
 # the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
-DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
-DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
+DummySpace = DummyNode(" ", " ", " ")
 
 
-def try_fugashi_import():
-    """Fugashi is required for Japanese support, so check for it.
+def try_sudachi_import():
+    """SudachiPy is required for Japanese support, so check for it.
     It it's not available blow up and explain how to fix it."""
     try:
-        import fugashi
+        from sudachipy import dictionary, tokenizer
 
-        return fugashi
+        tok = dictionary.Dictionary().create(
+            mode=tokenizer.Tokenizer.SplitMode.A
+        )
+        return tok
     except ImportError:
         raise ImportError(
-            "Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
+            "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy"
         )
 
 
-def resolve_pos(token):
+def resolve_pos(token, next_token):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
-    in the sentence. This function adds information to the POS tag to
-    resolve ambiguous mappings.
+    in the sentence. This function returns the resolved POS values for both
+    token and next_token as a tuple.
     """
-    # this is only used for consecutive ascii spaces
-    if token.surface == " ":
-        return "空白"
+    # Some tokens have their UD tag decided based on the POS of the following
+    # token.
 
-    # TODO: This is a first take. The rules here are crude approximations.
-    # For many of these, full dependencies are needed to properly resolve
-    # PoS mappings.
-    if token.pos == "連体詞,*,*,*":
-        if re.match(r"[こそあど此其彼]の", token.surface):
-            return token.pos + ",DET"
-        if re.match(r"[こそあど此其彼]", token.surface):
-            return token.pos + ",PRON"
-        return token.pos + ",ADJ"
-    return token.pos
+    # orth based rules
+    if token.pos[0] in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[token.pos[0]]
+        if token.surface in orth_map:
+            return orth_map[token.surface], None
+
+    # tag bi-gram mapping
+    if next_token:
+        tag_bigram = token.pos[0], next_token.pos[0]
+        if tag_bigram in TAG_BIGRAM_MAP:
+            bipos = TAG_BIGRAM_MAP[tag_bigram]
+            if bipos[0] is None:
+                return TAG_MAP[token.pos[0]][POS], bipos[1]
+            else:
+                return bipos
+
+    return TAG_MAP[token.pos[0]][POS], None
 
 
-def get_words_and_spaces(tokenizer, text):
-    """Get the individual tokens that make up the sentence and handle white space.
+# Use a mapping of paired punctuation to avoid splitting quoted sentences. +pairpunct = {'「':'」', '『': '』', '【': '】'} - Japanese doesn't usually use white space, and MeCab's handling of it for - multiple spaces in a row is somewhat awkward. + +def separate_sentences(doc): + """Given a doc, mark tokens that start sentences based on Unidic tags. """ - tokens = tokenizer.parseToNodeList(text) + stack = [] # save paired punctuation + for i, token in enumerate(doc[:-2]): + # Set all tokens after the first to false by default. This is necessary + # for the doc code to be aware we've done sentencization, see + # `is_sentenced`. + token.sent_start = (i == 0) + if token.tag_: + if token.tag_ == "補助記号-括弧開": + ts = str(token) + if ts in pairpunct: + stack.append(pairpunct[ts]) + elif stack and ts == stack[-1]: + stack.pop() + + if token.tag_ == "補助記号-句点": + next_token = doc[i+1] + if next_token.tag_ != token.tag_ and not stack: + next_token.sent_start = True + + +def get_dtokens(tokenizer, text): + tokens = tokenizer.tokenize(text) words = [] - spaces = [] - for token in tokens: - # If there's more than one space, spaces after the first become tokens - for ii in range(len(token.white_space) - 1): - words.append(DummySpace) - spaces.append(False) - - words.append(token) - spaces.append(bool(token.white_space)) - return words, spaces + for ti, token in enumerate(tokens): + tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) + inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) + dtoken = DetailedToken( + token.surface(), + (tag, inf), + token.dictionary_form()) + if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': + # don't add multiple space tokens in a row + continue + words.append(dtoken) + # remove empty tokens. These can be produced with characters like … that + # Sudachi normalizes internally. 
+ words = [ww for ww in words if len(ww.surface) > 0] + return words class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_fugashi_import().Tagger() - self.tokenizer.parseToNodeList("") # see #2901 + self.tokenizer = try_sudachi_import() def __call__(self, text): - dtokens, spaces = get_words_and_spaces(self.tokenizer, text) + dtokens = get_dtokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + words, spaces = get_words_and_spaces(words, text) + unidic_tags = [",".join(x.pos) for x in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) - unidic_tags = [] - for token, dtoken in zip(doc, dtokens): - unidic_tags.append(dtoken.pos) - token.tag_ = resolve_pos(dtoken) + next_pos = None + for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): + ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None + token.tag_ = dtoken.pos[0] + if next_pos: + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos(dtoken, ntoken) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.feature.lemma or dtoken.surface + token.lemma_ = dtoken.lemma doc.user_data["unidic_tags"] = unidic_tags + + separate_sentences(doc) return doc @@ -104,6 +155,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py new file mode 100644 index 000000000..7c3eee336 --- /dev/null +++ b/spacy/lang/ja/bunsetu.py @@ -0,0 +1,144 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + + +POS_PHRASE_MAP = { + "NOUN": "NP", + "NUM": "NP", + "PRON": "NP", + "PROPN": "NP", + + "VERB": "VP", + + "ADJ": "ADJP", + + "ADV": "ADVP", + + "CCONJ": "CCONJP", +} + + +# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] +def yield_bunsetu(doc, debug=False): + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + prev = None + prev_tag = None + prev_dep = None + prev_head = None + for t in doc: + pos = t.pos_ + pos_type = POS_PHRASE_MAP.get(pos, None) + tag = t.tag_ + dep = t.dep_ + head = t.head.i + if debug: + print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + + # DET is always an individual bunsetu + if pos == "DET": + if bunsetu: + yield bunsetu, phrase_type, phrase + yield [t], None, None + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + + # PRON or Open PUNCT always splits bunsetu + elif tag == "補助記号-括弧開": + if bunsetu: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = True + phrase_type = None + phrase = None + + # bunsetu head not appeared + elif phrase_type is None: + if bunsetu and prev_tag == "補助記号-読点": + yield bunsetu, phrase_type, phrase + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + bunsetu.append(t) + if pos_type: # begin phrase + phrase = [t] + phrase_type = pos_type + if pos_type in {"ADVP", "CCONJP"}: + bunsetu_may_end = True + + # entering new bunsetu + elif pos_type and ( + pos_type != phrase_type or # different phrase type arises + bunsetu_may_end # same phrase type but bunsetu already ended + ): + # exceptional case: NOUN to VERB + if 
phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + bunsetu.append(t) + phrase_type = "VP" + phrase.append(t) + # exceptional case: VERB to NOUN + elif phrase_type == "VP" and pos_type == "NP" and ( + prev_dep == 'compound' and prev_head == t.i or + dep == 'compound' and prev == head or + prev_dep == 'nmod' and prev_head == t.i + ): + bunsetu.append(t) + phrase_type = "NP" + phrase.append(t) + else: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = False + phrase_type = pos_type + phrase = [t] + + # NOUN bunsetu + elif phrase_type == "NP": + bunsetu.append(t) + if not bunsetu_may_end and (( + (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # VERB bunsetu + elif phrase_type == "VP": + bunsetu.append(t) + if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + phrase.append(t) + else: + bunsetu_may_end = True + + # ADJ bunsetu + elif phrase_type == "ADJP" and tag != '連体詞': + bunsetu.append(t) + if not bunsetu_may_end and (( + pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # other bunsetu + else: + bunsetu.append(t) + + prev = t.i + prev_tag = t.tag_ + prev_dep = t.dep_ + prev_head = head + + if bunsetu: + yield bunsetu, phrase_type, phrase diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py new file mode 100644 index 000000000..cd1e4fde7 --- /dev/null +++ b/spacy/lang/ja/syntax_iterators.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB + +# XXX this can probably be pruned a bit +labels = [ + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", +] + +def noun_chunks(obj): + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + + doc = obj.doc # Ensure works on both Doc and Span. + np_deps = [doc.vocab.strings.add(label) for label in labels] + conj = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + seen = set() + for i, word in enumerate(obj): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.i in seen: + continue + if word.dep in np_deps: + unseen = [w.i for w in word.subtree if w.i not in seen] + if not unseen: + continue + + # this takes care of particles etc. 
+ seen.update(j.i for j in word.subtree) + # This avoids duplicating embedded clauses + seen.update(range(word.i + 1)) + + # if the head of this is a verb, mark that and rights seen + # Don't do the subtree as that can hide other phrases + if word.head.pos == VERB: + seen.add(word.head.i) + seen.update(w.i for w in word.head.rights) + yield unseen[0], word.i + 1, np_label + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py new file mode 100644 index 000000000..5ed9aec89 --- /dev/null +++ b/spacy/lang/ja/tag_bigram_map.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB + +# mapping from tag bi-gram to pos of previous token +TAG_BIGRAM_MAP = { + # This covers only small part of AUX. + ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), + + ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), + # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), + + # This covers acl, advcl, obl and root, but has side effect for compound. + ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), + # This covers almost all of the deps + ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), + + ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), + ("副詞", "動詞-非自立可能"): (None, VERB), + ("形容詞-一般", "動詞-非自立可能"): (None, VERB), + ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB), + ("接頭辞", "動詞-非自立可能"): (None, VERB), + ("助詞-係助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), + ("補助記号-読点", "動詞-非自立可能"): (None, VERB), + + ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), + + ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), + ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), + + ("動詞-一般", "助詞-副助詞"): (None, PART), + ("動詞-非自立可能", "助詞-副助詞"): (None, PART), + ("助動詞", "助詞-副助詞"): (None, PART), +} diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..ad416e109 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,82 +1,104 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN +from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { # Explanation of Unidic tags: # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf - # Universal Dependencies Mapping: + # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below) # http://universaldependencies.org/ja/overview/morphology.html # http://universaldependencies.org/ja/pos/all.html - "記号,一般,*,*": { - POS: PUNCT + "記号-一般": { + POS: NOUN }, # this includes characters used to represent sounds like ドレミ - "記号,文字,*,*": { - POS: PUNCT - }, # this is for Greek and Latin characters used as sumbols, as in math - "感動詞,フィラー,*,*": {POS: INTJ}, - "感動詞,一般,*,*": {POS: INTJ}, - # this is specifically for unicode full-width space - "空白,*,*,*": {POS: X}, - # This is used when sequential half-width spaces are present + "記号-文字": { + POS: NOUN + }, # this is for Greek and Latin characters having some meanings, or used as symbols, as in math + "感動詞-フィラー": {POS: INTJ}, + "感動詞-一般": {POS: INTJ}, + "空白": {POS: SPACE}, - "形状詞,一般,*,*": {POS: ADJ}, - "形状詞,タリ,*,*": {POS: ADJ}, - "形状詞,助動詞語幹,*,*": {POS: ADJ}, - "形容詞,一般,*,*": {POS: ADJ}, - "形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise - "助詞,格助詞,*,*": {POS: ADP}, - "助詞,係助詞,*,*": {POS: ADP}, - "助詞,終助詞,*,*": {POS: PART}, - "助詞,準体助詞,*,*": {POS: 
SCONJ}, # の as in 走るのが速い - "助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て - "助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb - "助動詞,*,*,*": {POS: AUX}, - "接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement - "接頭辞,*,*,*": {POS: NOUN}, - "接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック - "接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい - "接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ - "接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* - "接尾辞,名詞的,一般,*": {POS: NOUN}, - "接尾辞,名詞的,助数詞,*": {POS: NOUN}, - "接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ - "代名詞,*,*,*": {POS: PRON}, - "動詞,一般,*,*": {POS: VERB}, - "動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise - "動詞,非自立可能,*,*,AUX": {POS: AUX}, - "動詞,非自立可能,*,*,VERB": {POS: VERB}, - "副詞,*,*,*": {POS: ADV}, - "補助記号,AA,一般,*": {POS: SYM}, # text art - "補助記号,AA,顔文字,*": {POS: SYM}, # kaomoji - "補助記号,一般,*,*": {POS: SYM}, - "補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket - "補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket - "補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker - "補助記号,読点,*,*": {POS: PUNCT}, # comma - "名詞,固有名詞,一般,*": {POS: PROPN}, # general proper noun - "名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name - "名詞,固有名詞,人名,姓": {POS: PROPN}, # surname - "名詞,固有名詞,人名,名": {POS: PROPN}, # first name - "名詞,固有名詞,地名,一般": {POS: PROPN}, # place name - "名詞,固有名詞,地名,国": {POS: PROPN}, # country name - "名詞,助動詞語幹,*,*": {POS: AUX}, - "名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals - "名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun - "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB}, - "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手 - "名詞,普通名詞,一般,*": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 - "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ}, - "名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit - "名詞,普通名詞,副詞可能,*": {POS: NOUN}, - "連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token - "連体詞,*,*,*,ADJ": {POS: ADJ}, - "連体詞,*,*,*,PRON": {POS: PRON}, - "連体詞,*,*,*,DET": {POS: DET}, + + "形状詞-一般": {POS: ADJ}, + "形状詞-タリ": {POS: ADJ}, + "形状詞-助動詞語幹": {POS: AUX}, + + "形容詞-一般": {POS: ADJ}, + + "形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise + + "助詞-格助詞": {POS: ADP}, + + "助詞-係助詞": {POS: ADP}, + + "助詞-終助詞": {POS: PART}, + "助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い + "助詞-接続助詞": {POS: SCONJ}, # verb ending て0 + + "助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb + + "助動詞": {POS: AUX}, + + "接続詞": {POS: CCONJ}, # XXX: might need refinement + "接頭辞": {POS: NOUN}, + "接尾辞-形状詞的": {POS: PART}, # がち, チック + + "接尾辞-形容詞的": {POS: AUX}, # -らしい + + "接尾辞-動詞的": {POS: PART}, # -じみ + "接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞-名詞的-一般": {POS: NOUN}, + "接尾辞-名詞的-助数詞": {POS: NOUN}, + "接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ + + "代名詞": {POS: PRON}, + + "動詞-一般": {POS: VERB}, + + "動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise + + "副詞": {POS: ADV}, + + "補助記号-AA-一般": {POS: SYM}, # text art + "補助記号-AA-顔文字": {POS: PUNCT}, # kaomoji + + "補助記号-一般": {POS: SYM}, + + "補助記号-括弧開": {POS: PUNCT}, # open bracket + "補助記号-括弧閉": {POS: PUNCT}, # close bracket + "補助記号-句点": {POS: PUNCT}, # period or other EOS marker + "補助記号-読点": {POS: PUNCT}, # comma + + "名詞-固有名詞-一般": {POS: PROPN}, # general proper noun + "名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name + "名詞-固有名詞-人名-姓": {POS: PROPN}, # surname + "名詞-固有名詞-人名-名": {POS: PROPN}, # first name + "名詞-固有名詞-地名-一般": {POS: PROPN}, # place name + "名詞-固有名詞-地名-国": {POS: PROPN}, # country name + + "名詞-助動詞語幹": {POS: AUX}, + "名詞-数詞": {POS: NUM}, # includes Chinese 
numerals + + "名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + + "名詞-普通名詞-サ変形状詞可能": {POS: NOUN}, + + "名詞-普通名詞-一般": {POS: NOUN}, + + "名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 + + "名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit + + "名詞-普通名詞-副詞可能": {POS: NOUN}, + + "連体詞": {POS: DET}, # XXX this has exceptions based on literal token + + # GSD tags. These aren't in Unidic, but we need them for the GSD data. + "外国語": {POS: PROPN}, # Foreign words + + "絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^; + } diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py new file mode 100644 index 000000000..355cc655b --- /dev/null +++ b/spacy/lang/ja/tag_orth_map.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X + +# mapping from tag bi-gram to pos of previous token +TAG_ORTH_MAP = { + "空白": { + " ": SPACE, + " ": X, + }, + "助詞-副助詞": { + "たり": PART, + }, + "連体詞": { + "あの": DET, + "かの": DET, + "この": DET, + "その": DET, + "どの": DET, + "彼の": DET, + "此の": DET, + "其の": DET, + "ある": PRON, + "こんな": PRON, + "そんな": PRON, + "どんな": PRON, + "あらゆる": PRON, + }, +} diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..58cd3f3bf 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -6,7 +6,7 @@ import pytest @pytest.mark.parametrize( "word,lemma", - [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], ) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..5213aed58 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -14,20 +14,26 @@ TOKENIZER_TESTS = [ ] TAG_TESTS = [ - ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']), - ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']), - ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']), - ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']), - ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*']) + ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']), + ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']), + ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']), + ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']), + ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']) ] POS_TESTS = [ - ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']), + ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']), ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']), ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']), ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 
'PART', 'PUNCT']),
     ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
 ]
+
+SENTENCE_TESTS = [
+    ('あれ。これ。', ['あれ。', 'これ。']),
+    ('「伝染るんです。」という漫画があります。',
+     ['「伝染るんです。」という漫画があります。']),
+    ]
 # fmt: on
@@ -43,14 +49,27 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags
 
 
+# XXX: The expected values in POS_TESTS still need checking against the new
+# tag maps (note the "fish" placeholder in the first entry).
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
 
+@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
+    sents = [str(sent) for sent in ja_tokenizer(text).sents]
+    assert sents == expected_sents
+
 
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == " "
-    assert tokens[2].orth_ == " "
+    assert tokens[1].orth_ == "  "
+
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
+    tokens = ja_tokenizer(text)
+    assert tokens.text_with_ws == text
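
For reference, a minimal usage sketch of the pipeline this patch wires up (not part of the diff): it assumes SudachiPy and a Sudachi dictionary are installed and uses the existing `Japanese` language class from `spacy.lang.ja`; the example sentence and printed values are illustrative only.

    # Minimal sketch, assuming SudachiPy + a Sudachi dictionary are installed.
    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("日本語の文章です。これは二文目です。")
    for token in doc:
        # tag_ holds the hyphen-joined Unidic tag; pos_ is the UD tag resolved
        # via TAG_MAP / TAG_ORTH_MAP / TAG_BIGRAM_MAP; lemma_ is Sudachi's
        # dictionary form
        print(token.text, token.tag_, token.pos_, token.lemma_)
    # separate_sentences() marks sentence starts at 句点 boundaries, so
    # doc.sents is available without a dependency parse
    print([sent.text for sent in doc.sents])
    # the raw Unidic tag/inflection pairs are preserved for downstream use
    print(doc.user_data["unidic_tags"])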