Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 04:08:09 +03:00)
Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: add inflections, reading_forms, and sub_tokens; drop unidic_tags; improve code readability around the token alignment procedure
* add test cases; replace fugashi with sudachipy in conftest
* move bunsetu.py to the spaCy Universe as a pipeline component, BunsetuRecognizer
* space-token check: "tag is space" -> "both surface and tag are spaces"
* handle the len(text) == 0 case
parent c34420794a
commit 150a39ccca
@@ -20,12 +20,7 @@ from ... import util


 # Hold the attributes we need with convenient names
-DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
-
-# Handling for multiple spaces in a row is somewhat awkward, this simplifies
-# the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
-DummySpace = DummyNode(" ", " ", " ")
+DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])


 def try_sudachi_import(split_mode="A"):
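The hunk above replaces the three-field DetailedToken (and the DummyNode/DummySpace placeholders previously used for runs of spaces) with a single six-field namedtuple, so each Sudachi morpheme can carry its inflection, reading form, and optional sub-token analysis alongside the surface, tag, and lemma. A minimal sketch of the new record, using field values borrowed from the test fixtures further down; this is an illustration, not code from the diff:

from collections import namedtuple

# Same field layout as the DetailedToken introduced by this commit.
DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
)

# One morpheme of 選挙管理委員会; sub_tokens stays None unless the tokenizer
# runs in split mode B or C.
senkyo = DetailedToken(
    surface="選挙",
    tag="名詞-普通名詞-サ変可能",
    inf="",
    lemma="選挙",
    reading="センキョ",
    sub_tokens=None,
)
print(senkyo.tag, senkyo.reading)

Whitespace no longer needs DummySpace: gap spans are emitted as ordinary DetailedToken records carrying the gap tag 空白 by get_dtokens_and_spaces(), shown below.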
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
     )


-def resolve_pos(orth, pos, next_pos):
+def resolve_pos(orth, tag, next_tag):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
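resolve_pos() keeps its three-step lookup (orth-specific rules, tag bi-gram rules, then the plain tag map) but now receives Unidic tag strings instead of the old (tag, inf) tuples, as the next hunk shows. The toy tables below only illustrate the shapes involved; the real TAG_ORTH_MAP, TAG_BIGRAM_MAP and TAG_MAP shipped with the Japanese language data are much larger, their entries differ, and they map to spaCy POS symbols rather than strings:

# Hypothetical, heavily trimmed stand-ins for the real mapping tables.
POS = "pos"  # in spaCy this is the POS attribute ID, not a string
TAG_ORTH_MAP = {
    # same Unidic tag, different UD POS depending on the literal token
    "名詞-普通名詞-副詞可能": {"いつ": "PRON"},
}
TAG_BIGRAM_MAP = {
    # the tag of the following token decides how the current one is mapped
    ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): ("VERB", "AUX"),
}
TAG_MAP = {
    "名詞-普通名詞-サ変可能": {POS: "NOUN"},
    "名詞-普通名詞-副詞可能": {POS: "NOUN"},
}

def resolve_pos(orth, tag, next_tag):
    # mirrors the lookup order of the new implementation in the next hunk
    if tag in TAG_ORTH_MAP and orth in TAG_ORTH_MAP[tag]:
        return TAG_ORTH_MAP[tag][orth], None
    if next_tag and (tag, next_tag) in TAG_BIGRAM_MAP:
        current_pos, next_pos = TAG_BIGRAM_MAP[(tag, next_tag)]
        return (TAG_MAP[tag][POS] if current_pos is None else current_pos), next_pos
    return TAG_MAP[tag][POS], None

assert resolve_pos("いつ", "名詞-普通名詞-副詞可能", None) == ("PRON", None)
assert resolve_pos("実施", "名詞-普通名詞-サ変可能", "動詞-非自立可能") == ("VERB", "AUX")
assert resolve_pos("実施", "名詞-普通名詞-サ変可能", None) == ("NOUN", None)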
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
     # Some tokens have their UD tag decided based on the POS of the following
     # token.

-    # orth based rules
-    if pos[0] in TAG_ORTH_MAP:
-        orth_map = TAG_ORTH_MAP[pos[0]]
+    # apply orth based mapping
+    if tag in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[tag]
         if orth in orth_map:
-            return orth_map[orth], None
+            return orth_map[orth], None  # current_pos, next_pos

-    # tag bi-gram mapping
-    if next_pos:
-        tag_bigram = pos[0], next_pos[0]
+    # apply tag bi-gram mapping
+    if next_tag:
+        tag_bigram = tag, next_tag
         if tag_bigram in TAG_BIGRAM_MAP:
-            bipos = TAG_BIGRAM_MAP[tag_bigram]
-            if bipos[0] is None:
-                return TAG_MAP[pos[0]][POS], bipos[1]
+            current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
+            if current_pos is None:  # apply tag uni-gram mapping for current_pos
+                return TAG_MAP[tag][POS], next_pos  # only next_pos is identified by tag bi-gram mapping
             else:
-                return bipos
+                return current_pos, next_pos

-    return TAG_MAP[pos[0]][POS], None
+    # apply tag uni-gram mapping
+    return TAG_MAP[tag][POS], None


-# Use a mapping of paired punctuation to avoid splitting quoted sentences.
-pairpunct = {'「':'」', '『': '』', '【': '】'}
-
-
-def separate_sentences(doc):
-    """Given a doc, mark tokens that start sentences based on Unidic tags.
-    """
-
-    stack = []  # save paired punctuation
-
-    for i, token in enumerate(doc[:-2]):
-        # Set all tokens after the first to false by default. This is necessary
-        # for the doc code to be aware we've done sentencization, see
-        # `is_sentenced`.
-        token.sent_start = (i == 0)
-        if token.tag_:
-            if token.tag_ == "補助記号-括弧開":
-                ts = str(token)
-                if ts in pairpunct:
-                    stack.append(pairpunct[ts])
-                elif stack and ts == stack[-1]:
-                    stack.pop()
-
-            if token.tag_ == "補助記号-句点":
-                next_token = doc[i+1]
-                if next_token.tag_ != token.tag_ and not stack:
-                    next_token.sent_start = True
-
-
-def get_dtokens(tokenizer, text):
-    tokens = tokenizer.tokenize(text)
-    words = []
-    for ti, token in enumerate(tokens):
-        tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
-        inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
-        dtoken = DetailedToken(
-            token.surface(),
-            (tag, inf),
-            token.dictionary_form())
-        if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
-            # don't add multiple space tokens in a row
-            continue
-        words.append(dtoken)
-
-    # remove empty tokens. These can be produced with characters like … that
-    # Sudachi normalizes internally.
-    words = [ww for ww in words if len(ww.surface) > 0]
-    return words
-
-
-def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
+def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
+    # Compare the content of tokens and text, first
     words = [x.surface for x in dtokens]
     if "".join("".join(words).split()) != "".join(text.split()):
         raise ValueError(Errors.E194.format(text=text, words=words))
-    text_words = []
-    text_lemmas = []
-    text_tags = []
+
+    text_dtokens = []
     text_spaces = []
     text_pos = 0
     # handle empty and whitespace-only texts
     if len(words) == 0:
-        return text_words, text_lemmas, text_tags, text_spaces
+        return text_dtokens, text_spaces
     elif len([word for word in words if not word.isspace()]) == 0:
         assert text.isspace()
-        text_words = [text]
-        text_lemmas = [text]
-        text_tags = [gap_tag]
+        text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
         text_spaces = [False]
-        return text_words, text_lemmas, text_tags, text_spaces
-    # normalize words to remove all whitespace tokens
-    norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
-    # align words with text
-    for word, dtoken in zip(norm_words, norm_dtokens):
+        return text_dtokens, text_spaces
+
+    # align words and dtokens by referring text, and insert gap tokens for the space char spans
+    for word, dtoken in zip(words, dtokens):
+        # skip all space tokens
+        if word.isspace():
+            continue
         try:
             word_start = text[text_pos:].index(word)
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))

+        # space token
         if word_start > 0:
             w = text[text_pos:text_pos + word_start]
-            text_words.append(w)
-            text_lemmas.append(w)
-            text_tags.append(gap_tag)
+            text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
             text_spaces.append(False)
             text_pos += word_start
-        text_words.append(word)
-        text_lemmas.append(dtoken.lemma)
-        text_tags.append(dtoken.pos)
+
+        # content word
+        text_dtokens.append(dtoken)
         text_spaces.append(False)
         text_pos += len(word)
+
+        # poll a space char after the word
         if text_pos < len(text) and text[text_pos] == " ":
             text_spaces[-1] = True
             text_pos += 1

+    # trailing space token
     if text_pos < len(text):
         w = text[text_pos:]
-        text_words.append(w)
-        text_lemmas.append(w)
-        text_tags.append(gap_tag)
+        text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
         text_spaces.append(False)
-    return text_words, text_lemmas, text_tags, text_spaces
+
+    return text_dtokens, text_spaces


 class JapaneseTokenizer(DummyTokenizer):
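get_dtokens_and_spaces() replaces get_words_lemmas_tags_spaces(): instead of three parallel lists it returns the DetailedToken list plus the spaces flags needed to build the Doc. It skips Sudachi's own whitespace tokens and re-inserts whitespace from the original text either as the trailing-space flag of the preceding token (a single ASCII space) or as a gap DetailedToken tagged 空白 (full-width spaces and any remaining whitespace). A rough usage sketch, assuming sudachipy and its dictionary are installed; exact token boundaries depend on the dictionary version:

from spacy.lang.ja import Japanese

nlp = Japanese()
doc = nlp("日本語　です")  # full-width space between the two chunks
print([(t.text, t.tag_, t.whitespace_) for t in doc])
# The full-width space comes back as its own token with tag_ == "空白",
# whereas a single ASCII space would instead appear as whitespace_ on the
# token before it. Either way the input round-trips losslessly:
assert doc.text == "日本語　です"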
@@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer = try_sudachi_import(self.split_mode)

     def __call__(self, text):
-        dtokens = get_dtokens(self.tokenizer, text)
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

-        words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
+        sub_tokens_list = list(sub_tokens_list)
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None
-        for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
-            token.tag_ = unidic_tag[0]
-            if next_pos:
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
                 token.pos = next_pos
                 next_pos = None
             else:
                 token.pos, next_pos = resolve_pos(
                     token.orth_,
-                    unidic_tag,
-                    unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None
                 )

             # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = lemma
-        doc.user_data["unidic_tags"] = unidic_tags
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list

         return doc

+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
+        sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']),  # tag
+                ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx] if sub_tokens_list else None,  # user_data['sub_tokens']
+            ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t for idx, t in enumerate(dtokens) if
+            idx == 0 or
+            not t.surface.isspace() or t.tag != '空白' or
+            not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if self.split_mode is None or self.split_mode == "A":  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
+        return sub_tokens_list
+
     def _get_config(self):
         config = OrderedDict(
             (
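With __call__ wired up this way, the extra Sudachi information travels on Doc.user_data rather than on the tokens themselves. A short usage sketch, again assuming sudachipy is available; the sample string and the split-mode construction are the ones used by the new tests further down:

from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

print(doc.user_data["reading_forms"])  # katakana reading per token
print(doc.user_data["inflections"])    # conjugation info, "" for uninflected tokens
print(doc.user_data["sub_tokens"])     # per-token A/B sub-splits; every entry is None in split mode A

# doc.user_data["unidic_tags"] is removed by this commit; the Unidic tag is
# still available directly as token.tag_.
print([t.tag_ for t in doc])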
bunsetu.py (deleted; per the commit message it moves to the spaCy Universe as BunsetuRecognizer):

@@ -1,144 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
-POS_PHRASE_MAP = {
-    "NOUN": "NP",
-    "NUM": "NP",
-    "PRON": "NP",
-    "PROPN": "NP",
-
-    "VERB": "VP",
-
-    "ADJ": "ADJP",
-
-    "ADV": "ADVP",
-
-    "CCONJ": "CCONJP",
-}
-
-
-# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
-def yield_bunsetu(doc, debug=False):
-    bunsetu = []
-    bunsetu_may_end = False
-    phrase_type = None
-    phrase = None
-    prev = None
-    prev_tag = None
-    prev_dep = None
-    prev_head = None
-    for t in doc:
-        pos = t.pos_
-        pos_type = POS_PHRASE_MAP.get(pos, None)
-        tag = t.tag_
-        dep = t.dep_
-        head = t.head.i
-        if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
-
-        # DET is always an individual bunsetu
-        if pos == "DET":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            yield [t], None, None
-            bunsetu = []
-            bunsetu_may_end = False
-            phrase_type = None
-            phrase = None
-
-        # PRON or Open PUNCT always splits bunsetu
-        elif tag == "補助記号-括弧開":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            bunsetu = [t]
-            bunsetu_may_end = True
-            phrase_type = None
-            phrase = None
-
-        # bunsetu head not appeared
-        elif phrase_type is None:
-            if bunsetu and prev_tag == "補助記号-読点":
-                yield bunsetu, phrase_type, phrase
-                bunsetu = []
-                bunsetu_may_end = False
-                phrase_type = None
-                phrase = None
-            bunsetu.append(t)
-            if pos_type:  # begin phrase
-                phrase = [t]
-                phrase_type = pos_type
-                if pos_type in {"ADVP", "CCONJP"}:
-                    bunsetu_may_end = True
-
-        # entering new bunsetu
-        elif pos_type and (
-                pos_type != phrase_type or  # different phrase type arises
-                bunsetu_may_end  # same phrase type but bunsetu already ended
-        ):
-            # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
-                bunsetu.append(t)
-                phrase_type = "VP"
-                phrase.append(t)
-            # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                    prev_dep == 'compound' and prev_head == t.i or
-                    dep == 'compound' and prev == head or
-                    prev_dep == 'nmod' and prev_head == t.i
-            ):
-                bunsetu.append(t)
-                phrase_type = "NP"
-                phrase.append(t)
-            else:
-                yield bunsetu, phrase_type, phrase
-                bunsetu = [t]
-                bunsetu_may_end = False
-                phrase_type = pos_type
-                phrase = [t]
-
-        # NOUN bunsetu
-        elif phrase_type == "NP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                    (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                    pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # VERB bunsetu
-        elif phrase_type == "VP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                    pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                    pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # other bunsetu
-        else:
-            bunsetu.append(t)
-
-        prev = t.i
-        prev_tag = t.tag_
-        prev_dep = t.dep_
-        prev_head = head
-
-    if bunsetu:
-        yield bunsetu, phrase_type, phrase
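The bunsetu (Japanese phrase-unit) chunker above is removed from the core library; per the commit message it is meant to live on in the spaCy Universe as a pipeline component called BunsetuRecognizer. Its published API may differ, but a spaCy v2-style component wrapping a generator like the removed yield_bunsetu could look roughly like this; the extension name and wiring are illustrative only:

from spacy.tokens import Doc

# Register a custom extension to hold the recognized bunsetu spans.
Doc.set_extension("bunsetu_spans", default=None, force=True)


class BunsetuRecognizer(object):
    name = "bunsetu_recognizer"

    def __init__(self, bunsetu_func):
        # e.g. a copy of the removed yield_bunsetu generator
        self.bunsetu_func = bunsetu_func

    def __call__(self, doc):
        spans = []
        for tokens, phrase_type, phrase in self.bunsetu_func(doc):
            spans.append(doc[tokens[0].i : tokens[-1].i + 1])
        doc._.bunsetu_spans = spans
        return doc


# Usage (requires tagger/parser output, since yield_bunsetu reads pos_, dep_ and head):
# nlp.add_pipe(BunsetuRecognizer(yield_bunsetu), last=True)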
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest

 from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
-from spacy.lang.ja import Japanese
+from spacy.lang.ja import Japanese, DetailedToken

 # fmt: off
 TOKENIZER_TESTS = [
@@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_c(text)) == len_c


+@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c",
+    [
+        (
+            "選挙管理委員会",
+            [None, None, None, None],
+            [None, None, [
+                [
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ]
+            ]],
+            [[
+                [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ], [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None),
+                ]
+            ]]
+        ),
+    ]
+)
+def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c):
+    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
+    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
+    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
+
+    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
+    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
+
+
+@pytest.mark.parametrize("text,inflections,reading_forms",
+    [
+        (
+            "取ってつけた",
+            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
+            ("トッ", "テ", "ツケ", "タ"),
+        ),
+    ]
+)
+def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
+    assert ja_tokenizer(text).user_data["inflections"] == inflections
+    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
+
+
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
     doc = ja_tokenizer("")
     assert len(doc) == 0