Moving Japanese tokenizer extra info to Token.morph (#8977)

* Use morph for extra Japanese tokenizer info

Previously, Japanese tokenizer info that didn't correspond to Token
fields was put in user data. Since spaCy core should avoid touching user
data, this moves most information to the Token.morph attribute. It also
adds the normalized form, which wasn't exposed before.

The subtokens, which are a list of full tokens, are still added to user
data, except with the default tokenizer granularity. With the default
tokenizer settings the subtokens are all None, so in this case the user
data is simply not set.
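
A rough sketch of the resulting behaviour, assuming `sudachipy` is installed and using the default tokenizer settings (the example word is only illustrative):

```python
from spacy.lang.ja import Japanese

nlp = Japanese()  # requires sudachipy; default split mode, so no subtokens
doc = nlp("委員会")
for token in doc:
    # reading and inflection now live on Token.morph instead of Doc.user_data
    print(token.text, token.morph.get("reading"), token.morph.get("inflection"))
    # the SudachiPy normalized form is newly exposed as Token.norm_
    print(token.norm_)
# with the default granularity all subtokens are None, so the key is not set
assert "sub_tokens" not in doc.user_data
```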

* Update tests

Also adds a new test for norm data.

* Update docs

* Add Japanese morphologizer factory

Set the default to `extend=True` so that the morphologizer does not
clobber the values set by the tokenizer.
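
A minimal sketch of the intended behaviour, mirroring the added factory test (assumes `sudachipy` is installed):

```python
from spacy.lang.ja import Japanese

nlp = Japanese()  # requires sudachipy
morphologizer = nlp.add_pipe("morphologizer")
# the Japanese-specific factory defaults to extend=True, so predicted
# features are merged into the tokenizer-set Token.morph values rather
# than replacing them
assert morphologizer.cfg["extend"] is True
```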

* Use the norm_ field for normalized forms

Before this commit, normalized forms were put in the "norm" field of the
morph attributes. I am not sure why I did that instead of using the
token's norm field; I think I just forgot about it.
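
A small sketch of the resulting access pattern, using a word/norm pair from the updated tests (assumes `sudachipy` is installed):

```python
from spacy.lang.ja import Japanese

nlp = Japanese()  # requires sudachipy
token = nlp("綜合")[0]
# the normalized form is read from Token.norm_ ...
assert token.norm_ == "総合"
# ... and is not stored as a "norm" feature on Token.morph anymore
assert not token.morph.get("norm")
```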

* Skip test if sudachipy is not installed

* Fix import

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Paul O'Leary McCann 2021-10-01 17:19:26 +00:00 committed by GitHub
parent 8f2409e514
commit 1ee6541ab0
5 changed files with 95 additions and 33 deletions


@@ -1,7 +1,8 @@
from typing import Optional, Union, Dict, Any
from typing import Optional, Union, Dict, Any, Callable
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Model
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
@@ -10,9 +11,11 @@ from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...errors import Errors
from ...language import Language
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc
from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
@@ -41,6 +44,8 @@ class JapaneseTokenizer(DummyTokenizer):
self.vocab = vocab
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
# if we're using split mode A we don't need subtokens
self.need_subtokens = not (split_mode is None or split_mode == "A")
def __reduce__(self):
return JapaneseTokenizer, (self.vocab, self.split_mode)
@@ -52,8 +57,8 @@ class JapaneseTokenizer(DummyTokenizer):
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 7
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
@@ -71,9 +76,14 @@
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
morph = {}
morph["inflection"] = dtoken.inf
token.norm_ = dtoken.norm
if dtoken.reading:
morph["reading"] = dtoken.reading
token.morph = MorphAnalysis(self.vocab, morph)
if self.need_subtokens:
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@@ -86,7 +96,8 @@
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
token.normalized_form(),
token.reading_form(),
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
@@ -108,9 +119,8 @@
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
# do nothing for default split mode
if not self.need_subtokens:
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
@@ -179,9 +189,33 @@ class Japanese(Language):
Defaults = JapaneseDefaults
@Japanese.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
):
return Morphologizer(
nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
)
# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
"DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
)
@@ -257,7 +291,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
text_spaces = [False]
return text_dtokens, text_spaces
@@ -274,7 +308,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# space token
if word_start > 0:
w = text[text_pos : text_pos + word_start]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
text_pos += word_start
@@ -290,7 +324,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# trailing space token
if text_pos < len(text):
w = text[text_pos:]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False)
return text_dtokens, text_spaces


@@ -8,3 +8,17 @@ import pytest
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma
@pytest.mark.parametrize(
"word,norm",
[
("SUMMER", "サマー"),
("食べ物", "食べ物"),
("綜合", "総合"),
("コンピュータ", "コンピューター"),
],
)
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
test_norm = ja_tokenizer(word)[0].norm_
assert test_norm == norm


@@ -0,0 +1,9 @@
import pytest
from spacy.lang.ja import Japanese
def test_ja_morphologizer_factory():
pytest.importorskip("sudachipy")
nlp = Japanese()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.cfg["extend"] is True


@@ -34,22 +34,22 @@ SENTENCE_TESTS = [
]
tokens1 = [
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
]
tokens2 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None),
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
]
tokens3 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
]
SUB_TOKEN_TESTS = [
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
]
# fmt: on
@@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c
@pytest.mark.parametrize(
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
)
@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
):
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
assert ja_tokenizer(text).user_data.get("sub_tokens") is None
assert nlp_a(text).user_data.get("sub_tokens") is None
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@@ -140,8 +138,11 @@ def test_ja_tokenizer_sub_tokens(
def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms
):
assert ja_tokenizer(text).user_data["inflections"] == inflections
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
tokens = ja_tokenizer(text)
test_inflections = [",".join(tt.morph.get("inflection")) for tt in tokens]
assert test_inflections == list(inflections)
test_readings = [tt.morph.get("reading")[0] for tt in tokens]
assert test_readings == list(reading_forms)
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):


@@ -247,6 +247,10 @@ config can be used to configure the split mode to `A`, `B` or `C`.
split_mode = "A"
```
Extra information, such as reading, inflection form, and the SudachiPy
normalized form, is available in `Token.morph`. For `B` or `C` split modes,
subtokens are stored in `Doc.user_data["sub_tokens"]`.
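
For example, a short illustrative sketch (assumes `sudachipy` and a dictionary are installed; the example sentence is arbitrary):

```python
from spacy.lang.ja import Japanese

nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")
for token in doc:
    print(token.text, token.norm_, token.morph.get("reading"))
# full subtoken analyses are only stored for split modes "B" and "C"
print(doc.user_data["sub_tokens"])
```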
<Infobox variant="warning">
If you run into errors related to `sudachipy`, which is currently under active