Moving Japanese tokenizer extra info to Token.morph (#8977)
* Use morph for extra Japanese tokenizer info

  Previously, Japanese tokenizer info that didn't correspond to Token fields was put in user data. Since spaCy core should avoid touching user data, this moves most information to the Token.morph attribute. It also adds the normalized form, which wasn't exposed before.

  The subtokens, which are a list of full tokens, are still added to user data, except with the default tokenizer granularity. With the default tokenizer settings the subtokens are all None, so in this case the user data is simply not set.

* Update tests

  Also adds a new test for norm data.

* Update docs

* Add Japanese morphologizer factory

  Set the default to `extend=True` so that the morphologizer does not clobber the values set by the tokenizer.

* Use the norm_ field for normalized forms

  Before this commit, normalized forms were put in the "norm" field in the morph attributes. I am not sure why I did that instead of using the token morph; I think I just forgot about it.

* Skip test if sudachipy is not installed

* Fix import

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
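For readers skimming the diff, here is a hedged usage sketch of the access pattern this commit introduces. The block is editorial, not part of the commit; it assumes `sudachipy` and its dictionary data are installed and uses `spacy.blank("ja")` to build the default Japanese pipeline.

```python
import spacy

# default split mode: sub_tokens are all None, so the key is never stored
nlp = spacy.blank("ja")
doc = nlp("関西国際空港")

for token in doc:
    # reading and inflection now live on Token.morph instead of Doc.user_data
    # (feature names as set by this commit: "reading" and "inflection")
    print(token.text, token.morph.get("reading"), token.morph.get("inflection"))
    # the SudachiPy normalized form is newly exposed on Token.norm_
    print(token.text, token.norm_)

# with the default settings this prints None, since user_data is not set
print(doc.user_data.get("sub_tokens"))
```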
This commit is contained in: parent 8f2409e514, commit 1ee6541ab0
@@ -1,7 +1,8 @@
-from typing import Optional, Union, Dict, Any
+from typing import Optional, Union, Dict, Any, Callable
 from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
@@ -10,9 +11,11 @@ from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...errors import Errors
 from ...language import Language
+from ...pipeline import Morphologizer
+from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
 from ...scorer import Scorer
 from ...symbols import POS
-from ...tokens import Doc
+from ...tokens import Doc, MorphAnalysis
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab
@@ -41,6 +44,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.vocab = vocab
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)
+        # if we're using split mode A we don't need subtokens
+        self.need_subtokens = not (split_mode is None or split_mode == "A")
 
     def __reduce__(self):
         return JapaneseTokenizer, (self.vocab, self.split_mode)
@@ -52,8 +57,8 @@ class JapaneseTokenizer(DummyTokenizer):
         dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
 
         # create Doc with tag bi-gram based part-of-speech identification rules
-        words, tags, inflections, lemmas, readings, sub_tokens_list = (
-            zip(*dtokens) if dtokens else [[]] * 6
+        words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
+            zip(*dtokens) if dtokens else [[]] * 7
         )
         sub_tokens_list = list(sub_tokens_list)
         doc = Doc(self.vocab, words=words, spaces=spaces)
@@ -71,9 +76,14 @@ class JapaneseTokenizer(DummyTokenizer):
             )
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
-        doc.user_data["inflections"] = inflections
-        doc.user_data["reading_forms"] = readings
-        doc.user_data["sub_tokens"] = sub_tokens_list
+            morph = {}
+            morph["inflection"] = dtoken.inf
+            token.norm_ = dtoken.norm
+            if dtoken.reading:
+                morph["reading"] = dtoken.reading
+            token.morph = MorphAnalysis(self.vocab, morph)
+        if self.need_subtokens:
+            doc.user_data["sub_tokens"] = sub_tokens_list
         return doc
 
     def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@@ -86,7 +96,8 @@ class JapaneseTokenizer(DummyTokenizer):
                 "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                 ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                 token.dictionary_form(),  # lemma
-                token.reading_form(),  # user_data['reading_forms']
+                token.normalized_form(),
+                token.reading_form(),
                 sub_tokens_list[idx]
                 if sub_tokens_list
                 else None,  # user_data['sub_tokens']
@@ -108,9 +119,8 @@ class JapaneseTokenizer(DummyTokenizer):
         ]
 
     def _get_sub_tokens(self, sudachipy_tokens):
-        if (
-            self.split_mode is None or self.split_mode == "A"
-        ):  # do nothing for default split mode
+        # do nothing for default split mode
+        if not self.need_subtokens:
             return None
 
         sub_tokens_list = []  # list of (list of list of DetailedToken | None)
@@ -179,9 +189,33 @@ class Japanese(Language):
     Defaults = JapaneseDefaults
 
 
+@Japanese.factory(
+    "morphologizer",
+    assigns=["token.morph", "token.pos"],
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": True,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+    },
+    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
+)
+def make_morphologizer(
+    nlp: Language,
+    model: Model,
+    name: str,
+    overwrite: bool,
+    extend: bool,
+    scorer: Optional[Callable],
+):
+    return Morphologizer(
+        nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
+    )
+
+
 # Hold the attributes we need with convenient names
 DetailedToken = namedtuple(
-    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
+    "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
 )
 
 
@@ -257,7 +291,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
         return text_dtokens, text_spaces
     elif len([word for word in words if not word.isspace()]) == 0:
         assert text.isspace()
-        text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)]
+        text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
         text_spaces = [False]
         return text_dtokens, text_spaces
 
@@ -274,7 +308,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
         # space token
         if word_start > 0:
             w = text[text_pos : text_pos + word_start]
-            text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
+            text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
             text_spaces.append(False)
             text_pos += word_start
 
@@ -290,7 +324,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     # trailing space token
     if text_pos < len(text):
         w = text[text_pos:]
-        text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None))
+        text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
         text_spaces.append(False)
 
     return text_dtokens, text_spaces
@@ -8,3 +8,17 @@ import pytest
 def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
     test_lemma = ja_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,norm",
+    [
+        ("SUMMER", "サマー"),
+        ("食べ物", "食べ物"),
+        ("綜合", "総合"),
+        ("コンピュータ", "コンピューター"),
+    ],
+)
+def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
+    test_norm = ja_tokenizer(word)[0].norm_
+    assert test_norm == norm
spacy/tests/lang/ja/test_morphologizer_factory.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+import pytest
+from spacy.lang.ja import Japanese
+
+
+def test_ja_morphologizer_factory():
+    pytest.importorskip("sudachipy")
+    nlp = Japanese()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.cfg["extend"] is True
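As a companion to the new test above, a hedged sketch of what the `extend=True` default in the Japanese-specific factory means in practice. This block is an editorial illustration, not part of the diff; it assumes `sudachipy` is installed so `Japanese()` can build its tokenizer, and that `add_pipe`'s usual `config` override mechanism applies to this factory.

```python
from spacy.lang.ja import Japanese

# "morphologizer" resolves to the factory registered on Japanese, so
# extend defaults to True: features the tokenizer already set on Token.morph
# (reading, inflection) are kept rather than clobbered by predictions.
nlp = Japanese()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.cfg["extend"] is True

# the default can presumably still be overridden per pipeline if desired
nlp_overwrite = Japanese()
morphologizer2 = nlp_overwrite.add_pipe("morphologizer", config={"extend": False})
assert morphologizer2.cfg["extend"] is False
```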
@@ -34,22 +34,22 @@ SENTENCE_TESTS = [
 ]
 
 tokens1 = [
-    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
-    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
+    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
+    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
 ]
 tokens2 = [
-    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
-    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
-    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None),
-    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None),
+    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
+    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
+    DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
+    DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None),
 ]
 tokens3 = [
-    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None),
-    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None),
-    DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None),
+    DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
+    DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
+    DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
 ]
 SUB_TOKEN_TESTS = [
-    ("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]])
+    ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
 ]
 # fmt: on
 
@@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_c(text)) == len_c
 
 
-@pytest.mark.parametrize(
-    "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
-)
+@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
 def test_ja_tokenizer_sub_tokens(
-    ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
+    ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
 ):
     nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
     nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
     nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
 
-    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
-    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert ja_tokenizer(text).user_data.get("sub_tokens") is None
+    assert nlp_a(text).user_data.get("sub_tokens") is None
     assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
     assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
 
@@ -140,8 +138,11 @@ def test_ja_tokenizer_sub_tokens(
 def test_ja_tokenizer_inflections_reading_forms(
     ja_tokenizer, text, inflections, reading_forms
 ):
-    assert ja_tokenizer(text).user_data["inflections"] == inflections
-    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
+    tokens = ja_tokenizer(text)
+    test_inflections = [",".join(tt.morph.get("inflection")) for tt in tokens]
+    assert test_inflections == list(inflections)
+    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
+    assert test_readings == list(reading_forms)
 
 
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
@@ -247,6 +247,10 @@ config can be used to configure the split mode to `A`, `B` or `C`.
 split_mode = "A"
 ```
 
+Extra information, such as reading, inflection form, and the SudachiPy
+normalized form, is available in `Token.morph`. For `B` or `C` split modes,
+subtokens are stored in `Doc.user_data["sub_tokens"]`.
+
 <Infobox variant="warning">
 
 If you run into errors related to `sudachipy`, which is currently under active
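To complement the documentation change above, a hedged, editorial example (not part of the commit) contrasting the default split mode with mode `C`, using the same `from_config` pattern as the updated tests. It assumes `sudachipy` and its dictionary data are available.

```python
from spacy.lang.ja import Japanese

# default mode A: no sub_tokens are stored at all
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
# mode C: coarse tokens, with finer analyses kept as sub_tokens
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})

text = "選挙管理委員会"
assert nlp_a(text).user_data.get("sub_tokens") is None

doc_c = nlp_c(text)
for token, sub_tokens in zip(doc_c, doc_c.user_data["sub_tokens"]):
    # each entry is None or a list of DetailedToken analyses for that token
    print(token.text, sub_tokens)
```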