mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Convert custom user_data to token extension format for Japanese tokenizer (#5652)
* Convert custom user_data to token extension format

  Convert the user_data values so that they can be loaded as custom token extensions for `inflection`, `reading_form`, `sub_tokens`, and `lemma`.

* Reset Underscore state in ja tokenizer tests
This commit is contained in:
parent
167df42cb6
commit
1dd38191ec
|
@ -145,8 +145,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
|
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
|
||||||
|
|
||||||
# create Doc with tag bi-gram based part-of-speech identification rules
|
# create Doc with tag bi-gram based part-of-speech identification rules
|
||||||
words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
|
words = [dtoken.surface for dtoken in dtokens]
|
||||||
sub_tokens_list = list(sub_tokens_list)
|
|
||||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||||
next_pos = None # for bi-gram rules
|
next_pos = None # for bi-gram rules
|
||||||
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
|
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
|
||||||
|
@ -158,14 +157,14 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
token.pos, next_pos = resolve_pos(
|
token.pos, next_pos = resolve_pos(
|
||||||
token.orth_,
|
token.orth_,
|
||||||
dtoken.tag,
|
dtoken.tag,
|
||||||
tags[idx + 1] if idx + 1 < len(tags) else None
|
dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None
|
||||||
)
|
)
|
||||||
# if there's no lemma info (it's an unk) just use the surface
|
# if there's no lemma info (it's an unk) just use the surface
|
||||||
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
|
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
|
||||||
|
doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf
|
||||||
doc.user_data["inflections"] = inflections
|
doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading
|
||||||
doc.user_data["reading_forms"] = readings
|
doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens
|
||||||
doc.user_data["sub_tokens"] = sub_tokens_list
|
doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,18 @@ import pytest
|
||||||
|
|
||||||
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
|
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
|
||||||
from spacy.lang.ja import Japanese, DetailedToken
|
from spacy.lang.ja import Japanese, DetailedToken
|
||||||
|
from spacy.tokens import Token
|
||||||
|
from spacy.tokens.underscore import Underscore
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
|
def clean_underscore():
|
||||||
|
# reset the Underscore object after the test, to avoid having state copied across tests
|
||||||
|
yield
|
||||||
|
Underscore.doc_extensions = {}
|
||||||
|
Underscore.span_extensions = {}
|
||||||
|
Underscore.token_extensions = {}
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
TOKENIZER_TESTS = [
|
TOKENIZER_TESTS = [
|
||||||
|
@ -127,24 +139,33 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke
|
||||||
nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
|
nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
|
||||||
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
|
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
|
||||||
|
|
||||||
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
|
doc = ja_tokenizer(text)
|
||||||
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
|
doc_a = nlp_a(text)
|
||||||
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
|
doc_b = nlp_b(text)
|
||||||
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
|
doc_c = nlp_c(text)
|
||||||
|
|
||||||
|
Token.set_extension("sub_tokens", default="")
|
||||||
|
assert [t._.sub_tokens for t in doc] == sub_tokens_list_a
|
||||||
|
assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a
|
||||||
|
assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b
|
||||||
|
assert [t._.sub_tokens for t in doc_c] == sub_tokens_list_c
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,inflections,reading_forms",
|
@pytest.mark.parametrize("text,inflections,reading_forms",
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"取ってつけた",
|
"取ってつけた",
|
||||||
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
|
["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"],
|
||||||
("トッ", "テ", "ツケ", "タ"),
|
["トッ", "テ", "ツケ", "タ"],
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
|
def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
|
||||||
assert ja_tokenizer(text).user_data["inflections"] == inflections
|
Token.set_extension("inflection", default="")
|
||||||
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
|
Token.set_extension("reading_form", default="")
|
||||||
|
doc = ja_tokenizer(text)
|
||||||
|
assert [t._.inflection for t in doc] == inflections
|
||||||
|
assert [t._.reading_form for t in doc] == reading_forms
|
||||||
|
|
||||||
|
|
||||||
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
|
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user