Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 04:08:09 +03:00)
Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: add inflections, reading_forms, and sub_tokens; drop unidic_tags; improve code readability around the token alignment procedure
* add test cases; replace fugashi with sudachipy in conftest
* move bunsetu.py to the spaCy Universe as a pipeline component, BunsetuRecognizer
* space-token check: "tag is space" -> "both surface and tag are spaces"
* handle the len(text) == 0 case
parent c34420794a
commit 150a39ccca
@@ -20,12 +20,7 @@ from ... import util


 # Hold the attributes we need with convenient names
-DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
-
-# Handling for multiple spaces in a row is somewhat awkward, this simplifies
-# the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
-DummySpace = DummyNode(" ", " ", " ")
+DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])


 def try_sudachi_import(split_mode="A"):
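The hunk above replaces the three-field DetailedToken (and the DummyNode/DummySpace placeholders previously used for runs of spaces) with a single six-field namedtuple, so each Sudachi morpheme can carry its inflection, reading form, and optional sub-token analysis alongside the surface, tag, and lemma. A minimal sketch of the new record, using field values borrowed from the test fixtures further down; this is an illustration, not code from the diff:

from collections import namedtuple

# Same field layout as the DetailedToken introduced by this commit.
DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
)

# One morpheme of 選挙管理委員会; sub_tokens stays None unless the tokenizer
# runs in split mode B or C.
senkyo = DetailedToken(
    surface="選挙",
    tag="名詞-普通名詞-サ変可能",
    inf="",
    lemma="選挙",
    reading="センキョ",
    sub_tokens=None,
)
print(senkyo.tag, senkyo.reading)

Whitespace no longer needs DummySpace: gap spans are emitted as ordinary DetailedToken records carrying the gap tag 空白 by get_dtokens_and_spaces(), shown below.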
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
     )


-def resolve_pos(orth, pos, next_pos):
+def resolve_pos(orth, tag, next_tag):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
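resolve_pos() keeps its three-step lookup (orth-specific rules, tag bi-gram rules, then the plain tag map) but now receives Unidic tag strings instead of the old (tag, inf) tuples, as the next hunk shows. The toy tables below only illustrate the shapes involved; the real TAG_ORTH_MAP, TAG_BIGRAM_MAP and TAG_MAP shipped with the Japanese language data are much larger, their entries differ, and they map to spaCy POS symbols rather than strings:

# Hypothetical, heavily trimmed stand-ins for the real mapping tables.
POS = "pos"  # in spaCy this is the POS attribute ID, not a string
TAG_ORTH_MAP = {
    # same Unidic tag, different UD POS depending on the literal token
    "名詞-普通名詞-副詞可能": {"いつ": "PRON"},
}
TAG_BIGRAM_MAP = {
    # the tag of the following token decides how the current one is mapped
    ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): ("VERB", "AUX"),
}
TAG_MAP = {
    "名詞-普通名詞-サ変可能": {POS: "NOUN"},
    "名詞-普通名詞-副詞可能": {POS: "NOUN"},
}

def resolve_pos(orth, tag, next_tag):
    # mirrors the lookup order of the new implementation in the next hunk
    if tag in TAG_ORTH_MAP and orth in TAG_ORTH_MAP[tag]:
        return TAG_ORTH_MAP[tag][orth], None
    if next_tag and (tag, next_tag) in TAG_BIGRAM_MAP:
        current_pos, next_pos = TAG_BIGRAM_MAP[(tag, next_tag)]
        return (TAG_MAP[tag][POS] if current_pos is None else current_pos), next_pos
    return TAG_MAP[tag][POS], None

assert resolve_pos("いつ", "名詞-普通名詞-副詞可能", None) == ("PRON", None)
assert resolve_pos("実施", "名詞-普通名詞-サ変可能", "動詞-非自立可能") == ("VERB", "AUX")
assert resolve_pos("実施", "名詞-普通名詞-サ変可能", None) == ("NOUN", None)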
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
     # Some tokens have their UD tag decided based on the POS of the following
     # token.

-    # orth based rules
-    if pos[0] in TAG_ORTH_MAP:
-        orth_map = TAG_ORTH_MAP[pos[0]]
+    # apply orth based mapping
+    if tag in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[tag]
         if orth in orth_map:
-            return orth_map[orth], None
+            return orth_map[orth], None  # current_pos, next_pos

-    # tag bi-gram mapping
-    if next_pos:
-        tag_bigram = pos[0], next_pos[0]
+    # apply tag bi-gram mapping
+    if next_tag:
+        tag_bigram = tag, next_tag
         if tag_bigram in TAG_BIGRAM_MAP:
-            bipos = TAG_BIGRAM_MAP[tag_bigram]
-            if bipos[0] is None:
-                return TAG_MAP[pos[0]][POS], bipos[1]
+            current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
+            if current_pos is None:  # apply tag uni-gram mapping for current_pos
+                return TAG_MAP[tag][POS], next_pos  # only next_pos is identified by tag bi-gram mapping
             else:
-                return bipos
+                return current_pos, next_pos

-    return TAG_MAP[pos[0]][POS], None
+    # apply tag uni-gram mapping
+    return TAG_MAP[tag][POS], None


-# Use a mapping of paired punctuation to avoid splitting quoted sentences.
-pairpunct = {'「':'」', '『': '』', '【': '】'}
-
-
-def separate_sentences(doc):
-    """Given a doc, mark tokens that start sentences based on Unidic tags.
-    """
-
-    stack = []  # save paired punctuation
-
-    for i, token in enumerate(doc[:-2]):
-        # Set all tokens after the first to false by default. This is necessary
-        # for the doc code to be aware we've done sentencization, see
-        # `is_sentenced`.
-        token.sent_start = (i == 0)
-        if token.tag_:
-            if token.tag_ == "補助記号-括弧開":
-                ts = str(token)
-                if ts in pairpunct:
-                    stack.append(pairpunct[ts])
-                elif stack and ts == stack[-1]:
-                    stack.pop()
-
-            if token.tag_ == "補助記号-句点":
-                next_token = doc[i+1]
-                if next_token.tag_ != token.tag_ and not stack:
-                    next_token.sent_start = True
-
-
-def get_dtokens(tokenizer, text):
-    tokens = tokenizer.tokenize(text)
-    words = []
-    for ti, token in enumerate(tokens):
-        tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
-        inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
-        dtoken = DetailedToken(
-            token.surface(),
-            (tag, inf),
-            token.dictionary_form())
-        if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
-            # don't add multiple space tokens in a row
-            continue
-        words.append(dtoken)
-
-    # remove empty tokens. These can be produced with characters like … that
-    # Sudachi normalizes internally.
-    words = [ww for ww in words if len(ww.surface) > 0]
-    return words
-
-
-def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
+def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
+    # Compare the content of tokens and text, first
     words = [x.surface for x in dtokens]
     if "".join("".join(words).split()) != "".join(text.split()):
         raise ValueError(Errors.E194.format(text=text, words=words))
-    text_words = []
-    text_lemmas = []
-    text_tags = []
+
+    text_dtokens = []
     text_spaces = []
     text_pos = 0
     # handle empty and whitespace-only texts
     if len(words) == 0:
-        return text_words, text_lemmas, text_tags, text_spaces
+        return text_dtokens, text_spaces
     elif len([word for word in words if not word.isspace()]) == 0:
         assert text.isspace()
-        text_words = [text]
-        text_lemmas = [text]
-        text_tags = [gap_tag]
+        text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
         text_spaces = [False]
-        return text_words, text_lemmas, text_tags, text_spaces
-    # normalize words to remove all whitespace tokens
-    norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
-    # align words with text
-    for word, dtoken in zip(norm_words, norm_dtokens):
+        return text_dtokens, text_spaces
+
+    # align words and dtokens by referring text, and insert gap tokens for the space char spans
+    for word, dtoken in zip(words, dtokens):
+        # skip all space tokens
+        if word.isspace():
+            continue
         try:
             word_start = text[text_pos:].index(word)
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))

+        # space token
         if word_start > 0:
             w = text[text_pos:text_pos + word_start]
-            text_words.append(w)
-            text_lemmas.append(w)
-            text_tags.append(gap_tag)
+            text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
             text_spaces.append(False)
             text_pos += word_start
-        text_words.append(word)
-        text_lemmas.append(dtoken.lemma)
-        text_tags.append(dtoken.pos)
+
+        # content word
+        text_dtokens.append(dtoken)
         text_spaces.append(False)
         text_pos += len(word)
+
+        # poll a space char after the word
         if text_pos < len(text) and text[text_pos] == " ":
             text_spaces[-1] = True
             text_pos += 1

+    # trailing space token
     if text_pos < len(text):
         w = text[text_pos:]
-        text_words.append(w)
-        text_lemmas.append(w)
-        text_tags.append(gap_tag)
+        text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
         text_spaces.append(False)
-    return text_words, text_lemmas, text_tags, text_spaces
+
+    return text_dtokens, text_spaces


 class JapaneseTokenizer(DummyTokenizer):
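get_dtokens_and_spaces() replaces get_words_lemmas_tags_spaces(): instead of three parallel lists it returns the DetailedToken list plus the spaces flags needed to build the Doc. It skips Sudachi's own whitespace tokens and re-inserts whitespace from the original text either as the trailing-space flag of the preceding token (a single ASCII space) or as a gap DetailedToken tagged 空白 (full-width spaces and any remaining whitespace). A rough usage sketch, assuming sudachipy and its dictionary are installed; exact token boundaries depend on the dictionary version:

from spacy.lang.ja import Japanese

nlp = Japanese()
doc = nlp("日本語　です")  # full-width space between the two chunks
print([(t.text, t.tag_, t.whitespace_) for t in doc])
# The full-width space comes back as its own token with tag_ == "空白",
# whereas a single ASCII space would instead appear as whitespace_ on the
# token before it. Either way the input round-trips losslessly:
assert doc.text == "日本語　です"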
@@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer = try_sudachi_import(self.split_mode)

     def __call__(self, text):
-        dtokens = get_dtokens(self.tokenizer, text)
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

-        words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
+        sub_tokens_list = list(sub_tokens_list)
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None
-        for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
-            token.tag_ = unidic_tag[0]
-            if next_pos:
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
                 token.pos = next_pos
                 next_pos = None
             else:
                 token.pos, next_pos = resolve_pos(
                     token.orth_,
-                    unidic_tag,
-                    unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None
                 )

             # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = lemma
-        doc.user_data["unidic_tags"] = unidic_tags
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list

         return doc

+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
+        sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']),  # tag
+                ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx] if sub_tokens_list else None,  # user_data['sub_tokens']
+            ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t for idx, t in enumerate(dtokens) if
+            idx == 0 or
+            not t.surface.isspace() or t.tag != '空白' or
+            not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if self.split_mode is None or self.split_mode == "A":  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
+        return sub_tokens_list
+
     def _get_config(self):
         config = OrderedDict(
             (
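With __call__ wired up this way, the extra Sudachi information travels on Doc.user_data rather than on the tokens themselves. A short usage sketch, again assuming sudachipy is available; the sample string and the split-mode construction are the ones used by the new tests further down:

from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

print(doc.user_data["reading_forms"])  # katakana reading per token
print(doc.user_data["inflections"])    # conjugation info, "" for uninflected tokens
print(doc.user_data["sub_tokens"])     # per-token A/B sub-splits; every entry is None in split mode A

# doc.user_data["unidic_tags"] is removed by this commit; the Unidic tag is
# still available directly as token.tag_.
print([t.tag_ for t in doc])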
bunsetu.py (deleted; per the commit message it moves to the spaCy Universe as BunsetuRecognizer):

@@ -1,144 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
-POS_PHRASE_MAP = {
-    "NOUN": "NP",
-    "NUM": "NP",
-    "PRON": "NP",
-    "PROPN": "NP",
-
-    "VERB": "VP",
-
-    "ADJ": "ADJP",
-
-    "ADV": "ADVP",
-
-    "CCONJ": "CCONJP",
-}
-
-
-# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
-def yield_bunsetu(doc, debug=False):
-    bunsetu = []
-    bunsetu_may_end = False
-    phrase_type = None
-    phrase = None
-    prev = None
-    prev_tag = None
-    prev_dep = None
-    prev_head = None
-    for t in doc:
-        pos = t.pos_
-        pos_type = POS_PHRASE_MAP.get(pos, None)
-        tag = t.tag_
-        dep = t.dep_
-        head = t.head.i
-        if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
-
-        # DET is always an individual bunsetu
-        if pos == "DET":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            yield [t], None, None
-            bunsetu = []
-            bunsetu_may_end = False
-            phrase_type = None
-            phrase = None
-
-        # PRON or Open PUNCT always splits bunsetu
-        elif tag == "補助記号-括弧開":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            bunsetu = [t]
-            bunsetu_may_end = True
-            phrase_type = None
-            phrase = None
-
-        # bunsetu head not appeared
-        elif phrase_type is None:
-            if bunsetu and prev_tag == "補助記号-読点":
-                yield bunsetu, phrase_type, phrase
-                bunsetu = []
-                bunsetu_may_end = False
-                phrase_type = None
-                phrase = None
-            bunsetu.append(t)
-            if pos_type:  # begin phrase
-                phrase = [t]
-                phrase_type = pos_type
-                if pos_type in {"ADVP", "CCONJP"}:
-                    bunsetu_may_end = True
-
-        # entering new bunsetu
-        elif pos_type and (
-                pos_type != phrase_type or  # different phrase type arises
-                bunsetu_may_end  # same phrase type but bunsetu already ended
-        ):
-            # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
-                bunsetu.append(t)
-                phrase_type = "VP"
-                phrase.append(t)
-            # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                    prev_dep == 'compound' and prev_head == t.i or
-                    dep == 'compound' and prev == head or
-                    prev_dep == 'nmod' and prev_head == t.i
-            ):
-                bunsetu.append(t)
-                phrase_type = "NP"
-                phrase.append(t)
-            else:
-                yield bunsetu, phrase_type, phrase
-                bunsetu = [t]
-                bunsetu_may_end = False
-                phrase_type = pos_type
-                phrase = [t]
-
-        # NOUN bunsetu
-        elif phrase_type == "NP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                    (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                    pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # VERB bunsetu
-        elif phrase_type == "VP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                    pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                    pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # other bunsetu
-        else:
-            bunsetu.append(t)
-
-        prev = t.i
-        prev_tag = t.tag_
-        prev_dep = t.dep_
-        prev_head = head
-
-    if bunsetu:
-        yield bunsetu, phrase_type, phrase
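The bunsetu (Japanese phrase-unit) chunker above is removed from the core library; per the commit message it is meant to live on in the spaCy Universe as a pipeline component called BunsetuRecognizer. Its published API may differ, but a spaCy v2-style component wrapping a generator like the removed yield_bunsetu could look roughly like this; the extension name and wiring are illustrative only:

from spacy.tokens import Doc

# Register a custom extension to hold the recognized bunsetu spans.
Doc.set_extension("bunsetu_spans", default=None, force=True)


class BunsetuRecognizer(object):
    name = "bunsetu_recognizer"

    def __init__(self, bunsetu_func):
        # e.g. a copy of the removed yield_bunsetu generator
        self.bunsetu_func = bunsetu_func

    def __call__(self, doc):
        spans = []
        for tokens, phrase_type, phrase in self.bunsetu_func(doc):
            spans.append(doc[tokens[0].i : tokens[-1].i + 1])
        doc._.bunsetu_spans = spans
        return doc


# Usage (requires tagger/parser output, since yield_bunsetu reads pos_, dep_ and head):
# nlp.add_pipe(BunsetuRecognizer(yield_bunsetu), last=True)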
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest

 from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
-from spacy.lang.ja import Japanese
+from spacy.lang.ja import Japanese, DetailedToken

 # fmt: off
 TOKENIZER_TESTS = [
@@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_c(text)) == len_c


+@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c",
+    [
+        (
+            "選挙管理委員会",
+            [None, None, None, None],
+            [None, None, [
+                [
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ]
+            ]],
+            [[
+                [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ], [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None),
+                ]
+            ]]
+        ),
+    ]
+)
+def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c):
+    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
+    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
+    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
+
+    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
+    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
+
+
+@pytest.mark.parametrize("text,inflections,reading_forms",
+    [
+        (
+            "取ってつけた",
+            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
+            ("トッ", "テ", "ツケ", "タ"),
+        ),
+    ]
+)
+def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
+    assert ja_tokenizer(text).user_data["inflections"] == inflections
+    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
+
+
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
     doc = ja_tokenizer("")
     assert len(doc) == 0