mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: adding inflections, reading_forms, sub_tokens; deleting unidic_tags; improve code readability around the token alignment procedure
* add test cases, replace fugashi with sudachipy in conftest
* move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer
* tag is space -> both surface and tag are spaces
* consider len(text)==0
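The new user_dict fields listed above are exposed on the Doc as `doc.user_data` entries. As a quick orientation before the diff, here is a minimal usage sketch based on the test cases included in this commit; it assumes SudachiPy and a Sudachi dictionary are installed, and the example string is taken directly from those tests.

```python
# Minimal sketch of the user_data entries added in this commit, mirroring the
# test cases below (assumes sudachipy + a Sudachi dictionary are installed).
from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
doc = nlp("取ってつけた")

print([t.text for t in doc])             # token surfaces
print(doc.user_data["inflections"])      # e.g. ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般")
print(doc.user_data["reading_forms"])    # e.g. ("トッ", "テ", "ツケ", "タ")
print(doc.user_data["sub_tokens"])       # all None in split mode "A"
```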
This commit is contained in:
parent c34420794a
commit 150a39ccca
				|  | @ -20,12 +20,7 @@ from ... import util | |||
| 
 | ||||
| 
 | ||||
| # Hold the attributes we need with convenient names | ||||
| DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) | ||||
| 
 | ||||
| # Handling for multiple spaces in a row is somewhat awkward, this simplifies | ||||
| # the flow by creating a dummy with the same interface. | ||||
| DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) | ||||
| DummySpace = DummyNode(" ", " ", " ") | ||||
| DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]) | ||||
| 
 | ||||
| 
 | ||||
| def try_sudachi_import(split_mode="A"): | ||||
|  | @ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"): | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| def resolve_pos(orth, pos, next_pos): | ||||
| def resolve_pos(orth, tag, next_tag): | ||||
|     """If necessary, add a field to the POS tag for UD mapping. | ||||
|     Under Universal Dependencies, sometimes the same Unidic POS tag can | ||||
|     be mapped differently depending on the literal token or its context | ||||
|  | @ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos): | |||
|     # Some tokens have their UD tag decided based on the POS of the following | ||||
|     # token. | ||||
| 
 | ||||
|     # orth based rules | ||||
|     if pos[0] in TAG_ORTH_MAP: | ||||
|         orth_map = TAG_ORTH_MAP[pos[0]] | ||||
|     # apply orth based mapping | ||||
|     if tag in TAG_ORTH_MAP: | ||||
|         orth_map = TAG_ORTH_MAP[tag] | ||||
|         if orth in orth_map: | ||||
|             return orth_map[orth], None | ||||
|             return orth_map[orth], None  # current_pos, next_pos | ||||
| 
 | ||||
|     # tag bi-gram mapping | ||||
|     if next_pos: | ||||
|         tag_bigram = pos[0], next_pos[0] | ||||
|     # apply tag bi-gram mapping | ||||
|     if next_tag: | ||||
|         tag_bigram = tag, next_tag | ||||
|         if tag_bigram in TAG_BIGRAM_MAP: | ||||
|             bipos = TAG_BIGRAM_MAP[tag_bigram] | ||||
|             if bipos[0] is None: | ||||
|                 return TAG_MAP[pos[0]][POS], bipos[1] | ||||
|             current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram] | ||||
|             if current_pos is None:  # apply tag uni-gram mapping for current_pos | ||||
|                 return TAG_MAP[tag][POS], next_pos  # only next_pos is identified by tag bi-gram mapping | ||||
|             else: | ||||
|                 return bipos | ||||
|                 return current_pos, next_pos | ||||
| 
 | ||||
|     return TAG_MAP[pos[0]][POS], None | ||||
|     # apply tag uni-gram mapping | ||||
|     return TAG_MAP[tag][POS], None | ||||
| 
 | ||||
| 
 | ||||
| # Use a mapping of paired punctuation to avoid splitting quoted sentences. | ||||
| pairpunct = {'「':'」', '『': '』', '【': '】'} | ||||
| 
 | ||||
| 
 | ||||
| def separate_sentences(doc): | ||||
|     """Given a doc, mark tokens that start sentences based on Unidic tags. | ||||
|     """ | ||||
| 
 | ||||
|     stack = [] # save paired punctuation | ||||
| 
 | ||||
|     for i, token in enumerate(doc[:-2]): | ||||
|         # Set all tokens after the first to false by default. This is necessary | ||||
|         # for the doc code to be aware we've done sentencization, see | ||||
|         # `is_sentenced`. | ||||
|         token.sent_start = (i == 0) | ||||
|         if token.tag_: | ||||
|             if token.tag_ == "補助記号-括弧開": | ||||
|                 ts = str(token) | ||||
|                 if ts in pairpunct: | ||||
|                     stack.append(pairpunct[ts]) | ||||
|                 elif stack and ts == stack[-1]: | ||||
|                     stack.pop() | ||||
| 
 | ||||
|             if token.tag_ == "補助記号-句点": | ||||
|                 next_token = doc[i+1] | ||||
|                 if next_token.tag_ != token.tag_ and not stack: | ||||
|                     next_token.sent_start = True | ||||
| 
 | ||||
| 
 | ||||
| def get_dtokens(tokenizer, text): | ||||
|     tokens = tokenizer.tokenize(text) | ||||
|     words = [] | ||||
|     for ti, token in enumerate(tokens): | ||||
|         tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) | ||||
|         inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) | ||||
|         dtoken = DetailedToken( | ||||
|                 token.surface(), | ||||
|                 (tag, inf), | ||||
|                 token.dictionary_form()) | ||||
|         if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': | ||||
|             # don't add multiple space tokens in a row | ||||
|             continue | ||||
|         words.append(dtoken) | ||||
| 
 | ||||
|     # remove empty tokens. These can be produced with characters like … that | ||||
|     # Sudachi normalizes internally.  | ||||
|     words = [ww for ww in words if len(ww.surface) > 0] | ||||
|     return words | ||||
| 
 | ||||
| 
 | ||||
| def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): | ||||
| def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): | ||||
|     # Compare the content of tokens and text, first | ||||
|     words = [x.surface for x in dtokens] | ||||
|     if "".join("".join(words).split()) != "".join(text.split()): | ||||
|         raise ValueError(Errors.E194.format(text=text, words=words)) | ||||
|     text_words = [] | ||||
|     text_lemmas = [] | ||||
|     text_tags = [] | ||||
| 
 | ||||
|     text_dtokens = [] | ||||
|     text_spaces = [] | ||||
|     text_pos = 0 | ||||
|     # handle empty and whitespace-only texts | ||||
|     if len(words) == 0: | ||||
|         return text_words, text_lemmas, text_tags, text_spaces | ||||
|         return text_dtokens, text_spaces | ||||
|     elif len([word for word in words if not word.isspace()]) == 0: | ||||
|         assert text.isspace() | ||||
|         text_words = [text] | ||||
|         text_lemmas = [text] | ||||
|         text_tags = [gap_tag] | ||||
|         text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)] | ||||
|         text_spaces = [False] | ||||
|         return text_words, text_lemmas, text_tags, text_spaces | ||||
|     # normalize words to remove all whitespace tokens | ||||
|     norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) | ||||
|     # align words with text | ||||
|     for word, dtoken in zip(norm_words, norm_dtokens): | ||||
|         return text_dtokens, text_spaces | ||||
| 
 | ||||
|     # align words and dtokens by referring text, and insert gap tokens for the space char spans | ||||
|     for word, dtoken in zip(words, dtokens): | ||||
|         # skip all space tokens | ||||
|         if word.isspace(): | ||||
|             continue | ||||
|         try: | ||||
|             word_start = text[text_pos:].index(word) | ||||
|         except ValueError: | ||||
|             raise ValueError(Errors.E194.format(text=text, words=words)) | ||||
| 
 | ||||
|         # space token | ||||
|         if word_start > 0: | ||||
|             w = text[text_pos:text_pos + word_start] | ||||
|             text_words.append(w) | ||||
|             text_lemmas.append(w) | ||||
|             text_tags.append(gap_tag) | ||||
|             text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) | ||||
|             text_spaces.append(False) | ||||
|             text_pos += word_start | ||||
|         text_words.append(word) | ||||
|         text_lemmas.append(dtoken.lemma) | ||||
|         text_tags.append(dtoken.pos) | ||||
| 
 | ||||
|         # content word | ||||
|         text_dtokens.append(dtoken) | ||||
|         text_spaces.append(False) | ||||
|         text_pos += len(word) | ||||
|         # poll a space char after the word | ||||
|         if text_pos < len(text) and text[text_pos] == " ": | ||||
|             text_spaces[-1] = True | ||||
|             text_pos += 1 | ||||
| 
 | ||||
|     # trailing space token | ||||
|     if text_pos < len(text): | ||||
|         w = text[text_pos:] | ||||
|         text_words.append(w) | ||||
|         text_lemmas.append(w) | ||||
|         text_tags.append(gap_tag) | ||||
|         text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) | ||||
|         text_spaces.append(False) | ||||
|     return text_words, text_lemmas, text_tags, text_spaces | ||||
| 
 | ||||
|     return text_dtokens, text_spaces | ||||
| 
 | ||||
| 
 | ||||
| class JapaneseTokenizer(DummyTokenizer): | ||||
|  | @ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer): | |||
|         self.tokenizer = try_sudachi_import(self.split_mode) | ||||
| 
 | ||||
|     def __call__(self, text): | ||||
|         dtokens = get_dtokens(self.tokenizer, text) | ||||
|         # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces | ||||
|         sudachipy_tokens = self.tokenizer.tokenize(text) | ||||
|         dtokens = self._get_dtokens(sudachipy_tokens) | ||||
|         dtokens, spaces = get_dtokens_and_spaces(dtokens, text) | ||||
| 
 | ||||
|         words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) | ||||
|         # create Doc with tag bi-gram based part-of-speech identification rules | ||||
|         words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 | ||||
|         sub_tokens_list = list(sub_tokens_list) | ||||
|         doc = Doc(self.vocab, words=words, spaces=spaces) | ||||
|         next_pos = None | ||||
|         for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): | ||||
|             token.tag_ = unidic_tag[0] | ||||
|             if next_pos: | ||||
|         next_pos = None  # for bi-gram rules | ||||
|         for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): | ||||
|             token.tag_ = dtoken.tag | ||||
|             if next_pos:  # already identified in previous iteration | ||||
|                 token.pos = next_pos | ||||
|                 next_pos = None | ||||
|             else: | ||||
|                 token.pos, next_pos = resolve_pos( | ||||
|                     token.orth_, | ||||
|                     unidic_tag, | ||||
|                     unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None | ||||
|                     dtoken.tag, | ||||
|                     tags[idx + 1] if idx + 1 < len(tags) else None | ||||
|                 ) | ||||
| 
 | ||||
|             # if there's no lemma info (it's an unk) just use the surface | ||||
|             token.lemma_ = lemma | ||||
|         doc.user_data["unidic_tags"] = unidic_tags | ||||
|             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface | ||||
| 
 | ||||
|         doc.user_data["inflections"] = inflections | ||||
|         doc.user_data["reading_forms"] = readings | ||||
|         doc.user_data["sub_tokens"] = sub_tokens_list | ||||
| 
 | ||||
|         return doc | ||||
| 
 | ||||
|     def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True): | ||||
|         sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None | ||||
|         dtokens = [ | ||||
|             DetailedToken( | ||||
|                 token.surface(),  # orth | ||||
|                 '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']),  # tag | ||||
|                 ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']),  # inf | ||||
|                 token.dictionary_form(),  # lemma | ||||
|                 token.reading_form(),  # user_data['reading_forms'] | ||||
|                 sub_tokens_list[idx] if sub_tokens_list else None,  # user_data['sub_tokens'] | ||||
|             ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0 | ||||
|             # remove empty tokens which can be produced with characters like … that | ||||
|         ] | ||||
|         # Sudachi normalizes internally and outputs each space char as a token. | ||||
|         # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens | ||||
|         return [ | ||||
|             t for idx, t in enumerate(dtokens) if | ||||
|             idx == 0 or | ||||
|             not t.surface.isspace() or t.tag != '空白' or | ||||
|             not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白' | ||||
|         ] | ||||
| 
 | ||||
|     def _get_sub_tokens(self, sudachipy_tokens): | ||||
|         if self.split_mode is None or self.split_mode == "A":  # do nothing for default split mode | ||||
|             return None | ||||
| 
 | ||||
|         sub_tokens_list = []  # list of (list of list of DetailedToken | None) | ||||
|         for token in sudachipy_tokens: | ||||
|             sub_a = token.split(self.tokenizer.SplitMode.A) | ||||
|             if len(sub_a) == 1:  # no sub tokens | ||||
|                 sub_tokens_list.append(None) | ||||
|             elif self.split_mode == "B": | ||||
|                 sub_tokens_list.append([self._get_dtokens(sub_a, False)]) | ||||
|             else:  # "C" | ||||
|                 sub_b = token.split(self.tokenizer.SplitMode.B) | ||||
|                 if len(sub_a) == len(sub_b): | ||||
|                     dtokens = self._get_dtokens(sub_a, False) | ||||
|                     sub_tokens_list.append([dtokens, dtokens]) | ||||
|                 else: | ||||
|                     sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)]) | ||||
|         return sub_tokens_list | ||||
| 
 | ||||
|     def _get_config(self): | ||||
|         config = OrderedDict( | ||||
|             ( | ||||
|  |  | |||
|  | @ -1,144 +0,0 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| POS_PHRASE_MAP = { | ||||
|     "NOUN": "NP", | ||||
|     "NUM": "NP", | ||||
|     "PRON": "NP", | ||||
|     "PROPN": "NP", | ||||
| 
 | ||||
|     "VERB": "VP", | ||||
| 
 | ||||
|     "ADJ": "ADJP", | ||||
| 
 | ||||
|     "ADV": "ADVP", | ||||
| 
 | ||||
|     "CCONJ": "CCONJP", | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| # return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] | ||||
| def yield_bunsetu(doc, debug=False): | ||||
|     bunsetu = [] | ||||
|     bunsetu_may_end = False | ||||
|     phrase_type = None | ||||
|     phrase = None | ||||
|     prev = None | ||||
|     prev_tag = None | ||||
|     prev_dep = None | ||||
|     prev_head = None | ||||
|     for t in doc: | ||||
|         pos = t.pos_ | ||||
|         pos_type = POS_PHRASE_MAP.get(pos, None) | ||||
|         tag = t.tag_ | ||||
|         dep = t.dep_ | ||||
|         head = t.head.i | ||||
|         if debug: | ||||
|             print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) | ||||
| 
 | ||||
|         # DET is always an individual bunsetu | ||||
|         if pos == "DET": | ||||
|             if bunsetu: | ||||
|                 yield bunsetu, phrase_type, phrase | ||||
|             yield [t], None, None | ||||
|             bunsetu = [] | ||||
|             bunsetu_may_end = False | ||||
|             phrase_type = None | ||||
|             phrase = None | ||||
| 
 | ||||
|         # PRON or Open PUNCT always splits bunsetu | ||||
|         elif tag == "補助記号-括弧開": | ||||
|             if bunsetu: | ||||
|                 yield bunsetu, phrase_type, phrase | ||||
|             bunsetu = [t] | ||||
|             bunsetu_may_end = True | ||||
|             phrase_type = None | ||||
|             phrase = None | ||||
| 
 | ||||
|         # bunsetu head not appeared | ||||
|         elif phrase_type is None: | ||||
|             if bunsetu and prev_tag == "補助記号-読点": | ||||
|                 yield bunsetu, phrase_type, phrase | ||||
|                 bunsetu = [] | ||||
|                 bunsetu_may_end = False | ||||
|                 phrase_type = None | ||||
|                 phrase = None | ||||
|             bunsetu.append(t) | ||||
|             if pos_type:  # begin phrase | ||||
|                 phrase = [t] | ||||
|                 phrase_type = pos_type | ||||
|                 if pos_type in {"ADVP", "CCONJP"}: | ||||
|                     bunsetu_may_end = True | ||||
| 
 | ||||
|         # entering new bunsetu | ||||
|         elif pos_type and ( | ||||
|             pos_type != phrase_type or  # different phrase type arises | ||||
|             bunsetu_may_end  # same phrase type but bunsetu already ended | ||||
|         ): | ||||
|             # exceptional case: NOUN to VERB | ||||
|             if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: | ||||
|                 bunsetu.append(t) | ||||
|                 phrase_type = "VP" | ||||
|                 phrase.append(t) | ||||
|             # exceptional case: VERB to NOUN | ||||
|             elif phrase_type == "VP" and pos_type == "NP" and ( | ||||
|                     prev_dep == 'compound' and prev_head == t.i or | ||||
|                     dep == 'compound' and prev == head or | ||||
|                     prev_dep == 'nmod' and prev_head == t.i | ||||
|             ): | ||||
|                 bunsetu.append(t) | ||||
|                 phrase_type = "NP" | ||||
|                 phrase.append(t) | ||||
|             else: | ||||
|                 yield bunsetu, phrase_type, phrase | ||||
|                 bunsetu = [t] | ||||
|                 bunsetu_may_end = False | ||||
|                 phrase_type = pos_type | ||||
|                 phrase = [t] | ||||
| 
 | ||||
|         # NOUN bunsetu | ||||
|         elif phrase_type == "NP": | ||||
|             bunsetu.append(t) | ||||
|             if not bunsetu_may_end and (( | ||||
|                 (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} | ||||
|             ) or ( | ||||
|                 pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' | ||||
|             )): | ||||
|                 phrase.append(t) | ||||
|             else: | ||||
|                 bunsetu_may_end = True | ||||
| 
 | ||||
|         # VERB bunsetu | ||||
|         elif phrase_type == "VP": | ||||
|             bunsetu.append(t) | ||||
|             if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': | ||||
|                 phrase.append(t) | ||||
|             else: | ||||
|                 bunsetu_may_end = True | ||||
| 
 | ||||
|         # ADJ bunsetu | ||||
|         elif phrase_type == "ADJP" and tag != '連体詞': | ||||
|             bunsetu.append(t) | ||||
|             if not bunsetu_may_end and (( | ||||
|                 pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} | ||||
|             ) or ( | ||||
|                 pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' | ||||
|             )): | ||||
|                 phrase.append(t) | ||||
|             else: | ||||
|                 bunsetu_may_end = True | ||||
| 
 | ||||
|         # other bunsetu | ||||
|         else: | ||||
|             bunsetu.append(t) | ||||
| 
 | ||||
|         prev = t.i | ||||
|         prev_tag = t.tag_ | ||||
|         prev_dep = t.dep_ | ||||
|         prev_head = head | ||||
| 
 | ||||
|     if bunsetu: | ||||
|         yield bunsetu, phrase_type, phrase | ||||
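The bunsetu logic deleted above is, per the commit message, meant to continue as a BunsetuRecognizer pipeline component in spaCy Universe. A hedged sketch of how the removed `yield_bunsetu()` could be wrapped as a spaCy v2 custom component follows; the component name, the `bunsetu` extension attribute, and the wiring are illustrative assumptions, not the actual Universe package API.

```python
# Hypothetical wrapper turning the removed yield_bunsetu() into a pipeline
# component; the names used here are illustrative, not the Universe API.
from spacy.tokens import Doc

if not Doc.has_extension("bunsetu"):
    Doc.set_extension("bunsetu", default=None)

def bunsetu_recognizer(doc):
    # yield_bunsetu() (the function deleted above) relies on POS tags and the
    # dependency parse, so this component should run after the parser.
    chunks = []
    for tokens, phrase_type, phrase in yield_bunsetu(doc):
        span = doc[tokens[0].i : tokens[-1].i + 1]
        chunks.append((span, phrase_type))
    doc._.bunsetu = chunks
    return doc

# nlp.add_pipe(bunsetu_recognizer, name="bunsetu_recognizer", last=True)
```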
|  | @ -4,7 +4,7 @@ from __future__ import unicode_literals | |||
| import pytest | ||||
| 
 | ||||
| from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS | ||||
| from spacy.lang.ja import Japanese | ||||
| from spacy.lang.ja import Japanese, DetailedToken | ||||
| 
 | ||||
| # fmt: off | ||||
| TOKENIZER_TESTS = [ | ||||
|  | @ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): | |||
|     assert len(nlp_c(text)) == len_c | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", | ||||
|     [ | ||||
|         ( | ||||
|             "選挙管理委員会", | ||||
|             [None, None, None, None], | ||||
|             [None, None, [ | ||||
|                 [ | ||||
|                     DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), | ||||
|                     DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), | ||||
|                 ] | ||||
|             ]], | ||||
|             [[ | ||||
|                 [ | ||||
|                     DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), | ||||
|                     DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), | ||||
|                     DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), | ||||
|                     DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), | ||||
|                 ], [ | ||||
|                     DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), | ||||
|                     DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), | ||||
|                     DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None), | ||||
|                 ] | ||||
|             ]] | ||||
|         ), | ||||
|     ] | ||||
| ) | ||||
| def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c): | ||||
|     nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) | ||||
|     nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) | ||||
|     nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) | ||||
| 
 | ||||
|     assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a | ||||
|     assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a | ||||
|     assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b | ||||
|     assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("text,inflections,reading_forms", | ||||
|     [ | ||||
|         ( | ||||
|             "取ってつけた", | ||||
|             ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), | ||||
|             ("トッ", "テ", "ツケ", "タ"), | ||||
|         ), | ||||
|     ] | ||||
| ) | ||||
| def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): | ||||
|     assert ja_tokenizer(text).user_data["inflections"] == inflections | ||||
|     assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms | ||||
| 
 | ||||
| 
 | ||||
| def test_ja_tokenizer_emptyish_texts(ja_tokenizer): | ||||
|     doc = ja_tokenizer("") | ||||
|     assert len(doc) == 0 | ||||
|  |  | |||
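For completeness, a small sketch of walking the nested `sub_tokens` structure that the tests above exercise; the iteration pattern is an assumption based on the structure shown in those tests (one entry per token, each either `None` or a list of finer-grained `DetailedToken` lists).

```python
# Sketch of consuming doc.user_data["sub_tokens"] in split mode "C"
# (assumes sudachipy + a Sudachi dictionary are installed).
from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

for token, sub in zip(doc, doc.user_data["sub_tokens"]):
    if sub is None:
        continue  # no finer-grained split for this token
    for finer in sub:  # in mode "C": the mode-A and the mode-B analyses of this token
        print(token.text, "->", [s.surface for s in finer])
```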