Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
	Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: add inflections, reading_forms, sub_tokens; delete unidic_tags; improve code readability around the token alignment procedure
* add test cases, replace fugashi with sudachipy in conftest
* move bunsetu.py to spaCy Universe as a pipeline component, BunsetuRecognizer
* space-token merging: require both surface and tag to be spaces (previously the tag alone)
* handle len(text) == 0
This commit is contained in:
parent  c34420794a
commit  150a39ccca
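Before the diff itself, a quick look at what the new user_data entries give you. This is my own sketch, not part of the commit; it assumes SudachiPy and its dictionary are installed, and borrows the input string from the test cases further down.

from spacy.lang.ja import Japanese

# split_mode "C" produces the coarsest tokens and fills in sub_tokens;
# the default mode "A" leaves the sub_tokens entries as None.
nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

print(doc.user_data["reading_forms"])  # per-token katakana readings from SudachiPy
print(doc.user_data["inflections"])    # per-token inflection features (empty strings for uninflected tokens)
print(doc.user_data["sub_tokens"])     # per-token finer-grained splits as DetailedToken lists, or None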
@@ -20,12 +20,7 @@ from ... import util


 # Hold the attributes we need with convenient names
-DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
-
-# Handling for multiple spaces in a row is somewhat awkward, this simplifies
-# the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
-DummySpace = DummyNode(" ", " ", " ")
+DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])


 def try_sudachi_import(split_mode="A"):
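The namedtuple now carries every SudachiPy-derived attribute and is importable from spacy.lang.ja (the tests below rely on that). Purely for illustration, one DetailedToken built with the new field set, values copied from the test data:

from spacy.lang.ja import DetailedToken

# Field order: surface, tag, inf, lemma, reading, sub_tokens
dtoken = DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='',
                       lemma='委員', reading='イイン', sub_tokens=None)
print(dtoken.surface, dtoken.tag, dtoken.reading)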
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
         )


-def resolve_pos(orth, pos, next_pos):
+def resolve_pos(orth, tag, next_tag):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
     # Some tokens have their UD tag decided based on the POS of the following
     # token.

-    # orth based rules
-    if pos[0] in TAG_ORTH_MAP:
-        orth_map = TAG_ORTH_MAP[pos[0]]
+    # apply orth based mapping
+    if tag in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[tag]
         if orth in orth_map:
-            return orth_map[orth], None
+            return orth_map[orth], None  # current_pos, next_pos

-    # tag bi-gram mapping
-    if next_pos:
-        tag_bigram = pos[0], next_pos[0]
+    # apply tag bi-gram mapping
+    if next_tag:
+        tag_bigram = tag, next_tag
         if tag_bigram in TAG_BIGRAM_MAP:
-            bipos = TAG_BIGRAM_MAP[tag_bigram]
-            if bipos[0] is None:
-                return TAG_MAP[pos[0]][POS], bipos[1]
+            current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
+            if current_pos is None:  # apply tag uni-gram mapping for current_pos
+                return TAG_MAP[tag][POS], next_pos  # only next_pos is identified by tag bi-gram mapping
             else:
-                return bipos
+                return current_pos, next_pos

-    return TAG_MAP[pos[0]][POS], None
+    # apply tag uni-gram mapping
+    return TAG_MAP[tag][POS], None


-# Use a mapping of paired punctuation to avoid splitting quoted sentences.
-pairpunct = {'「':'」', '『': '』', '【': '】'}
-
-
-def separate_sentences(doc):
-    """Given a doc, mark tokens that start sentences based on Unidic tags.
-    """
-
-    stack = [] # save paired punctuation
-
-    for i, token in enumerate(doc[:-2]):
-        # Set all tokens after the first to false by default. This is necessary
-        # for the doc code to be aware we've done sentencization, see
-        # `is_sentenced`.
-        token.sent_start = (i == 0)
-        if token.tag_:
-            if token.tag_ == "補助記号-括弧開":
-                ts = str(token)
-                if ts in pairpunct:
-                    stack.append(pairpunct[ts])
-                elif stack and ts == stack[-1]:
-                    stack.pop()
-
-            if token.tag_ == "補助記号-句点":
-                next_token = doc[i+1]
-                if next_token.tag_ != token.tag_ and not stack:
-                    next_token.sent_start = True
-
-
-def get_dtokens(tokenizer, text):
-    tokens = tokenizer.tokenize(text)
-    words = []
-    for ti, token in enumerate(tokens):
-        tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
-        inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
-        dtoken = DetailedToken(
-                token.surface(),
-                (tag, inf),
-                token.dictionary_form())
-        if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
-            # don't add multiple space tokens in a row
-            continue
-        words.append(dtoken)
-
-    # remove empty tokens. These can be produced with characters like … that
-    # Sudachi normalizes internally.
-    words = [ww for ww in words if len(ww.surface) > 0]
-    return words
-
-
-def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
+def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
+    # Compare the content of tokens and text, first
     words = [x.surface for x in dtokens]
     if "".join("".join(words).split()) != "".join(text.split()):
         raise ValueError(Errors.E194.format(text=text, words=words))
-    text_words = []
-    text_lemmas = []
-    text_tags = []
+
+    text_dtokens = []
     text_spaces = []
     text_pos = 0
     # handle empty and whitespace-only texts
     if len(words) == 0:
-        return text_words, text_lemmas, text_tags, text_spaces
+        return text_dtokens, text_spaces
     elif len([word for word in words if not word.isspace()]) == 0:
         assert text.isspace()
-        text_words = [text]
-        text_lemmas = [text]
-        text_tags = [gap_tag]
+        text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
         text_spaces = [False]
-        return text_words, text_lemmas, text_tags, text_spaces
-    # normalize words to remove all whitespace tokens
-    norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
-    # align words with text
-    for word, dtoken in zip(norm_words, norm_dtokens):
+        return text_dtokens, text_spaces
+
+    # align words and dtokens by referring text, and insert gap tokens for the space char spans
+    for word, dtoken in zip(words, dtokens):
+        # skip all space tokens
+        if word.isspace():
+            continue
         try:
             word_start = text[text_pos:].index(word)
         except ValueError:
             raise ValueError(Errors.E194.format(text=text, words=words))
+
+        # space token
         if word_start > 0:
             w = text[text_pos:text_pos + word_start]
-            text_words.append(w)
-            text_lemmas.append(w)
-            text_tags.append(gap_tag)
+            text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
             text_spaces.append(False)
             text_pos += word_start
-        text_words.append(word)
-        text_lemmas.append(dtoken.lemma)
-        text_tags.append(dtoken.pos)
+
+        # content word
+        text_dtokens.append(dtoken)
         text_spaces.append(False)
         text_pos += len(word)
+        # poll a space char after the word
         if text_pos < len(text) and text[text_pos] == " ":
             text_spaces[-1] = True
             text_pos += 1
+
+    # trailing space token
     if text_pos < len(text):
         w = text[text_pos:]
-        text_words.append(w)
-        text_lemmas.append(w)
-        text_tags.append(gap_tag)
+        text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
         text_spaces.append(False)
-    return text_words, text_lemmas, text_tags, text_spaces
+
+    return text_dtokens, text_spaces


 class JapaneseTokenizer(DummyTokenizer):
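get_dtokens_and_spaces() replaces get_words_lemmas_tags_spaces(): single space characters are dropped from the token list and recorded as trailing whitespace on the preceding token, while any remaining whitespace spans become gap DetailedTokens tagged 空白. A small illustration of that contract, my own example, assuming the helper and the namedtuple are importable from spacy.lang.ja as defined above:

from spacy.lang.ja import DetailedToken, get_dtokens_and_spaces

text = "選挙 管理"
dtokens = [
    DetailedToken("選挙", "名詞-普通名詞-サ変可能", "", "選挙", "センキョ", None),
    DetailedToken(" ", "空白", "", " ", None, None),
    DetailedToken("管理", "名詞-普通名詞-サ変可能", "", "管理", "カンリ", None),
]
aligned, spaces = get_dtokens_and_spaces(dtokens, text)
print([t.surface for t in aligned])  # ['選挙', '管理'] -- the space token is dropped
print(spaces)                        # [True, False] -- and recorded as trailing whitespace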
@@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer = try_sudachi_import(self.split_mode)

     def __call__(self, text):
-        dtokens = get_dtokens(self.tokenizer, text)
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

-        words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
+        sub_tokens_list = list(sub_tokens_list)
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None
-        for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
-            token.tag_ = unidic_tag[0]
-            if next_pos:
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
                 token.pos = next_pos
                 next_pos = None
             else:
                 token.pos, next_pos = resolve_pos(
                     token.orth_,
-                    unidic_tag,
-                    unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None
                 )
-
             # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = lemma
-        doc.user_data["unidic_tags"] = unidic_tags
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list

         return doc

+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
+        sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']),  # tag
+                ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx] if sub_tokens_list else None,  # user_data['sub_tokens']
+            ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t for idx, t in enumerate(dtokens) if
+            idx == 0 or
+            not t.surface.isspace() or t.tag != '空白' or
+            not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if self.split_mode is None or self.split_mode == "A":  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
+        return sub_tokens_list
+
     def _get_config(self):
         config = OrderedDict(
             (
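_get_sub_tokens() only does work for split modes B and C. A sketch of what that means in practice, using the example string from the new tests (my own example; it assumes SudachiPy and its dictionary are installed, and the token counts follow the test data below):

from spacy.lang.ja import Japanese

nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})

doc_a = nlp_a("選挙管理委員会")
doc_c = nlp_c("選挙管理委員会")
print(len(doc_a), doc_a.user_data["sub_tokens"])  # 4 tokens, all sub_tokens None in mode A
print(len(doc_c))                                 # 1 token in mode C
print(doc_c.user_data["sub_tokens"][0])           # [A-mode split, B-mode split] as lists of DetailedToken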
bunsetu.py (removed; per the commit message it moves to spaCy Universe as the BunsetuRecognizer pipeline component)

@@ -1,144 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
-POS_PHRASE_MAP = {
-    "NOUN": "NP",
-    "NUM": "NP",
-    "PRON": "NP",
-    "PROPN": "NP",
-
-    "VERB": "VP",
-
-    "ADJ": "ADJP",
-
-    "ADV": "ADVP",
-
-    "CCONJ": "CCONJP",
-}
-
-
-# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
-def yield_bunsetu(doc, debug=False):
-    bunsetu = []
-    bunsetu_may_end = False
-    phrase_type = None
-    phrase = None
-    prev = None
-    prev_tag = None
-    prev_dep = None
-    prev_head = None
-    for t in doc:
-        pos = t.pos_
-        pos_type = POS_PHRASE_MAP.get(pos, None)
-        tag = t.tag_
-        dep = t.dep_
-        head = t.head.i
-        if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
-
-        # DET is always an individual bunsetu
-        if pos == "DET":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            yield [t], None, None
-            bunsetu = []
-            bunsetu_may_end = False
-            phrase_type = None
-            phrase = None
-
-        # PRON or Open PUNCT always splits bunsetu
-        elif tag == "補助記号-括弧開":
-            if bunsetu:
-                yield bunsetu, phrase_type, phrase
-            bunsetu = [t]
-            bunsetu_may_end = True
-            phrase_type = None
-            phrase = None
-
-        # bunsetu head not appeared
-        elif phrase_type is None:
-            if bunsetu and prev_tag == "補助記号-読点":
-                yield bunsetu, phrase_type, phrase
-                bunsetu = []
-                bunsetu_may_end = False
-                phrase_type = None
-                phrase = None
-            bunsetu.append(t)
-            if pos_type:  # begin phrase
-                phrase = [t]
-                phrase_type = pos_type
-                if pos_type in {"ADVP", "CCONJP"}:
-                    bunsetu_may_end = True
-
-        # entering new bunsetu
-        elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
-        ):
-            # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
-                bunsetu.append(t)
-                phrase_type = "VP"
-                phrase.append(t)
-            # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                    prev_dep == 'compound' and prev_head == t.i or
-                    dep == 'compound' and prev == head or
-                    prev_dep == 'nmod' and prev_head == t.i
-            ):
-                bunsetu.append(t)
-                phrase_type = "NP"
-                phrase.append(t)
-            else:
-                yield bunsetu, phrase_type, phrase
-                bunsetu = [t]
-                bunsetu_may_end = False
-                phrase_type = pos_type
-                phrase = [t]
-
-        # NOUN bunsetu
-        elif phrase_type == "NP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # VERB bunsetu
-        elif phrase_type == "VP":
-            bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
-            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
-                phrase.append(t)
-            else:
-                bunsetu_may_end = True
-
-        # other bunsetu
-        else:
-            bunsetu.append(t)
-
-        prev = t.i
-        prev_tag = t.tag_
-        prev_dep = t.dep_
-        prev_head = head
-
-    if bunsetu:
-        yield bunsetu, phrase_type, phrase
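The module above is relocated rather than refactored: per the commit message it moves to spaCy Universe as the BunsetuRecognizer pipeline component. For context only, a rough pre-commit usage sketch of the removed helper (hypothetical wrapper; yield_bunsetu needs a doc that already carries pos_, tag_, dep_ and head annotations, and this import path disappears with the commit):

from spacy.lang.ja.bunsetu import yield_bunsetu


def print_bunsetu(doc):
    # Each yielded triple is (bunsetu tokens, phrase type such as 'NP' or 'VP', phrase tokens).
    for bunsetu_tokens, phrase_type, phrase_tokens in yield_bunsetu(doc):
        print([t.orth_ for t in bunsetu_tokens], phrase_type)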
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest

 from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
-from spacy.lang.ja import Japanese
+from spacy.lang.ja import Japanese, DetailedToken

 # fmt: off
 TOKENIZER_TESTS = [
@@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_c(text)) == len_c


+@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c",
+    [
+        (
+            "選挙管理委員会",
+            [None, None, None, None],
+            [None, None, [
+                [
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ]
+            ]],
+            [[
+                [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
+                    DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
+                ], [
+                    DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
+                    DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
+                    DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None),
+                ]
+            ]]
+        ),
+    ]
+)
+def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c):
+    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
+    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
+    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
+
+    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
+    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
+
+
+@pytest.mark.parametrize("text,inflections,reading_forms",
+    [
+        (
+            "取ってつけた",
+            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
+            ("トッ", "テ", "ツケ", "タ"),
+        ),
+    ]
+)
+def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
+    assert ja_tokenizer(text).user_data["inflections"] == inflections
+    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
+
+
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
     doc = ja_tokenizer("")
     assert len(doc) == 0