Japanese model: add user_dict entries and small refactor (#5573)

* user_data fields: add inflections, reading_forms, sub_tokens; remove unidic_tags (see the usage sketch below)
* improve code readability around the token alignment procedure

* add test cases, replace fugashi with sudachipy in conftest

* move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer

* space-token merging: merge only when both surface and tag are spaces (was: tag only)

* handle the len(text) == 0 case
Hiroshi Matsuda 2020-06-22 21:32:25 +09:00 committed by GitHub
parent c34420794a
commit 150a39ccca
3 changed files with 152 additions and 248 deletions
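
To illustrate the new user_data entries and the split_mode config at the API level, here is a minimal usage sketch. It is not part of the diff; it assumes spaCy v2 with SudachiPy and its dictionary installed, and the exact analyses depend on the dictionary version.

# Minimal sketch (not part of the diff): assumes SudachiPy and its dictionary
# are installed; analyses vary with the dictionary version.
from spacy.lang.ja import Japanese

nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

print([t.text for t in doc])            # surface forms of the coarse C-mode tokens
print(doc.user_data["inflections"])     # per-token inflection strings, "" when absent
print(doc.user_data["reading_forms"])   # per-token katakana readings
print(doc.user_data["sub_tokens"])      # per-token finer A/B-mode splits, or None entries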

spacy/lang/ja/__init__.py

@@ -20,12 +20,7 @@ from ... import util
# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")
DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])
def try_sudachi_import(split_mode="A"):
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
)
def resolve_pos(orth, pos, next_pos):
def resolve_pos(orth, tag, next_tag):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
# Some tokens have their UD tag decided based on the POS of the following
# token.
# orth based rules
if pos[0] in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[pos[0]]
# apply orth based mapping
if tag in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[tag]
if orth in orth_map:
return orth_map[orth], None
return orth_map[orth], None # current_pos, next_pos
# tag bi-gram mapping
if next_pos:
tag_bigram = pos[0], next_pos[0]
# apply tag bi-gram mapping
if next_tag:
tag_bigram = tag, next_tag
if tag_bigram in TAG_BIGRAM_MAP:
bipos = TAG_BIGRAM_MAP[tag_bigram]
if bipos[0] is None:
return TAG_MAP[pos[0]][POS], bipos[1]
current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
if current_pos is None: # apply tag uni-gram mapping for current_pos
return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping
else:
return bipos
return current_pos, next_pos
return TAG_MAP[pos[0]][POS], None
# apply tag uni-gram mapping
return TAG_MAP[tag][POS], None
# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'「':'」', '『': '』', '【': '】'}
def separate_sentences(doc):
"""Given a doc, mark tokens that start sentences based on Unidic tags.
"""
stack = [] # save paired punctuation
for i, token in enumerate(doc[:-2]):
# Set all tokens after the first to false by default. This is necessary
# for the doc code to be aware we've done sentencization, see
# `is_sentenced`.
token.sent_start = (i == 0)
if token.tag_:
if token.tag_ == "補助記号-括弧開":
ts = str(token)
if ts in pairpunct:
stack.append(pairpunct[ts])
elif stack and ts == stack[-1]:
stack.pop()
if token.tag_ == "補助記号-句点":
next_token = doc[i+1]
if next_token.tag_ != token.tag_ and not stack:
next_token.sent_start = True
def get_dtokens(tokenizer, text):
tokens = tokenizer.tokenize(text)
words = []
for ti, token in enumerate(tokens):
tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
dtoken = DetailedToken(
token.surface(),
(tag, inf),
token.dictionary_form())
if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
# don't add multiple space tokens in a row
continue
words.append(dtoken)
# remove empty tokens. These can be produced with characters like … that
# Sudachi normalizes internally.
words = [ww for ww in words if len(ww.surface) > 0]
return words
def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# Compare the content of tokens and text, first
words = [x.surface for x in dtokens]
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_lemmas = []
text_tags = []
text_dtokens = []
text_spaces = []
text_pos = 0
# handle empty and whitespace-only texts
if len(words) == 0:
return text_words, text_lemmas, text_tags, text_spaces
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_words = [text]
text_lemmas = [text]
text_tags = [gap_tag]
text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
text_spaces = [False]
return text_words, text_lemmas, text_tags, text_spaces
# normalize words to remove all whitespace tokens
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
# align words with text
for word, dtoken in zip(norm_words, norm_dtokens):
return text_dtokens, text_spaces
# align words and dtokens by referring to the text, and insert gap tokens for the space char spans
for word, dtoken in zip(words, dtokens):
# skip all space tokens
if word.isspace():
continue
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words))
# space token
if word_start > 0:
w = text[text_pos:text_pos + word_start]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_lemmas.append(dtoken.lemma)
text_tags.append(dtoken.pos)
# content word
text_dtokens.append(dtoken)
text_spaces.append(False)
text_pos += len(word)
# poll a space char after the word
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1
# trailing space token
if text_pos < len(text):
w = text[text_pos:]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
return text_words, text_lemmas, text_tags, text_spaces
return text_dtokens, text_spaces
class JapaneseTokenizer(DummyTokenizer):
@@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer):
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text):
dtokens = get_dtokens(self.tokenizer, text)
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None
for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
token.tag_ = unidic_tag[0]
if next_pos:
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
unidic_tag,
unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = lemma
doc.user_data["unidic_tags"] = unidic_tags
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
dtokens = [
DetailedToken(
token.surface(), # orth
'-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag
','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens']
) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t for idx, t in enumerate(dtokens) if
idx == 0 or
not t.surface.isspace() or t.tag != '空白' or
not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
]
def _get_sub_tokens(self, sudachipy_tokens):
if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
return sub_tokens_list
def _get_config(self):
config = OrderedDict(
(
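
Since the orth → tag-bigram → tag-unigram fallback in resolve_pos() is easy to lose inside the diff, the standalone sketch below restates it. The map contents are hypothetical stand-ins, not the real TAG_ORTH_MAP / TAG_BIGRAM_MAP / TAG_MAP entries shipped with spaCy.

# Standalone sketch of the resolve_pos() fallback order. The map contents
# below are hypothetical stand-ins, not the real spaCy tag maps.
TAG_ORTH_MAP = {"副詞": {"あまり": "ADV"}}                    # tag -> {orth -> POS}
TAG_BIGRAM_MAP = {("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (None, "AUX")}
TAG_MAP = {"名詞-普通名詞-サ変可能": "NOUN", "副詞": "ADV", "空白": "SPACE"}

def resolve_pos_sketch(orth, tag, next_tag):
    # 1. orth-specific override for the current tag
    if tag in TAG_ORTH_MAP and orth in TAG_ORTH_MAP[tag]:
        return TAG_ORTH_MAP[tag][orth], None
    # 2. tag bi-gram rule; may also pre-resolve the next token's POS
    if next_tag and (tag, next_tag) in TAG_BIGRAM_MAP:
        current_pos, next_pos = TAG_BIGRAM_MAP[(tag, next_tag)]
        if current_pos is None:  # fall back to the uni-gram map for the current token
            return TAG_MAP[tag], next_pos
        return current_pos, next_pos
    # 3. plain tag uni-gram mapping
    return TAG_MAP[tag], None

# e.g. with the illustrative maps above, a サ変 noun followed by a
# non-independent verb stays NOUN and pre-resolves the next token as AUX:
assert resolve_pos_sketch("検討", "名詞-普通名詞-サ変可能", "動詞-非自立可能") == ("NOUN", "AUX")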

spacy/lang/ja/bunsetu.py (deleted)

@@ -1,144 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
POS_PHRASE_MAP = {
"NOUN": "NP",
"NUM": "NP",
"PRON": "NP",
"PROPN": "NP",
"VERB": "VP",
"ADJ": "ADJP",
"ADV": "ADVP",
"CCONJ": "CCONJP",
}
# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
def yield_bunsetu(doc, debug=False):
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
prev = None
prev_tag = None
prev_dep = None
prev_head = None
for t in doc:
pos = t.pos_
pos_type = POS_PHRASE_MAP.get(pos, None)
tag = t.tag_
dep = t.dep_
head = t.head.i
if debug:
print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
# DET is always an individual bunsetu
if pos == "DET":
if bunsetu:
yield bunsetu, phrase_type, phrase
yield [t], None, None
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
# PRON or Open PUNCT always splits bunsetu
elif tag == "補助記号-括弧開":
if bunsetu:
yield bunsetu, phrase_type, phrase
bunsetu = [t]
bunsetu_may_end = True
phrase_type = None
phrase = None
# bunsetu head not appeared
elif phrase_type is None:
if bunsetu and prev_tag == "補助記号-読点":
yield bunsetu, phrase_type, phrase
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
bunsetu.append(t)
if pos_type: # begin phrase
phrase = [t]
phrase_type = pos_type
if pos_type in {"ADVP", "CCONJP"}:
bunsetu_may_end = True
# entering new bunsetu
elif pos_type and (
pos_type != phrase_type or # different phrase type arises
bunsetu_may_end # same phrase type but bunsetu already ended
):
# exceptional case: NOUN to VERB
if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
bunsetu.append(t)
phrase_type = "VP"
phrase.append(t)
# exceptional case: VERB to NOUN
elif phrase_type == "VP" and pos_type == "NP" and (
prev_dep == 'compound' and prev_head == t.i or
dep == 'compound' and prev == head or
prev_dep == 'nmod' and prev_head == t.i
):
bunsetu.append(t)
phrase_type = "NP"
phrase.append(t)
else:
yield bunsetu, phrase_type, phrase
bunsetu = [t]
bunsetu_may_end = False
phrase_type = pos_type
phrase = [t]
# NOUN bunsetu
elif phrase_type == "NP":
bunsetu.append(t)
if not bunsetu_may_end and ((
(pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
phrase.append(t)
else:
bunsetu_may_end = True
# VERB bunsetu
elif phrase_type == "VP":
bunsetu.append(t)
if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
phrase.append(t)
else:
bunsetu_may_end = True
# ADJ bunsetu
elif phrase_type == "ADJP" and tag != '連体詞':
bunsetu.append(t)
if not bunsetu_may_end and ((
pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
phrase.append(t)
else:
bunsetu_may_end = True
# other bunsetu
else:
bunsetu.append(t)
prev = t.i
prev_tag = t.tag_
prev_dep = t.dep_
prev_head = head
if bunsetu:
yield bunsetu, phrase_type, phrase
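
For reference, the removed helper was driven roughly as follows. This is a sketch, not code from the repository; it needs a Doc that has already been tagged and dependency-parsed, and the functionality now lives in the spaCy Universe as the BunsetuRecognizer pipeline component.

# Rough usage sketch of the removed helper (import path as it was before this
# commit; requires a tagged and dependency-parsed Doc).
from spacy.lang.ja.bunsetu import yield_bunsetu

def print_bunsetu(doc):
    for bunsetu, phrase_type, phrase in yield_bunsetu(doc):
        print(
            "".join(t.orth_ for t in bunsetu),                      # bunsetu surface
            phrase_type,                                            # NP/VP/ADJP/ADVP/CCONJP or None
            "".join(t.orth_ for t in phrase) if phrase else None,   # phrase tokens inside the bunsetu
        )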

spacy/tests/lang/ja/test_tokenizer.py

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
from spacy.lang.ja import Japanese
from spacy.lang.ja import Japanese, DetailedToken
# fmt: off
TOKENIZER_TESTS = [
@@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c
@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c",
[
(
"選挙管理委員会",
[None, None, None, None],
[None, None, [
[
DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
]
]],
[[
[
DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None),
DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None),
], [
DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None),
DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None),
DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None),
]
]]
),
]
)
def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c):
nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@pytest.mark.parametrize("text,inflections,reading_forms",
[
(
"取ってつけた",
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
("トッ", "", "ツケ", ""),
),
]
)
def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
assert ja_tokenizer(text).user_data["inflections"] == inflections
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
doc = ja_tokenizer("")
assert len(doc) == 0
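
The len(text) == 0 and whitespace-only branches added to get_dtokens_and_spaces() can also be checked at the API level; a small sketch of the expected behaviour (assuming SudachiPy is installed):

# Sketch of the empty / whitespace-only handling (expected behaviour, assuming
# SudachiPy and its dictionary are installed).
from spacy.lang.ja import Japanese

nlp = Japanese()
assert len(nlp("")) == 0              # empty text -> no tokens
ws = nlp("   ")
assert len(ws) == 1                   # whitespace-only text -> a single gap token
assert ws[0].text == "   " and ws[0].tag_ == "空白"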