spaCy/spacy/lang/ja/__init__.py
adrianeboyd f162815f45
Handle empty and whitespace-only docs for Japanese (#5564)
Handle empty and whitespace-only docs in the custom alignment method
used by the Japanese tokenizer.
2020-06-08 21:09:23 +02:00

292 lines
9.6 KiB
Python

# encoding: utf8
from __future__ import unicode_literals, print_function
import srsly
from collections import namedtuple, OrderedDict
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer
from ... import util
# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")
def try_sudachi_import(split_mode="A"):
"""SudachiPy is required for Japanese support, so check for it.
It it's not available blow up and explain how to fix it.
split_mode should be one of these values: "A", "B", "C", None->"A"."""
try:
from sudachipy import dictionary, tokenizer
split_mode = {
None: tokenizer.Tokenizer.SplitMode.A,
"A": tokenizer.Tokenizer.SplitMode.A,
"B": tokenizer.Tokenizer.SplitMode.B,
"C": tokenizer.Tokenizer.SplitMode.C,
}[split_mode]
tok = dictionary.Dictionary().create(
mode=split_mode
)
return tok
except ImportError:
raise ImportError(
"Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy"
)
def resolve_pos(orth, pos, next_pos):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function returns resolved POSs for both token
and next_token by tuple.
"""
# Some tokens have their UD tag decided based on the POS of the following
# token.
# orth based rules
if pos[0] in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[pos[0]]
if orth in orth_map:
return orth_map[orth], None
# tag bi-gram mapping
if next_pos:
tag_bigram = pos[0], next_pos[0]
if tag_bigram in TAG_BIGRAM_MAP:
bipos = TAG_BIGRAM_MAP[tag_bigram]
if bipos[0] is None:
return TAG_MAP[pos[0]][POS], bipos[1]
else:
return bipos
return TAG_MAP[pos[0]][POS], None
# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'':'', '': '', '': ''}
def separate_sentences(doc):
"""Given a doc, mark tokens that start sentences based on Unidic tags.
"""
stack = [] # save paired punctuation
for i, token in enumerate(doc[:-2]):
# Set all tokens after the first to false by default. This is necessary
# for the doc code to be aware we've done sentencization, see
# `is_sentenced`.
token.sent_start = (i == 0)
if token.tag_:
if token.tag_ == "補助記号-括弧開":
ts = str(token)
if ts in pairpunct:
stack.append(pairpunct[ts])
elif stack and ts == stack[-1]:
stack.pop()
if token.tag_ == "補助記号-句点":
next_token = doc[i+1]
if next_token.tag_ != token.tag_ and not stack:
next_token.sent_start = True
def get_dtokens(tokenizer, text):
tokens = tokenizer.tokenize(text)
words = []
for ti, token in enumerate(tokens):
tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
dtoken = DetailedToken(
token.surface(),
(tag, inf),
token.dictionary_form())
if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
# don't add multiple space tokens in a row
continue
words.append(dtoken)
# remove empty tokens. These can be produced with characters like … that
# Sudachi normalizes internally.
words = [ww for ww in words if len(ww.surface) > 0]
return words
def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
words = [x.surface for x in dtokens]
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_lemmas = []
text_tags = []
text_spaces = []
text_pos = 0
# handle empty and whitespace-only texts
if len(words) == 0:
return text_words, text_lemmas, text_tags, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_words = [text]
text_lemmas = [text]
text_tags = [gap_tag]
text_spaces = [False]
return text_words, text_lemmas, text_tags, text_spaces
# normalize words to remove all whitespace tokens
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
# align words with text
for word, dtoken in zip(norm_words, norm_dtokens):
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words))
if word_start > 0:
w = text[text_pos:text_pos + word_start]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_lemmas.append(dtoken.lemma)
text_tags.append(dtoken.pos)
text_spaces.append(False)
text_pos += len(word)
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1
if text_pos < len(text):
w = text[text_pos:]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_spaces.append(False)
return text_words, text_lemmas, text_tags, text_spaces
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.split_mode = config.get("split_mode", None)
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text):
dtokens = get_dtokens(self.tokenizer, text)
words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None
for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
token.tag_ = unidic_tag[0]
if next_pos:
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
unidic_tag,
unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = lemma
doc.user_data["unidic_tags"] = unidic_tags
separate_sentences(doc)
return doc
def _get_config(self):
config = OrderedDict(
(
("split_mode", self.split_mode),
)
)
return config
def _set_config(self, config={}):
self.split_mode = config.get("split_mode", None)
def to_bytes(self, **kwargs):
serializers = OrderedDict(
(
("cfg", lambda: srsly.json_dumps(self._get_config())),
)
)
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
deserializers = OrderedDict(
(
("cfg", lambda b: self._set_config(srsly.json_loads(b))),
)
)
util.from_bytes(data, deserializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
def to_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(
("cfg", lambda p: srsly.write_json(p, self._get_config())),
)
)
return util.to_disk(path, serializers, [])
def from_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(
("cfg", lambda p: self._set_config(srsly.read_json(p))),
)
)
util.from_disk(path, serializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
return JapaneseTokenizer(cls, nlp, config)
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
def make_doc(self, text):
return self.tokenizer(text)
def pickle_japanese(instance):
return Japanese, tuple()
copy_reg.pickle(Japanese, pickle_japanese)
__all__ = ["Japanese"]