mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
291 lines
9.6 KiB
Python
291 lines
9.6 KiB
Python
# encoding: utf8
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import srsly
|
|
from collections import namedtuple, OrderedDict
|
|
|
|
from .stop_words import STOP_WORDS
|
|
from .syntax_iterators import SYNTAX_ITERATORS
|
|
from .tag_map import TAG_MAP
|
|
from .tag_orth_map import TAG_ORTH_MAP
|
|
from .tag_bigram_map import TAG_BIGRAM_MAP
|
|
from ...attrs import LANG
|
|
from ...compat import copy_reg
|
|
from ...errors import Errors
|
|
from ...language import Language
|
|
from ...symbols import POS
|
|
from ...tokens import Doc
|
|
from ...util import DummyTokenizer
|
|
from ... import util
|
|
|
|
|
|
# Lightweight record for one tokenizer result: the surface string, its POS
# information and its lemma.
DetailedToken = namedtuple("DetailedToken", ("surface", "pos", "lemma"))

# Runs of several spaces are awkward to handle, so a stand-in record with the
# same three fields is provided to keep the control flow uniform.
DummyNode = namedtuple("DummyNode", ("surface", "pos", "lemma"))
DummySpace = DummyNode(surface=" ", pos=" ", lemma=" ")
|
|
|
|
|
|
def try_sudachi_import(split_mode="A"):
    """Create a SudachiPy tokenizer, failing loudly if SudachiPy is missing.

    split_mode selects the Sudachi split mode and must be one of "A", "B",
    "C" or None (None is treated as "A").

    Returns the tokenizer object created by SudachiPy's dictionary.

    Raises ImportError, with a message pointing at the SudachiPy project
    page, when an ImportError occurs while importing or setting up SudachiPy.
    """
    try:
        from sudachipy import dictionary, tokenizer

        modes = tokenizer.Tokenizer.SplitMode
        mode_by_name = {None: modes.A, "A": modes.A, "B": modes.B, "C": modes.C}
        return dictionary.Dictionary().create(mode=mode_by_name[split_mode])
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy: "
            "https://github.com/WorksApplications/SudachiPy"
        )
|
|
|
|
|
|
def resolve_pos(orth, pos, next_pos):
    """Resolve the UD POS for a token and, where applicable, for its successor.

    The same Unidic tag can map to different Universal Dependencies POS values
    depending on the literal token text or on the tag of the following token.

    orth is the token text; pos and next_pos are tag sequences whose first
    element is the Unidic tag (next_pos may be None or empty when there is no
    following token).

    Returns a pair (pos_for_this_token, pos_for_next_token); the second item
    is None when the following token is not affected.
    """
    unidic_tag = pos[0]

    # Rules keyed on the literal token text take precedence.
    if unidic_tag in TAG_ORTH_MAP and orth in TAG_ORTH_MAP[unidic_tag]:
        return TAG_ORTH_MAP[unidic_tag][orth], None

    # Otherwise consult the (this tag, next tag) bigram table.
    if next_pos:
        bigram = (unidic_tag, next_pos[0])
        if bigram in TAG_BIGRAM_MAP:
            resolved = TAG_BIGRAM_MAP[bigram]
            if resolved[0] is not None:
                return resolved
            # The table only decides the next token; this one keeps the
            # plain tag-map value.
            return TAG_MAP[unidic_tag][POS], resolved[1]

    return TAG_MAP[unidic_tag][POS], None
|
|
|
|
|
|
# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'「':'」', '『': '』', '【': '】'}


def separate_sentences(doc):
    """Given a doc, mark tokens that start sentences based on Unidic tags.

    Every token in `doc` gets an explicit boolean `sent_start`. The first
    token always starts a sentence. Any other token starts a sentence when
    the previous token is tagged as a sentence-final period
    ("補助記号-句点"), the token itself is not, and no bracket from
    `pairpunct` is currently open.

    `doc` is any iterable of tokens exposing `tag_`, a settable `sent_start`
    and a `str()` form equal to the token text. Nothing is returned; the
    tokens are modified in place.
    """
    stack = []  # closing brackets still awaited, innermost last
    prev_tag = None

    for i, token in enumerate(doc):
        tag = token.tag_
        # Decide each token's value exactly once, when the token itself is
        # visited. Setting it ahead of time on the *next* token would be
        # overwritten by that token's own default on the following iteration.
        token.sent_start = i == 0 or (
            prev_tag == "補助記号-句点" and tag != prev_tag and not stack
        )

        surface = str(token)
        if tag == "補助記号-括弧開" and surface in pairpunct:
            stack.append(pairpunct[surface])
        elif stack and surface == stack[-1]:
            # Closing brackets carry a different tag than opening ones, so
            # they are recognised by their text alone.
            stack.pop()

        prev_tag = tag
|
|
|
|
|
|
def get_dtokens(tokenizer, text):
    """Tokenize `text` and return the result as a list of DetailedToken.

    For each token produced by `tokenizer.tokenize(text)` the `pos` field is
    a pair (tag, inf): `tag` joins the first four part-of-speech fields with
    "-", `inf` joins the remaining ones, and "*" placeholders are dropped
    from both. The lemma is the token's dictionary form.

    Consecutive whitespace-tagged ("空白") tokens are collapsed to the first
    one, and tokens with an empty surface are removed.
    """
    collected = []
    for morpheme in tokenizer.tokenize(text):
        features = morpheme.part_of_speech()
        tag = '-'.join(field for field in features[:4] if field != '*')
        inf = '-'.join(field for field in features[4:] if field != '*')
        dtoken = DetailedToken(
            morpheme.surface(), (tag, inf), morpheme.dictionary_form()
        )
        # Keep only the first of several whitespace tokens in a row.
        if collected and tag == '空白' and collected[-1].pos[0] == '空白':
            continue
        collected.append(dtoken)

    # Characters such as … are normalized inside Sudachi and can yield
    # tokens with an empty surface; drop those.
    return [dtoken for dtoken in collected if len(dtoken.surface) > 0]
|
|
|
|
|
|
def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
    """Align tokenizer output with the original text.

    dtokens is a sequence of records with `surface`, `pos` and `lemma`
    attributes; text is the string they were produced from; gap_tag is the
    tag assigned to stretches of `text` not covered by any token.

    Returns four parallel lists (words, lemmas, tags, spaces). Whitespace
    tokens in `dtokens` are ignored; instead, a single " " directly after a
    word is recorded as a trailing space (spaces[i] is True), and any other
    uncovered stretch of text becomes its own entry tagged `gap_tag`, with
    the stretch itself as lemma.

    Raises ValueError (Errors.E194) if the token surfaces cannot be aligned
    with `text`.
    """
    words = [x.surface for x in dtokens]
    # Ignoring whitespace, the tokens must spell out exactly the text.
    if "".join("".join(words).split()) != "".join(text.split()):
        raise ValueError(Errors.E194.format(text=text, words=words))
    text_words = []
    text_lemmas = []
    text_tags = []
    text_spaces = []
    # handle empty and whitespace-only texts
    if not words:
        return text_words, text_lemmas, text_tags, text_spaces
    if all(word.isspace() for word in words):
        assert text.isspace()
        return [text], [text], [gap_tag], [False]
    # align words with text
    text_pos = 0
    for word, dtoken in zip(words, dtokens):
        if word.isspace():
            # whitespace is reconstructed from the text itself
            continue
        try:
            # Search from the current position instead of slicing, so the
            # remaining text is not copied for every word.
            word_start = text.index(word, text_pos)
        except ValueError:
            raise ValueError(Errors.E194.format(text=text, words=words))
        if word_start > text_pos:
            gap = text[text_pos:word_start]
            text_words.append(gap)
            text_lemmas.append(gap)
            text_tags.append(gap_tag)
            text_spaces.append(False)
            text_pos = word_start
        text_words.append(word)
        text_lemmas.append(dtoken.lemma)
        text_tags.append(dtoken.pos)
        text_spaces.append(False)
        text_pos += len(word)
        # A single following space is folded into the word's space flag.
        if text_pos < len(text) and text[text_pos] == " ":
            text_spaces[-1] = True
            text_pos += 1
    if text_pos < len(text):
        # whatever is left after the last word becomes a final gap entry
        tail = text[text_pos:]
        text_words.append(tail)
        text_lemmas.append(tail)
        text_tags.append(gap_tag)
        text_spaces.append(False)
    return text_words, text_lemmas, text_tags, text_spaces
|
|
|
|
|
|
class JapaneseTokenizer(DummyTokenizer):
    """Japanese tokenizer that delegates segmentation to SudachiPy.

    The only configuration value is "split_mode", which is handed to
    try_sudachi_import and is the only state that gets serialized.
    """

    def __init__(self, cls, nlp=None, config=None):
        """Set up the vocab and the Sudachi tokenizer.

        cls is the language defaults class, used to create a vocab when no
        nlp object is given; config may contain "split_mode".
        """
        # Avoid a shared mutable default argument.
        config = {} if config is None else config
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.split_mode = config.get("split_mode", None)
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text):
        """Tokenize text and return a Doc with tag, POS and lemma set."""
        dtokens = get_dtokens(self.tokenizer, text)

        words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        # POS decided for the upcoming token by the previous resolve_pos call
        next_pos = None
        for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
            token.tag_ = unidic_tag[0]
            if next_pos:
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    unidic_tag,
                    unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
                )

            # Lemmas come from get_words_lemmas_tags_spaces, which already
            # uses the text itself for whitespace gap entries.
            token.lemma_ = lemma
        doc.user_data["unidic_tags"] = unidic_tags

        return doc

    def _get_config(self):
        """Return the serializable settings as an OrderedDict."""
        config = OrderedDict(
            (
                ("split_mode", self.split_mode),
            )
        )
        return config

    def _set_config(self, config=None):
        """Restore settings from a config mapping (missing keys become None)."""
        config = {} if config is None else config
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs):
        """Serialize the tokenizer settings to a bytestring."""
        serializers = OrderedDict(
            (
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
            )
        )
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        """Load settings from a bytestring and rebuild the Sudachi tokenizer.

        Returns self.
        """
        deserializers = OrderedDict(
            (
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
            )
        )
        util.from_bytes(data, deserializers, [])
        # split_mode may have changed, so the tokenizer must be recreated
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path, **kwargs):
        """Write the tokenizer settings as JSON below path."""
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
            )
        )
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        """Load settings from path and rebuild the Sudachi tokenizer.

        Returns self, matching from_bytes.
        """
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (
                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
            )
        )
        util.from_disk(path, serializers, [])
        # split_mode may have changed, so the tokenizer must be recreated
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self
|
|
|
|
|
|
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese: data tables and tokenizer factory."""

    # Copy the base getters so the shared defaults are not modified.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None, config=None):
        """Create a JapaneseTokenizer for nlp with the given config mapping.

        When config is omitted (or None) an empty config is used.
        """
        # A fresh dict per call instead of a shared mutable default.
        return JapaneseTokenizer(cls, nlp, {} if config is None else config)
|
|
|
|
|
|
class Japanese(Language):
    """Language subclass for Japanese ("ja") using JapaneseDefaults."""

    Defaults = JapaneseDefaults
    lang = "ja"

    def make_doc(self, text):
        """Return the result of calling this object's tokenizer on `text`."""
        doc = self.tokenizer(text)
        return doc
|
|
|
|
|
|
def pickle_japanese(instance):
    """Reduction function for pickling Japanese objects.

    Returns the pair (Japanese, ()): the class and an empty argument tuple.
    `instance` itself is not inspected.
    """
    constructor_args = ()
    return Japanese, constructor_args
|
|
|
|
|
|
# Register pickle_japanese as the pickling function for Japanese objects.
# NOTE(review): copy_reg comes from ...compat; presumably the stdlib
# copyreg/copy_reg module -- confirm.
copy_reg.pickle(Japanese, pickle_japanese)


# Names exported by `from <module> import *`.
__all__ = ["Japanese"]
|