2015-08-26 20:17:21 +03:00
|
|
|
from os import path
|
2015-08-27 10:16:11 +03:00
|
|
|
from .lemmatizer import Lemmatizer
|
2014-12-09 13:16:17 +03:00
|
|
|
|
2015-08-26 20:17:21 +03:00
|
|
|
try:
|
|
|
|
import ujson as json
|
|
|
|
except ImportError:
|
|
|
|
import json
|
2014-12-09 17:02:04 +03:00
|
|
|
|
2015-08-28 03:02:33 +03:00
|
|
|
from .parts_of_speech import UNIV_POS_NAMES
|
2015-10-09 11:02:42 +03:00
|
|
|
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
2015-08-28 00:11:51 +03:00
|
|
|
|
|
|
|
|
2015-08-26 20:17:21 +03:00
|
|
|
cdef class Morphology:
    """Assign fine-grained tags, coarse parts of speech and lemmas to tokens.

    The tag map given to the constructor is turned into a C array of
    ``RichTagC`` structs, indexed by the position of each tag string in sorted
    order.  Analyses are memoised per (tag id, orth id) pair in ``_cache``,
    so the lemmatizer runs at most once for each pair.
    """

    def __init__(self, StringStore string_store, tag_map, lemmatizer):
        """Build the rich-tag table from ``tag_map``.

        Args:
            string_store: Store used to map strings to integer ids and back.
            tag_map: Mapping from tag string to a properties mapping.  Each
                properties mapping must have a ``'pos'`` entry whose
                upper-cased value is a key of ``UNIV_POS_NAMES``.
            lemmatizer: Callable invoked as ``lemmatizer(string, pos)`` and
                expected to return a set of unicode lemma candidates, or
                ``None`` to disable lemmatization.

        Raises:
            KeyError: If a properties mapping lacks ``'pos'`` or names a part
                of speech missing from ``UNIV_POS_NAMES``.
        """
        # Sort once: tag ids, `tag_names` and `rich_tags` all share this order.
        sorted_tags = sorted(tag_map.items())

        self.mem = Pool()
        self.strings = string_store
        self.lemmatizer = lemmatizer
        # One slot more than there are tags; the extra slot is never filled
        # in below.
        self.n_tags = len(tag_map) + 1
        self.tag_names = tuple([tag_str for tag_str, _ in sorted_tags])
        self.reverse_index = {}

        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
        for i, (tag_str, props) in enumerate(sorted_tags):
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings[tag_str]
            self.rich_tags[i].morph = 0
            self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
            # Lets callers go from the tag's string id back to its index.
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)

    cdef int assign_tag(self, TokenC* token, tag) except -1:
        """Set ``lemma``, ``pos``, ``tag`` and ``morph`` on ``token``.

        Args:
            token: Token to update in place.
            tag: Either a tag string, or the integer index of the tag in the
                rich-tag table.

        Raises:
            KeyError: If ``tag`` is a string that is not in the tag map.
            IndexError: If ``tag`` is an integer outside ``[0, n_tags)``.
        """
        cdef int tag_id
        cdef MorphAnalysisC* analysis

        if isinstance(tag, basestring):
            tag_id = self.reverse_index[self.strings[tag]]
        else:
            tag_id = tag
        # `rich_tags` is a raw C array, so an out-of-range id would read
        # arbitrary memory instead of raising.  Reject it explicitly.
        if tag_id < 0 or tag_id >= self.n_tags:
            raise IndexError(
                "Tag id %d is out of range for a table of %d tags"
                % (tag_id, self.n_tags))

        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL:
            # First time this (tag, word) pair is seen: analyse and memoise.
            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            analysis.tag = self.rich_tags[tag_id]
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
            self._cache.set(tag_id, token.lex.orth, <void*>analysis)

        token.lemma = analysis.lemma
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph

    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
        """Placeholder for setting a morphological feature; currently a no-op."""
        pass

    def load_morph_exceptions(self, dict exc):
        """Pre-populate the analysis cache with hand-specified exceptions.

        Args:
            exc: Mapping ``{tag string: {word form: {property: value}}}``.
                The property ``'L'`` supplies the lemma; every other property
                is passed to ``assign_feature``.

        Raises:
            KeyError: If a tag string in ``exc`` is not in the tag map.
        """
        # Map (form, tag) to (lemma, rich tag)
        cdef unicode form_str
        cdef dict entries
        cdef dict props
        cdef attr_t orth
        cdef attr_t tag_id
        cdef RichTagC rich_tag
        cdef MorphAnalysisC* cached

        for tag_str, entries in exc.items():
            tag = self.strings[tag_str]
            tag_id = self.reverse_index[tag]
            rich_tag = self.rich_tags[tag_id]
            for form_str, props in entries.items():
                cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
                # Struct assignment copies the tag, so feature edits below
                # affect only this cached entry.
                cached.tag = rich_tag
                orth = self.strings[form_str]
                for name_str, value_str in props.items():
                    if name_str == 'L':
                        cached.lemma = self.strings[value_str]
                    else:
                        self.assign_feature(&cached.tag.morph, name_str, value_str)
                # No explicit lemma given.  NOTE(review): this relies on
                # Pool.alloc returning zeroed memory -- confirm against cymem.
                if cached.lemma == 0:
                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
                self._cache.set(tag_id, orth, <void*>cached)

    def lemmatize(self, const univ_pos_t pos, attr_t orth):
        """Return the string id of the lemma for ``orth`` under ``pos``.

        ``orth`` is returned unchanged when no lemmatizer is configured, when
        ``pos`` is not one of NOUN, VERB, ADJ or PUNCT, or when the lemmatizer
        yields no candidates.  With several candidates, the smallest one in
        string order is chosen so the result is deterministic.

        Args:
            pos: Coarse part-of-speech of the word.
            orth: String id of the word form.

        Returns:
            The id of the lemma string in ``self.strings``, or ``orth``.
        """
        cdef unicode py_string
        cdef unicode lemma_string
        cdef set lemma_strings

        if self.lemmatizer is None:
            return orth
        if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT:
            return orth

        # Only resolve the string once we know the lemmatizer will be called.
        py_string = self.strings[orth]
        lemma_strings = self.lemmatizer(py_string, pos)
        if not lemma_strings:
            # Nothing proposed: fall back to the surface form instead of
            # failing on an empty collection.
            return orth
        lemma_string = min(lemma_strings)
        return self.strings[lemma_string]
|