mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Refactor loading of morphology exceptions, adding a method add_special_case.
This commit is contained in:
parent
46e98ec029
commit
57c4341453
|
@ -1,4 +1,7 @@
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
from libc.string cimport memset
|
||||||
|
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -85,36 +88,45 @@ cdef class Morphology:
|
||||||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
|
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def load_morph_exceptions(self, dict exc):
|
def add_special_case(self, unicode tag_str, unicode orth_str, props, force=False):
|
||||||
# Map (form, pos) to (lemma, rich tag)
|
'''Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
cdef unicode pos_str
|
tag and orth match the rule will receive the specified properties.
|
||||||
cdef unicode form_str
|
|
||||||
cdef unicode lemma_str
|
Arguments:
|
||||||
cdef dict entries
|
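tag_str (unicode): The part-of-speech tag to key the exception.
|
||||||
cdef dict props
|
orth_str (unicode): The word-form to key the exception.
props (dict): The properties that matching tokens will receive.
force (bool): If True, overwrite an existing rule for the same (tag, orth) pair; otherwise a conflicting rule raises ValueError.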
|
||||||
cdef int lemma
|
'''
|
||||||
cdef attr_t orth
|
|
||||||
cdef attr_t tag_id
|
|
||||||
cdef int pos
|
|
||||||
cdef RichTagC rich_tag
|
|
||||||
for tag_str, entries in exc.items():
|
|
||||||
tag = self.strings[tag_str]
|
tag = self.strings[tag_str]
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
|
orth = self.strings[orth_str]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
for form_str, props in entries.items():
|
props = _normalize_props(props)
|
||||||
|
|
||||||
|
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
|
||||||
|
if cached is NULL:
|
||||||
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
cached.tag = rich_tag
|
elif force:
|
||||||
orth = self.strings[form_str]
|
memset(cached, 0, sizeof(cached[0]))
|
||||||
for name_str, value_str in props.items():
|
|
||||||
if name_str == 'L':
|
|
||||||
cached.lemma = self.strings[value_str]
|
|
||||||
else:
|
else:
|
||||||
|
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
|
||||||
|
"to overwrite.")
|
||||||
|
msg = msg % (tag_str, orth_str)
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
cached.tag = rich_tag
|
||||||
|
for name_str, value_str in props.items():
|
||||||
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
||||||
if cached.lemma == 0:
|
if cached.lemma == 0:
|
||||||
cached.lemma = self.lemmatize(rich_tag.pos, orth,
|
cached.lemma = self.lemmatize(rich_tag.pos, orth,
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
self._cache.set(tag_id, orth, <void*>cached)
|
||||||
|
|
||||||
|
def load_morph_exceptions(self, dict exc):
|
||||||
|
# Map (form, pos) to (lemma, rich tag)
|
||||||
|
for tag_str, entries in exc.items():
|
||||||
|
for form_str, props in entries.items():
|
||||||
|
self.add_special_case(tag_str, form_str, props)
|
||||||
|
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user