Update Morphology to load exceptions as MORPH_RULES

Update `Morphology` to load exceptions in `Morphology.__init__` and
`Morphology.load_morph_exceptions` from the format used in `MORPH_RULES`
rather than the internal format with tuple keys.

* Rename to `Morphology.exc` to `Morphology._exc` for internal use with
tuple keys
* Add `Morphology.exc` as a property that converts the internal `_exc`
back to `MORPH_RULES` format, primarily for serialization
This commit is contained in:
Adriane Boyd 2020-07-16 21:16:49 +02:00
parent d83e3c44c5
commit d106cf66dd
3 changed files with 25 additions and 17 deletions

View File

@ -78,9 +78,7 @@ class BaseDefaults:
BASE_NORMS,
vocab.lookups.get_table("lexeme_norm"),
)
for tag_str, exc in cls.morph_rules.items():
for orth_str, attrs in exc.items():
vocab.morphology.add_special_case(tag_str, orth_str, attrs)
vocab.morphology.load_morph_exceptions(cls.morph_rules)
return vocab
@classmethod

View File

@ -20,7 +20,7 @@ cdef class Morphology:
cdef readonly object tag_map
cdef readonly object tag_names
cdef readonly object reverse_index
cdef readonly object exc
cdef readonly object _exc
cdef readonly PreshMapArray _cache
cdef readonly int n_tags

View File

@ -82,12 +82,9 @@ cdef class Morphology:
self._load_from_tag_map(tag_map)
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
self._exc = {}
if exc is not None:
for (tag, orth), attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
self.load_morph_exceptions(exc)
def _load_from_tag_map(self, tag_map):
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
@ -98,7 +95,7 @@ cdef class Morphology:
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
self._exc), None, None)
def add(self, features):
"""Insert a morphological analysis in the morphology table, if not
@ -208,7 +205,7 @@ cdef class Morphology:
attrs = _normalize_props(attrs)
self.add(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
@ -254,21 +251,34 @@ cdef class Morphology:
token.pos = <univ_pos_t>pos
token.tag = self.strings[tag_str]
token.morph = self.add(features)
if (self.tag_names[tag_id], token.lex.orth) in self.exc:
if (self.tag_names[tag_id], token.lex.orth) in self._exc:
self._assign_tag_from_exceptions(token, tag_id)
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
key = (self.tag_names[tag_id], token.lex.orth)
cdef dict attrs
attrs = self.exc[key]
attrs = self._exc[key]
token.pos = attrs.get(POS, token.pos)
token.lemma = attrs.get(LEMMA, token.lemma)
def load_morph_exceptions(self, dict exc):
def load_morph_exceptions(self, dict morph_rules):
self._exc = {}
# Map (form, pos) to attributes
for tag_str, entries in exc.items():
for form_str, attrs in entries.items():
self.add_special_case(tag_str, form_str, attrs)
for tag, exc in morph_rules.items():
for orth, attrs in exc.items():
attrs = _normalize_props(attrs)
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
@property
def exc(self):
# generate the serializable exc in the MORPH_RULES format from the
# internal tuple-key format
morph_rules = {}
for (tag, orth) in sorted(self._exc):
if not tag in morph_rules:
morph_rules[tag] = {}
morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
return morph_rules
@staticmethod
def feats_to_dict(feats):