mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 13:14:11 +03:00
Update Morphology to load exceptions as MORPH_RULES
Update `Morphology` to load exceptions in `Morphology.__init__` and `Morphology.load_morph_exceptions` from the format used in `MORPH_RULES` rather than the internal format with tuple keys.
* Rename `Morphology.exc` to `Morphology._exc` for internal use with tuple keys
* Add `Morphology.exc` as a property that converts the internal `_exc` back to `MORPH_RULES` format, primarily for serialization
This commit is contained in:
parent
d83e3c44c5
commit
d106cf66dd
|
@ -78,9 +78,7 @@ class BaseDefaults:
|
||||||
BASE_NORMS,
|
BASE_NORMS,
|
||||||
vocab.lookups.get_table("lexeme_norm"),
|
vocab.lookups.get_table("lexeme_norm"),
|
||||||
)
|
)
|
||||||
for tag_str, exc in cls.morph_rules.items():
|
vocab.morphology.load_morph_exceptions(cls.morph_rules)
|
||||||
for orth_str, attrs in exc.items():
|
|
||||||
vocab.morphology.add_special_case(tag_str, orth_str, attrs)
|
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
|
@ -20,7 +20,7 @@ cdef class Morphology:
|
||||||
cdef readonly object tag_map
|
cdef readonly object tag_map
|
||||||
cdef readonly object tag_names
|
cdef readonly object tag_names
|
||||||
cdef readonly object reverse_index
|
cdef readonly object reverse_index
|
||||||
cdef readonly object exc
|
cdef readonly object _exc
|
||||||
cdef readonly PreshMapArray _cache
|
cdef readonly PreshMapArray _cache
|
||||||
cdef readonly int n_tags
|
cdef readonly int n_tags
|
||||||
|
|
||||||
|
|
|
@ -82,12 +82,9 @@ cdef class Morphology:
|
||||||
self._load_from_tag_map(tag_map)
|
self._load_from_tag_map(tag_map)
|
||||||
|
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
self.exc = {}
|
self._exc = {}
|
||||||
if exc is not None:
|
if exc is not None:
|
||||||
for (tag, orth), attrs in exc.items():
|
self.load_morph_exceptions(exc)
|
||||||
attrs = _normalize_props(attrs)
|
|
||||||
self.add_special_case(
|
|
||||||
self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
|
||||||
|
|
||||||
def _load_from_tag_map(self, tag_map):
|
def _load_from_tag_map(self, tag_map):
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
|
@ -98,7 +95,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||||
self.exc), None, None)
|
self._exc), None, None)
|
||||||
|
|
||||||
def add(self, features):
|
def add(self, features):
|
||||||
"""Insert a morphological analysis in the morphology table, if not
|
"""Insert a morphological analysis in the morphology table, if not
|
||||||
|
@ -208,7 +205,7 @@ cdef class Morphology:
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.add(attrs)
|
self.add(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
|
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||||
|
|
||||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||||
"""Set morphological attributes on a token without a POS tag. Uses
|
"""Set morphological attributes on a token without a POS tag. Uses
|
||||||
|
@ -254,21 +251,34 @@ cdef class Morphology:
|
||||||
token.pos = <univ_pos_t>pos
|
token.pos = <univ_pos_t>pos
|
||||||
token.tag = self.strings[tag_str]
|
token.tag = self.strings[tag_str]
|
||||||
token.morph = self.add(features)
|
token.morph = self.add(features)
|
||||||
if (self.tag_names[tag_id], token.lex.orth) in self.exc:
|
if (self.tag_names[tag_id], token.lex.orth) in self._exc:
|
||||||
self._assign_tag_from_exceptions(token, tag_id)
|
self._assign_tag_from_exceptions(token, tag_id)
|
||||||
|
|
||||||
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
|
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
|
||||||
key = (self.tag_names[tag_id], token.lex.orth)
|
key = (self.tag_names[tag_id], token.lex.orth)
|
||||||
cdef dict attrs
|
cdef dict attrs
|
||||||
attrs = self.exc[key]
|
attrs = self._exc[key]
|
||||||
token.pos = attrs.get(POS, token.pos)
|
token.pos = attrs.get(POS, token.pos)
|
||||||
token.lemma = attrs.get(LEMMA, token.lemma)
|
token.lemma = attrs.get(LEMMA, token.lemma)
|
||||||
|
|
||||||
def load_morph_exceptions(self, dict exc):
|
def load_morph_exceptions(self, dict morph_rules):
|
||||||
|
self._exc = {}
|
||||||
# Map (form, pos) to attributes
|
# Map (form, pos) to attributes
|
||||||
for tag_str, entries in exc.items():
|
for tag, exc in morph_rules.items():
|
||||||
for form_str, attrs in entries.items():
|
for orth, attrs in exc.items():
|
||||||
self.add_special_case(tag_str, form_str, attrs)
|
attrs = _normalize_props(attrs)
|
||||||
|
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def exc(self):
|
||||||
|
# generate the serializable exc in the MORPH_RULES format from the
|
||||||
|
# internal tuple-key format
|
||||||
|
morph_rules = {}
|
||||||
|
for (tag, orth) in sorted(self._exc):
|
||||||
|
if not tag in morph_rules:
|
||||||
|
morph_rules[tag] = {}
|
||||||
|
morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)]
|
||||||
|
return morph_rules
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def feats_to_dict(feats):
|
def feats_to_dict(feats):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user