diff --git a/spacy/language.py b/spacy/language.py index 5f35357a4..1cfc8cd8d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -78,9 +78,7 @@ class BaseDefaults: BASE_NORMS, vocab.lookups.get_table("lexeme_norm"), ) - for tag_str, exc in cls.morph_rules.items(): - for orth_str, attrs in exc.items(): - vocab.morphology.add_special_case(tag_str, orth_str, attrs) + vocab.morphology.load_morph_exceptions(cls.morph_rules) return vocab @classmethod diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index c57e3a1db..3dec1bc70 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -20,7 +20,7 @@ cdef class Morphology: cdef readonly object tag_map cdef readonly object tag_names cdef readonly object reverse_index - cdef readonly object exc + cdef readonly object _exc cdef readonly PreshMapArray _cache cdef readonly int n_tags diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a3aa8be22..6ea33c256 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -82,12 +82,9 @@ cdef class Morphology: self._load_from_tag_map(tag_map) self._cache = PreshMapArray(self.n_tags) - self.exc = {} + self._exc = {} if exc is not None: - for (tag, orth), attrs in exc.items(): - attrs = _normalize_props(attrs) - self.add_special_case( - self.strings.as_string(tag), self.strings.as_string(orth), attrs) + self.load_morph_exceptions(exc) def _load_from_tag_map(self, tag_map): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): @@ -98,7 +95,7 @@ cdef class Morphology: def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, - self.exc), None, None) + self._exc), None, None) def add(self, features): """Insert a morphological analysis in the morphology table, if not @@ -208,7 +205,7 @@ cdef class Morphology: attrs = _normalize_props(attrs) self.add(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - self.exc[(tag_str, self.strings.add(orth_str))] = attrs + self._exc[(tag_str, self.strings.add(orth_str))] = attrs cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses @@ -254,21 +251,34 @@ cdef class Morphology: token.pos = pos token.tag = self.strings[tag_str] token.morph = self.add(features) - if (self.tag_names[tag_id], token.lex.orth) in self.exc: + if (self.tag_names[tag_id], token.lex.orth) in self._exc: self._assign_tag_from_exceptions(token, tag_id) cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1: key = (self.tag_names[tag_id], token.lex.orth) cdef dict attrs - attrs = self.exc[key] + attrs = self._exc[key] token.pos = attrs.get(POS, token.pos) token.lemma = attrs.get(LEMMA, token.lemma) - def load_morph_exceptions(self, dict exc): + def load_morph_exceptions(self, dict morph_rules): + self._exc = {} # Map (form, pos) to attributes - for tag_str, entries in exc.items(): - for form_str, attrs in entries.items(): - self.add_special_case(tag_str, form_str, attrs) + for tag, exc in morph_rules.items(): + for orth, attrs in exc.items(): + attrs = _normalize_props(attrs) + self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs) + + @property + def exc(self): + # generate the serializable exc in the MORPH_RULES format from the + # internal tuple-key format + morph_rules = {} + for (tag, orth) in sorted(self._exc): + if not tag in morph_rules: + morph_rules[tag] = {} + morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)] + return morph_rules @staticmethod def feats_to_dict(feats):