Improve tag map initialization and updating (#5768)

* Improve tag map initialization and updating

  Generalize tag map initialization and updating so that a provided tag map can be loaded correctly in the CLI.

  * normalize provided tag map as necessary
  * use the same method for initializing and overwriting the tag map

* Reinitialize cache after loading new tag map

  Reinitialize the cache with the right size after loading a new tag map.
2026-03-05 12:21:27 +03:00 · 2020-07-19 11:13:39 +02:00 · 2020-07-19 11:13:39 +02:00 · 597bcc629e
commit 597bcc629e
parent 7e14272096
3 changed files with 20 additions and 19 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -70,8 +70,8 @@ def debug_data(
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)

    msg.divider("Data format validation")

--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -250,8 +250,8 @@ def train(
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -150,6 +150,19 @@ cdef class Morphology:
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
+        self._feat_map = MorphologyClassMap(FEATURES)
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        if '_SP' in tag_map:
@@ -160,29 +173,17 @@ cdef class Morphology:
            self.strings.add('_SP')
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
        self.reverse_index = {}
-        self._feat_map = MorphologyClassMap(FEATURES)
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.add({self._feat_map.id2feat[feat] for feat in attrs
                      if feat in self._feat_map.id2feat})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)

    def __reduce__(self):
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,