Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

  Generalize tag map initialization and updating so that the tag map can be loaded correctly prior to loading a `Corpus` with `spacy debug-data` and `spacy train`.

  * normalize provided tag map as necessary
  * use the same method for initializing and updating the tag map

* Replace rather than update tag map

  Replace rather than update tag map when loading a custom tag map. Updating the tag map is problematic due to the sorted list of tag names and the fact that the tag map will contain lingering/unwanted tags from the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

  Reinitialize the cache with the right size after loading a new tag map.
2025-07-31 18:39:49 +03:00 · 2020-07-19 13:13:57 +02:00 · 2020-07-19 13:13:57 +02:00 · 9ee1c54f40
commit 9ee1c54f40
parent b81a89f0a9
4 changed files with 22 additions and 22 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)

    msg.divider("Data file validation")

--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
        )
        nlp.begin_training(lambda: train_examples)

-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        if '_SP' in tag_map:
@@ -74,27 +88,14 @@ cdef class Morphology:
            self.strings.add('_SP')
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.add(attrs)
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)

    def __reduce__(self):
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -27,8 +27,7 @@ def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))