From 9ee1c54f40e901533ef16cd148556cbf83cca6a7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 13:13:57 +0200
Subject: [PATCH] Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that the tag map can
be loaded correctly prior to loading a `Corpus` with `spacy debug-data`
and `spacy train`.

* normalize provided tag map as necessary
* use the same method for initializing and updating the tag map

* Replace rather than update tag map

Replace rather than update tag map when loading a custom tag map.
Updating the tag map is problematic due to the sorted list of tag names
and the fact that the tag map will contain lingering/unwanted tags from
the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
---
 spacy/cli/debug_data.py             |  4 ++--
 spacy/cli/train.py                  |  4 ++--
 spacy/morphology.pyx                | 33 +++++++++++++++++----------------
 spacy/tests/pipeline/test_tagger.py |  3 +--
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 9d1986d8a..49bfa9e82 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data file validation")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index feebc30d4..f69ad5b60 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
         )
         nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0852418f2..dac10137b 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,27 +88,14 @@ cdef class Morphology:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index aedf8e2b3..ec7a15115 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -27,8 +27,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
     tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
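
Usage note (not part of the patch): a minimal sketch of how the new
`Morphology.load_tag_map` call replaces the old `tag_map.update` pattern used
by the CLI scripts above. The tag map values and the commented-out JSON path
below are illustrative assumptions, not taken from the diff.

    import srsly
    from spacy.lang.en import English

    nlp = English()

    # A custom tag map could also be loaded from JSON, as debug-data/train do:
    # custom_tag_map = srsly.read_json("tag_map.json")  # hypothetical path
    custom_tag_map = {"NN": {"pos": "NOUN"}, "VB": {"pos": "VERB"}}

    # Replace (rather than update) the vocab's tag map; this rebuilds
    # tag_names, reverse_index, n_tags and the cache in one step.
    nlp.vocab.morphology.load_tag_map(custom_tag_map)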