From 68fade8f76d6e00d29df807f3090098d5ee37580 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 19 Jul 2020 00:02:29 +0200
Subject: [PATCH 1/4] Add Plausible [ci skip]

---
 website/gatsby-config.js  |  5 +++++
 website/meta/site.json    |  2 ++
 website/package-lock.json | 35 +++++++++++++++++++++++++++++++++++
 website/package.json      |  1 +
 4 files changed, 43 insertions(+)

diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index d08c574c6..752628749 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -19,6 +19,7 @@ const isNightly = !!+process.env.SPACY_NIGHTLY || site.nightlyBranches.includes(
 const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png`
 const binderBranch = isNightly ? 'nightly' : site.binderBranch
 const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl
+const domain = isNightly ? site.domainNightly : site.domain
 
 module.exports = {
     siteMetadata: {
@@ -148,6 +149,10 @@ module.exports = {
                 respectDNT: true,
             },
         },
+        {
+            resolve: `gatsby-plugin-plausible`,
+            options: { domain },
+        },
         {
             resolve: 'gatsby-plugin-robots-txt',
             options: {
diff --git a/website/meta/site.json b/website/meta/site.json
index 5fb1a4533..7e6f4c692 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -3,7 +3,9 @@
     "description": "spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.",
     "slogan": "Industrial-strength Natural Language Processing in Python",
     "siteUrl": "https://spacy.io",
+    "domain": "spacy.io",
     "siteUrlNightly": "https://nightly.spacy.io",
+    "domainNightly": "nightly.spacy.io",
     "nightlyBranches": ["nightly.spacy.io"],
     "email": "contact@explosion.ai",
     "company": "Explosion",
diff --git a/website/package-lock.json b/website/package-lock.json
index 96a10a8af..02bc3a27b 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -12915,6 +12915,41 @@
         }
       }
     },
+    "gatsby-plugin-plausible": {
+      "version": "0.0.6",
+      "resolved": "https://registry.npmjs.org/gatsby-plugin-plausible/-/gatsby-plugin-plausible-0.0.6.tgz",
+      "integrity": "sha512-qUdPQ3haeX2DIywGZ2boMpmFAnSbWzqS9cG9/OO0mWLigA0sDLWwGkpHIAvrfepgbB9U/roLtXflctBwOIxtcQ==",
+      "requires": {
+        "@babel/runtime": "^7.9.2",
+        "minimatch": "3.0.4",
+        "react": "^16.13.1"
+      },
+      "dependencies": {
+        "@babel/runtime": {
+          "version": "7.10.5",
+          "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.10.5.tgz",
+          "integrity": "sha512-otddXKhdNn7d0ptoFRHtMLa8LqDxLYwTjB4nYgM1yy5N6gU/MUf8zqyyLltCH3yAVitBzmwK4us+DD0l/MauAg==",
+          "requires": {
+            "regenerator-runtime": "^0.13.4"
+          }
+        },
+        "react": {
+          "version": "16.13.1",
+          "resolved": "https://registry.npmjs.org/react/-/react-16.13.1.tgz",
+          "integrity": "sha512-YMZQQq32xHLX0bz5Mnibv1/LHb3Sqzngu7xstSM+vrkE5Kzr9xE0yMByK5kMoTK30YVJE61WfbxIFFvfeDKT1w==",
+          "requires": {
+            "loose-envify": "^1.1.0",
+            "object-assign": "^4.1.1",
+            "prop-types": "^15.6.2"
+          }
+        },
+        "regenerator-runtime": {
+          "version": "0.13.5",
+          "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.5.tgz",
+          "integrity": "sha512-ZS5w8CpKFinUzOwW3c83oPeVXoNsrLsaCoLtJvAClH135j/R77RuymhiSErhm2lKcwSCIpmvIWSbDkIfAqKQlA=="
+        }
+      }
+    },
     "gatsby-plugin-react-helmet": {
       "version": "3.0.6",
       "resolved": "https://registry.npmjs.org/gatsby-plugin-react-helmet/-/gatsby-plugin-react-helmet-3.0.6.tgz",
diff --git a/website/package.json b/website/package.json
index 3c76014b3..12702692d 100644
--- a/website/package.json
+++ b/website/package.json
@@ -23,6 +23,7 @@
     "gatsby-plugin-google-analytics": "^2.0.14",
     "gatsby-plugin-manifest": "^2.0.17",
     "gatsby-plugin-offline": "^2.0.24",
+    "gatsby-plugin-plausible": "0.0.6",
     "gatsby-plugin-react-helmet": "^3.0.6",
     "gatsby-plugin-react-svg": "^2.0.0",
     "gatsby-plugin-robots-txt": "^1.5.1",
From 38b59d728d8c77d1a0767b765eea958683a85533 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Sun, 19 Jul 2020 11:10:31 +0200
Subject: [PATCH 2/4] Upgrade of UD eval script (#5776)

* new morph feature format

* add new languages with tokenization

* update with all new pretrained models
---
 bin/ud/run_eval.py | 71 ++++++++++++++++++++++++----------------------
 bin/ud/ud_train.py |  4 ++-
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/bin/ud/run_eval.py b/bin/ud/run_eval.py
index 2da476721..3a30c0ee9 100644
--- a/bin/ud/run_eval.py
+++ b/bin/ud/run_eval.py
@@ -12,11 +12,11 @@ from ud_train import write_conllu
 from spacy.lang.lex_attrs import word_shape
 from spacy.util import get_lang_class
 
-# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
-ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
-                 "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
+# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later)
+ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr,"
+                 "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb,"
                  "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
-                 "tr, tt, uk, ur, vi, zh")
+                 "tr, tt, uk, ur, vi, yo, zh")
 
 # Non-parsing tasks that will be evaluated (works for default models)
 EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
@@ -251,39 +251,42 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
 
     # initialize all models with the multi-lang model
     for lang in languages:
-        models[lang] = [multi] if multi else []
-        # add default models if we don't want to evaluate parsing info
-        if not check_parse:
-            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
-            if lang == 'no':
-                models['no'].append(load_default_model_sentencizer('nb'))
-            else:
-                models[lang].append(load_default_model_sentencizer(lang))
+        UD_lang = lang
+        # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
+        if lang == "nb":
+            UD_lang = "no"
+        try:
+            models[UD_lang] = [multi] if multi else []
+            # add default models if we don't want to evaluate parsing info
+            if not check_parse:
+                models[UD_lang].append(load_default_model_sentencizer(lang))
+        except:
+            print(f"Exception initializing lang {lang} - skipping")
 
     # language-specific trained models
     if not exclude_trained_models:
-        if 'de' in models:
-            models['de'].append(load_model('de_core_news_sm'))
-            models['de'].append(load_model('de_core_news_md'))
-        if 'el' in models:
-            models['el'].append(load_model('el_core_news_sm'))
-            models['el'].append(load_model('el_core_news_md'))
-        if 'en' in models:
-            models['en'].append(load_model('en_core_web_sm'))
-            models['en'].append(load_model('en_core_web_md'))
-            models['en'].append(load_model('en_core_web_lg'))
-        if 'es' in models:
-            models['es'].append(load_model('es_core_news_sm'))
-            models['es'].append(load_model('es_core_news_md'))
-        if 'fr' in models:
-            models['fr'].append(load_model('fr_core_news_sm'))
-            models['fr'].append(load_model('fr_core_news_md'))
-        if 'it' in models:
-            models['it'].append(load_model('it_core_news_sm'))
-        if 'nl' in models:
-            models['nl'].append(load_model('nl_core_news_sm'))
-        if 'pt' in models:
-            models['pt'].append(load_model('pt_core_news_sm'))
+        news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"]
+        web_languages = ["en", "zh"]
+        sizes = ["sm", "md", "lg"]
+        for lang in web_languages:
+            UD_lang = lang
+            for size in sizes:
+                model_name = f'{lang}_core_web_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
+
+        for lang in news_languages:
+            UD_lang = lang
+            if lang == "nb":
+                UD_lang = "no"
+            for size in sizes:
+                model_name = f'{lang}_core_news_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
 
     with out_path.open(mode='w', encoding='utf-8') as out_file:
         run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 88c534d0a..ac5987aa4 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -303,7 +303,9 @@ def get_token_conllu(token, i):
         feat_str = []
         replacements = {"one": "1", "two": "2", "three": "3"}
         for feat in features:
-            if not feat.startswith("begin") and not feat.startswith("end"):
+            if "=" in feat:
+                feat_str.append(feat)
+            elif not feat.startswith("begin") and not feat.startswith("end"):
                 key, value = feat.split("_", 1)
                 value = replacements.get(value, value)
                 feat_str.append("%s=%s" % (key, value.title()))
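Note: the `get_token_conllu` change above is the "new morph feature format" from the commit message - features that already contain "=" (new-style UFEATS) pass through untouched, while legacy "field_value" features are still converted. A minimal standalone sketch of that rule; the `features` list here is hypothetical, real values come from spaCy's morphology table:

```python
# Hypothetical mix of new-style, legacy, and sentence-boundary features
features = ["Case=Nom", "Number_sing", "begin_quote", "end_quote"]

feat_str = []
replacements = {"one": "1", "two": "2", "three": "3"}
for feat in features:
    if "=" in feat:
        # new-style feature, already "Field=Value" - keep as-is
        feat_str.append(feat)
    elif not feat.startswith("begin") and not feat.startswith("end"):
        # legacy "field_value" feature - convert to "Field=Value"
        key, value = feat.split("_", 1)
        value = replacements.get(value, value)
        feat_str.append("%s=%s" % (key, value.title()))

print("|".join(sorted(feat_str)))  # Case=Nom|Number=Sing
```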
From b81a89f0a94ce5a191720ba0eccff43667da6ba9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 11:10:51 +0200
Subject: [PATCH 3/4] Update morphologizer (#5766)

* update `Morphologizer.begin_training` for use with `Example`

* make init and begin_training more consistent

* add `Morphology.normalize_features` to normalize outside of `Morphology.add`

* make sure `get_loss` doesn't create unknown labels when the POS and morph alignments differ
---
 spacy/morphology.pyx                       | 29 ++++++---
 spacy/pipeline/morphologizer.pyx           | 71 ++++++++++++++--------
 spacy/tests/pipeline/test_morphologizer.py | 31 +++++++---
 3 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a3aa8be22..0852418f2 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -58,7 +58,7 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_"
+    EMPTY_MORPH = "_"  # not an empty string so that the PreshMap key is not 0
 
     def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
@@ -117,13 +117,7 @@ cdef class Morphology:
         if not isinstance(features, dict):
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = _normalize_props(features)
         string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-            self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
         # intified ("Field", "Field=Value") pairs
         field_feature_pairs = []
         for field in sorted(string_features):
@@ -137,6 +131,7 @@
         # the hash key for the tag is either the hash of the normalized UFEATS
         # string or the hash of an empty placeholder (using the empty string
         # would give a hash key of 0, which is not good for PreshMap)
+        norm_feats_string = self.normalize_features(features)
         if norm_feats_string:
             tag.key = self.strings.add(norm_feats_string)
         else:
@@ -144,6 +139,26 @@
         self.insert(tag)
         return tag.key
 
+    def normalize_features(self, features):
+        """Create a normalized UFEATS string from a features string or dict.
+
+        features (Union[dict, str]): Features as dict or UFEATS string.
+        RETURNS (str): Features as normalized UFEATS string.
+        """
+        if isinstance(features, str):
+            features = self.feats_to_dict(features)
+        if not isinstance(features, dict):
+            warnings.warn(Warnings.W100.format(feature=features))
+            features = {}
+        features = _normalize_props(features)
+        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
+        # normalized UFEATS string with sorted fields and values
+        norm_feats_string = self.FEATURE_SEP.join(sorted([
+            self.FIELD_SEP.join([field, values])
+            for field, values in string_features.items()
+        ]))
+        return norm_feats_string or self.EMPTY_MORPH
+
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
         """Creates a MorphAnalysisC from a list of intified
         ("Field", "Field=Value") tuples where fields with multiple values have
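Note: `Morphology.normalize_features` is the piece the rest of this patch builds on. A simplified pure-Python sketch of what it produces for string input - the real method also accepts dicts, interns strings in the `StringStore`, and warns on bad input:

```python
FEATURE_SEP, FIELD_SEP, VALUE_SEP, EMPTY_MORPH = "|", "=", ",", "_"

def normalize_features(feats: str) -> str:
    """Simplified: parse a UFEATS string, re-emit with sorted fields/values."""
    fields = {}
    for feat in [f for f in feats.split(FEATURE_SEP) if f]:
        field, values = feat.split(FIELD_SEP, 1)
        # sort multiple values within a field, e.g. "PronType=Rel,Int"
        fields[field] = VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
    norm = FEATURE_SEP.join(
        sorted(FIELD_SEP.join([f, v]) for f, v in fields.items())
    )
    # never return "" - an empty string would give a PreshMap hash key of 0
    return norm or EMPTY_MORPH

assert normalize_features("Number=Sing|Case=Nom") == "Case=Nom|Number=Sing"
assert normalize_features("PronType=Rel,Int") == "PronType=Int,Rel"
assert normalize_features("") == "_"
```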
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 57b778434..bc77dda47 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -23,29 +23,45 @@ from .defaults import default_morphologizer
 @component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
 class Morphologizer(Tagger):
 
+    POS_FEAT = "POS"
+
     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
         self._rehearsal_model = None
         self.cfg = dict(sorted(cfg.items()))
-        self.cfg.setdefault("labels", {})
-        self.cfg.setdefault("morph_pos", {})
+        # to be able to set annotations without string operations on labels,
+        # store mappings from morph+POS labels to token-level annotations:
+        # 1) labels_morph stores a mapping from morph+POS->morph
+        self.cfg.setdefault("labels_morph", {})
+        # 2) labels_pos stores a mapping from morph+POS->POS
+        self.cfg.setdefault("labels_pos", {})
+        # add mappings for empty morph
+        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
+        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
 
     @property
     def labels(self):
-        return tuple(self.cfg["labels"].keys())
+        return tuple(self.cfg["labels_morph"].keys())
 
     def add_label(self, label):
         if not isinstance(label, str):
             raise ValueError(Errors.E187)
         if label in self.labels:
             return 0
-        morph = Morphology.feats_to_dict(label)
-        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-        pos = morph.get("POS", "")
-        if norm_morph_pos not in self.cfg["labels"]:
-            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
-            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        # normalize label
+        norm_label = self.vocab.morphology.normalize_features(label)
+        # extract separate POS and morph tags
+        label_dict = Morphology.feats_to_dict(label)
+        pos = label_dict.get(self.POS_FEAT, "")
+        if self.POS_FEAT in label_dict:
+            label_dict.pop(self.POS_FEAT)
+        # normalize morph string and add to morphology table
+        norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+        # add label mappings
+        if norm_label not in self.cfg["labels_morph"]:
+            self.cfg["labels_morph"][norm_label] = norm_morph
+            self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@@ -53,14 +69,16 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph
-                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                morph = token.morph_
+                # create and add the combined morph+POS label
+                morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    morph["POS"] = pos
-                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-                if norm_morph_pos not in self.cfg["labels"]:
-                    self.cfg["labels"][norm_morph_pos] = norm_morph
-                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+                    morph_dict[self.POS_FEAT] = pos
+                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                # add label->morph and label->POS mappings
+                if norm_label not in self.cfg["labels_morph"]:
+                    self.cfg["labels_morph"][norm_label] = morph
+                    self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         self.set_output(len(self.labels))
         self.model.initialize()
         link_vectors_to_models(self.vocab)
@@ -79,8 +97,8 @@ class Morphologizer(Tagger):
             doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
-                doc.c[j].pos = self.cfg["morph_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
+                doc.c[j].pos = self.cfg["labels_pos"][morph]
 
         doc.is_morphed = True
 
@@ -94,14 +112,17 @@ class Morphologizer(Tagger):
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]
-                feats = Morphology.feats_to_dict(morph)
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is None, treat both as None here so that
+                # truths doesn't end up with an unknown morph+POS combination
+                if pos is None or morph is None:
+                    pos = None
+                    morph = None
+                label_dict = Morphology.feats_to_dict(morph)
                 if pos:
-                    feats["POS"] = pos
-                if len(feats) > 0:
-                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
-                if morph == "":
-                    morph = Morphology.EMPTY_MORPH
-                eg_truths.append(morph)
+                    label_dict[self.POS_FEAT] = pos
+                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
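Note: the core idea of the `labels_morph`/`labels_pos` split is that each combined morph+POS label maps to the two token-level annotations it sets, so `set_annotations` never has to parse label strings. A sketch of that bookkeeping with plain dicts - no `StringStore`, and `POS_IDS` here is an illustrative subset, not spaCy's real table:

```python
POS_FEAT = "POS"
POS_IDS = {"": 0, "NOUN": 92}  # illustrative ids only

def add_label(label: str, labels_morph: dict, labels_pos: dict) -> None:
    # split "Field=Value" features out of the combined label
    feats = dict(f.split("=", 1) for f in label.split("|") if f)
    pos = feats.pop(POS_FEAT, "")
    # the morph part (without POS) is what gets written to token.morph,
    # the POS id is what gets written to token.pos
    morph = "|".join(sorted(f"{k}={v}" for k, v in feats.items())) or "_"
    labels_morph[label] = morph
    labels_pos[label] = POS_IDS[pos]

labels_morph, labels_pos = {}, {}
add_label("Case=Nom|POS=NOUN", labels_morph, labels_pos)
print(labels_morph)  # {'Case=Nom|POS=NOUN': 'Case=Nom'}
print(labels_pos)    # {'Case=Nom|POS=NOUN': 92}
```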
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 9b7e2788d..757c9214c 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -5,6 +5,7 @@ from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
+from spacy.morphology import Morphology
 
 
 def test_label_types():
@@ -23,9 +24,10 @@ TRAIN_DATA = [
             "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
         },
     ),
+    # test combinations of morph+POS
     (
         "Eat blue ham",
-        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+        {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},
     ),
 ]
 
@@ -38,7 +40,12 @@ def test_overfitting_IO():
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
         for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
-            morphologizer.add_label(morph + "|POS=" + pos)
+            if morph and pos:
+                morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos)
+            elif pos:
+                morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
+            elif morph:
+                morphologizer.add_label(morph)
     nlp.add_pipe(morphologizer)
     optimizer = nlp.begin_training()
 
@@ -48,19 +55,27 @@ def test_overfitting_IO():
     assert losses["morphologizer"] < 0.00001
 
     # test the trained model
-    test_text = "I like blue eggs"
+    test_text = "I like blue ham"
     doc = nlp(test_text)
     gold_morphs = [
-        "Feat=N|POS=NOUN",
-        "Feat=V|POS=VERB",
-        "Feat=J|POS=ADJ",
-        "Feat=N|POS=NOUN",
+        "Feat=N",
+        "Feat=V",
+        "",
+        "",
+    ]
+    gold_pos_tags = [
+        "NOUN",
+        "VERB",
+        "ADJ",
+        "",
     ]
     assert [t.morph_ for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert gold_morphs == [t.morph_ for t in doc2]
+        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [t.pos_ for t in doc2] == gold_pos_tags
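Note: with this patch applied, morph features and POS are predicted as one combined label but written back to the token separately, which is what the updated test asserts. A minimal usage sketch, assuming the trained `nlp` pipeline built in the test above:

```python
# assumes `nlp` is the pipeline trained in test_overfitting_IO above
doc = nlp("Eat blue ham")
for token in doc:
    # a token may now have a POS with no morph features, or vice versa
    print(token.text, repr(token.morph_), repr(token.pos_))
```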
From 9ee1c54f40e901533ef16cd148556cbf83cca6a7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 19 Jul 2020 13:13:57 +0200
Subject: [PATCH 4/4] Improve tag map initialization and updating (#5764)

* Improve tag map initialization and updating

Generalize tag map initialization and updating so that the tag map can
be loaded correctly prior to loading a `Corpus` with `spacy debug-data`
and `spacy train`.

* normalize provided tag map as necessary

* use the same method for initializing and updating the tag map

* Replace rather than update tag map

Replace rather than update tag map when loading a custom tag map.
Updating the tag map is problematic due to the sorted list of tag names
and the fact that the tag map will contain lingering/unwanted tags from
the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

Reinitialize the cache with the right size after loading a new tag map.
---
 spacy/cli/debug_data.py             |  4 ++--
 spacy/cli/train.py                  |  4 ++--
 spacy/morphology.pyx                | 33 +++++++++++++++--------------
 spacy/tests/pipeline/test_tagger.py |  3 +--
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 9d1986d8a..49bfa9e82 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data file validation")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index feebc30d4..f69ad5b60 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
         )
     nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
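Note: the reason the CLI scripts switch from `tag_map.update(...)` to `load_tag_map(...)` is the replace-vs-update semantics named in the commit message. Sketched with plain dicts (illustrative tag maps, not spaCy's real defaults):

```python
# illustrative tag maps only
default_tag_map = {"NN": {"pos": "NOUN"}, "VBZ": {"pos": "VERB"}}
custom_tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}

updated = dict(default_tag_map)
updated.update(custom_tag_map)
print(sorted(updated))   # ['N', 'NN', 'V', 'VBZ'] - stale default tags linger

replaced = dict(custom_tag_map)
print(sorted(replaced))  # ['N', 'V'] - only the provided tags remain
```

In spaCy itself, `load_tag_map` additionally ensures the special `_SP` space tag is present and rebuilds `tag_names`, `n_tags`, and the cache, as the morphology.pyx diff below shows.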
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 0852418f2..dac10137b 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,27 +88,14 @@
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index aedf8e2b3..ec7a15115 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -27,8 +27,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
     tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
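Note: a minimal usage sketch of the new API at this commit, mirroring what the CLI scripts and the tagger test now do. The tag map path is hypothetical:

```python
import srsly
from spacy.lang.en import English

nlp = English()
# "my_tag_map.json" is a hypothetical path to a custom tag map
tag_map = srsly.read_json("my_tag_map.json")
# replaces the old tag map entirely and reinitializes the cache
nlp.vocab.morphology.load_tag_map(tag_map)
print(nlp.vocab.morphology.tag_names)  # custom tags (plus the '_SP' space tag)
```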