From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 11:54:48 +0200 Subject: [PATCH 01/29] Also rename to include_static_vectors in CharEmbed --- spacy/ml/models/tok2vec.py | 6 +++--- spacy/pipeline/morphologizer.pyx | 2 +- spacy/tests/pipeline/test_tok2vec.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..6ef7b2325 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -177,7 +177,7 @@ def CharacterEmbed( rows: int, nM: int, nC: int, - also_use_static_vectors: bool, + include_static_vectors: bool, feature: Union[int, str] = "LOWER", ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedded representation based on character embeddings, using @@ -204,13 +204,13 @@ def CharacterEmbed( nC (int): The number of UTF-8 bytes to embed per word. Recommended values are between 3 and 8, although it may depend on the length of words in the language. - also_use_static_vectors (bool): Whether to also use static word vectors. + include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ feature = intify_attr(feature) if feature is None: raise ValueError(Errors.E911(feat=feature)) - if also_use_static_vectors: + if include_static_vectors: model = chain( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a456b7a0f..00188a762 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,7 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 -also_use_static_vectors = false +include_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 90882ae3f..ec4ed17dd 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on From 18dfb279850adb00c3b3efa18bbb6d58c17bc453 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:05:33 +0200 Subject: [PATCH 02/29] Add custom error when evaluation throws a KeyError --- spacy/errors.py | 3 +++ spacy/training/loop.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2bc2f3e20..06653edcf 100644 --- a/spacy/errors.py +++ 
b/spacy/errors.py @@ -456,6 +456,9 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " + "frozen components, make sure they were already trained and initialized. " + "You can also consider moving them to the 'disabled' list instead.") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 242113cc6..8e688a27d 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -249,7 +249,10 @@ def create_evaluation_callback( def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) - scores = nlp.evaluate(dev_examples) + try: + scores = nlp.evaluate(dev_examples) + except KeyError as e: + raise KeyError(Errors.E900) from e # Calculate a weighted sum based on score_weights for the main score. # We can only consider scores that are ints/floats, not dicts like # entity scores per type etc. From 8316bc7d4a6dbd989d53f97a8c7a06758c8d356c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:06:20 +0200 Subject: [PATCH 03/29] bugfix DisabledPipes --- spacy/language.py | 3 +++ spacy/tests/pipeline/test_pipe_methods.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 1fb559657..24e593043 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1034,6 +1034,9 @@ class Language: ) ) disable = to_disable + # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude + # those pipes that were already disabled. 
+ disable = [d for d in disable if d not in self._disabled] return DisabledPipes(self, disable) def make_doc(self, text: str) -> Doc: diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index c693a7487..cd18b0159 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name): @pytest.mark.parametrize("name", ["my_component"]) def test_disable_pipes_context(nlp, name): + """Test that an enabled component stays enabled after running the context manager.""" nlp.add_pipe("new_pipe", name=name) assert nlp.has_pipe(name) with nlp.select_pipes(disable=name): @@ -136,6 +137,19 @@ def test_disable_pipes_context(nlp, name): assert nlp.has_pipe(name) +@pytest.mark.parametrize("name", ["my_component"]) +def test_disable_pipes_context_restore(nlp, name): + """Test that a disabled component stays disabled after running the context manager.""" + nlp.add_pipe("new_pipe", name=name) + assert nlp.has_pipe(name) + nlp.disable_pipes(name) + assert not nlp.has_pipe(name) + with nlp.select_pipes(disable=name): + assert not nlp.has_pipe(name) + assert not nlp.has_pipe(name) + + + def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 2cafba5f50d83a93582bddea6bd1f569f98207f7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:17:35 +0200 Subject: [PATCH 04/29] shorten error message for clarity --- spacy/errors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 06653edcf..3ab9661e0 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -457,8 +457,7 @@ class Errors: # TODO: fix numbering after merging develop into master E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already trained and initialized. " - "You can also consider moving them to the 'disabled' list instead.") + "frozen components, make sure they were already trained and initialized. ") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " From 06b9d213fd91397896a24dcf5fa4f90950570e9d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 12:19:47 +0200 Subject: [PATCH 05/29] formatting --- spacy/errors.py | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 3ab9661e0..0932ba0fd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -457,7 +457,7 @@ class Errors: # TODO: fix numbering after merging develop into master E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already trained and initialized. ") + "frozen components, make sure they were already initialized and trained. ") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. 
To " diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index cd18b0159..b744aed98 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -149,7 +149,6 @@ def test_disable_pipes_context_restore(nlp, name): assert not nlp.has_pipe(name) - def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 853edace37af044e21b0631d8d35ede18d16a482 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 14:11:06 +0200 Subject: [PATCH 06/29] fix MultiHashEmbed example in documentation --- spacy/ml/models/tok2vec.py | 2 +- website/docs/usage/embeddings-transformers.md | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..1a78cf75e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -110,7 +110,7 @@ def MultiHashEmbed( The features used can be configured with the 'attrs' argument. The suggested attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into - account some subword information, without construction a fully character-based + account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in the representation as well, with the vectors table will be kept static (i.e. it's not updated). diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 73540b3d3..856685dad 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -516,16 +516,14 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is -also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) -layer, which builds the default token-to-vector encoding architecture. +word vector tables using the `include_static_vectors` flag. ```ini [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [7000, 3500, 3500, 3500] also_use_static_vectors = true ``` From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:07 +0200 Subject: [PATCH 07/29] Update docs --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 549c3bcc4..942fc4e7b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is +word vector tables using the `include_static_vectors` flag. 
This setting is also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) layer, which builds the default token-to-vector encoding architecture. @@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture. [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true +attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = true ``` From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:51 +0200 Subject: [PATCH 08/29] Remove Span._recalculate_indices Remove `Span._recalculate_indices`, which is a remnant from the deprecated `Span.merge`. --- spacy/tests/doc/test_doc_api.py | 9 +++------ spacy/tests/doc/test_retokenize_merge.py | 1 + spacy/tokens/span.pxd | 1 - spacy/tokens/span.pyx | 17 ----------------- 4 files changed, 4 insertions(+), 24 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ea832c136..db8a6d1c4 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -608,14 +608,11 @@ def test_doc_init_iob(): doc = Doc(Vocab(), words=words, ents=ents) -@pytest.mark.xfail -def test_doc_set_ents_spans(en_tokenizer): +def test_doc_set_ents_invalid_spans(en_tokenizer): doc = en_tokenizer("Some text about Colombia and the Czech Republic") spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")] with doc.retokenize() as retokenizer: for span in spans: retokenizer.merge(span) - # If this line is uncommented, it works: - # print(spans) - doc.ents = spans - assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"] + with pytest.raises(IndexError): + doc.ents = spans diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index cb886545a..b483255c8 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): attrs = {"lemma": "none", "ent_type": "none"} retokenizer.merge(doc[0:2], attrs=attrs) retokenizer.merge(doc[-2:], attrs=attrs) + sent1, sent2 = list(doc.sents) assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index f6f88a23e..cc6b908bb 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -16,5 +16,4 @@ cdef class Span: cdef public _vector cdef public _vector_norm - cpdef int _recalculate_indices(self) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 64c3c7df0..491ba0266 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -150,7 +150,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#len """ - self._recalculate_indices() if self.end < self.start: return 0 return self.end - self.start @@ -167,7 +166,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#getitem """ - self._recalculate_indices() if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self.doc, start + self.start, end + self.start) @@ -188,7 +186,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#iter """ - self._recalculate_indices() for i in range(self.start, self.end): yield self.doc[i] @@ -339,19 +336,6 @@ cdef class Span: output[i-self.start, j] = 
get_token_attr(&self.doc.c[i], feature) return output - cpdef int _recalculate_indices(self) except -1: - if self.end > self.doc.length \ - or self.doc.c[self.start].idx != self.start_char \ - or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: - start = token_by_start(self.doc.c, self.doc.length, self.start_char) - if self.start == -1: - raise IndexError(Errors.E036.format(start=self.start_char)) - end = token_by_end(self.doc.c, self.doc.length, self.end_char) - if end == -1: - raise IndexError(Errors.E037.format(end=self.end_char)) - self.start = start - self.end = end + 1 - @property def vocab(self): """RETURNS (Vocab): The Span's Doc's vocab.""" @@ -520,7 +504,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#root """ - self._recalculate_indices() if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called From 040c7c054125d32da2af9c73f604b811e6ae0d97 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 15:40:58 +0200 Subject: [PATCH 09/29] fix get_dim calls in build_simple_cnn_text_classifier --- spacy/ml/models/textcat.py | 4 ++-- spacy/util.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 1117b4fde..ec8998e2d 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier( """ with Model.define_operators({">>": chain}): if exclusive_classes: - output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO")) + output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer model.set_ref("output_layer", output_layer) else: - linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO")) + linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) model = ( tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() ) diff --git a/spacy/util.py b/spacy/util.py index 3d567a425..47fbcce1c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]: if not path.parent.exists(): raise IOError(Errors.E052.format(path=path.parent)) if not path.exists() or not path.is_file(): - raise IOError(Errors.E053.format(path=path, name="meta.json")) + raise IOError(Errors.E053.format(path=path.parent, name="meta.json")) meta = srsly.read_json(path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: From e972ecba727a35d59080dc0e217faa02044abb4e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 9 Oct 2020 16:03:14 +0200 Subject: [PATCH 10/29] add utf8 encoding for opening file --- spacy/cli/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e4559929e..8413c639b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path): if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": - with input_path.open() as file_: + with input_path.open(encoding="utf8") as file_: input_data = file_.read() converter_autodetect = autodetect_ner_format(input_data) if converter_autodetect == "ner": From 97ff090e495208a5944561e210c76ef77e93eab3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 16:03:57 +0200 Subject: [PATCH 11/29] Fix docs example [ci skip] --- 
website/docs/usage/processing-pipelines.md | 54 +++++++++-------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index fdae6d3e5..83134962b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling
 
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
doc.ents = list(doc.ents) + spans return doc # don't forget to return the Doc! - def has_country(self, tokens): - """Getter for Doc and Span attributes. Since the getter is only called - when we access the attribute, we can refer to the Token's 'is_country' + def has_country(self, doc): + """Getter for Doc attributes. Since the getter is only called + when we access the attribute, we can refer to the Span's 'is_country' attribute here, which is already set in the processing step.""" - return any([t._.get("is_country") for t in tokens]) + return any([entity._.get("is_country") for entity in doc.ents]) nlp = English() nlp.add_pipe("rest_countries", config={"label": "GPE"}) doc = nlp("Some text about Colombia and the Czech Republic") print("Pipeline", nlp.pipe_names) # pipeline contains component name print("Doc has countries", doc._.has_country) # Doc contains countries -for token in doc: - if token._.is_country: - print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag) -print("Entities", [(e.text, e.label_) for e in doc.ents]) +for ent in doc.ents: + if ent._.is_country: + print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag) ``` In this case, all data can be fetched on initialization in one request. However, From 8ac5f222531dcb602d08118693618598bc0c045d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:16 +0200 Subject: [PATCH 12/29] Adjust error message --- spacy/errors.py | 5 +++-- spacy/training/loop.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0932ba0fd..be327a784 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,8 +456,9 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master - E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified " - "frozen components, make sure they were already initialized and trained. ") + E900 = ("Could not run the full pipeline for evaluation. If you specified " + "frozen components, make sure they were already initialized and " + "trained. Full pipeline: {pipeline}") E901 = ("Failed to remove existing output directory: {path}. If your " "config and the components you train change between runs, a " "non-empty output directory can lead to stale pipeline data. To " diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 8e688a27d..c3fa83b39 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -252,7 +252,7 @@ def create_evaluation_callback( try: scores = nlp.evaluate(dev_examples) except KeyError as e: - raise KeyError(Errors.E900) from e + raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e # Calculate a weighted sum based on score_weights for the main score. # We can only consider scores that are ints/floats, not dicts like # entity scores per type etc. 
From 525f7988416f9d944f5993a793f999f91e8685f8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:21 +0200 Subject: [PATCH 13/29] Fix typo in test --- spacy/tests/pipeline/test_pipe_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index b744aed98..6a21ddfaa 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -142,7 +142,7 @@ def test_disable_pipes_context_restore(nlp, name): """Test that a disabled component stays disabled after running the context manager.""" nlp.add_pipe("new_pipe", name=name) assert nlp.has_pipe(name) - nlp.disable_pipes(name) + nlp.disable_pipe(name) assert not nlp.has_pipe(name) with nlp.select_pipes(disable=name): assert not nlp.has_pipe(name) From 796f8b9424737b51da81d35fe33b8383f1d5bdf7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 9 Oct 2020 18:00:27 +0200 Subject: [PATCH 14/29] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 095d726a0..763faa3eb 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a36" +__version__ = "3.0.0a37" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bfa3931c9dc9f1ab960d81c985ddaf4bb4a4d023 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 18:55:07 +0200 Subject: [PATCH 15/29] Revert added_strings change (#6236) --- spacy/about.py | 2 +- spacy/errors.py | 8 ++++ spacy/kb.pxd | 1 - spacy/kb.pyx | 15 ++---- spacy/pipeline/attributeruler.py | 17 +++---- spacy/pipeline/entity_linker.py | 3 +- spacy/pipeline/lemmatizer.py | 4 ++ spacy/pipeline/morphologizer.pyx | 5 -- spacy/pipeline/senter.pyx | 1 - spacy/pipeline/tagger.pyx | 3 +- spacy/pipeline/textcat.py | 3 +- spacy/pipeline/tok2vec.py | 1 - spacy/pipeline/trainable_pipe.pxd | 1 - spacy/pipeline/trainable_pipe.pyx | 36 +++++++++------ spacy/pipeline/transition_parser.pyx | 27 ++++++----- spacy/tests/pipeline/test_entity_linker.py | 19 ++------ spacy/tests/pipeline/test_morphologizer.py | 1 - spacy/tests/pipeline/test_senter.py | 1 - spacy/tests/pipeline/test_tagger.py | 2 - spacy/tests/pipeline/test_textcat.py | 2 - spacy/tests/regression/test_issue5230.py | 4 +- .../serialize/test_serialize_pipeline.py | 46 ++++++++++++++++--- spacy/util.py | 2 +- 23 files changed, 110 insertions(+), 94 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 763faa3eb..efdfd26c0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a37" +__version__ = "3.0.0a38" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/errors.py b/spacy/errors.py index be327a784..5fab0bab1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,6 +456,14 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " + "is not set or 
None. If you've implemented a custom component, make " + "sure to store the component model as `self.model` in your " + "component's __init__ method.") + E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute " + "is not set or None. If you've implemented a custom component, make " + "sure to store the current `nlp` object's vocab as `self.vocab` in " + "your component's __init__ method.") E900 = ("Could not run the full pipeline for evaluation. If you specified " "frozen components, make sure they were already initialized and " "trained. Full pipeline: {pipeline}") diff --git a/spacy/kb.pxd b/spacy/kb.pxd index d61bd43fa..4a71b26a2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -30,7 +30,6 @@ cdef class KnowledgeBase: cdef Pool mem cpdef readonly Vocab vocab cdef int64_t entity_vector_length - cdef public set _added_strings # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _KBEntryC struct in the _entries vector). diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 478579d71..10aa377eb 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -92,7 +92,6 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.vocab = vocab self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) - self._added_strings = set() @property def entity_vector_length(self): @@ -114,16 +113,12 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_string(self, string: str): - self._added_strings.add(string) - return self.vocab.strings.add(string) - def add_entity(self, unicode entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
""" - cdef hash_t entity_hash = self.add_string(entity) + cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before if entity_hash in self._entry_index: @@ -157,7 +152,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash while i < len(entity_list): # only process this entity if its unique ID hadn't been added before - entity_hash = self.add_string(entity_list[i]) + entity_hash = self.vocab.strings.add(entity_list[i]) if entity_hash in self._entry_index: warnings.warn(Warnings.W018.format(entity=entity_list[i])) @@ -203,7 +198,7 @@ cdef class KnowledgeBase: if prob_sum > 1.00001: raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) - cdef hash_t alias_hash = self.add_string(alias) + cdef hash_t alias_hash = self.vocab.strings.add(alias) # Check whether this alias was added before if alias_hash in self._alias_index: @@ -332,7 +327,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E928.format(loc=path)) serialize = {} serialize["contents"] = lambda p: self.write_contents(p) - serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): @@ -343,7 +338,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E928.format(loc=path)) deserialize = {} deserialize["contents"] = lambda p: self.read_contents(p) - deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] + deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p) util.from_disk(path, deserialize, exclude) def write_contents(self, file_path): diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 7a6a1de5b..e17d3be98 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator +from typing import List, Dict, Union, Iterable, Any, Optional, Callable from typing import Tuple import srsly from pathlib import Path @@ -57,7 +57,6 @@ class AttributeRuler(Pipe): self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] - self._added_strings = set() def clear(self) -> None: """Reset all patterns.""" @@ -187,16 +186,12 @@ class AttributeRuler(Pipe): # We need to make a string here, because otherwise the ID we pass back # will be interpreted as the hash of a string, rather than an ordinal. key = str(len(self.attrs)) - self.matcher.add(self.add_string(key), patterns) + self.matcher.add(self.vocab.strings.add(key), patterns) self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) self.attrs.append(attrs) self.indices.append(index) - def add_string(self, string: str): - self._added_strings.add(string) - return self.vocab.strings.add(string) - def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None: """Add patterns from a list of pattern dicts with the keys as the arguments to AttributeRuler.add. 
@@ -256,8 +251,8 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes """ serialize = {} + serialize["vocab"] = self.vocab.to_bytes serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) - serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) return util.to_bytes(serialize, exclude) def from_bytes( @@ -276,7 +271,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.msgpack_loads(b)) deserialize = { - "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], + "vocab": lambda b: self.vocab.from_bytes(b), "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) @@ -293,7 +288,7 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_disk """ serialize = { - "strings.json": lambda p: srsly.write_json(p, self._added_strings), + "vocab": lambda p: self.vocab.to_disk(p), "patterns": lambda p: srsly.write_msgpack(p, self.patterns), } util.to_disk(path, serialize, exclude) @@ -314,7 +309,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.read_msgpack(p)) deserialize = { - "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)], + "vocab": lambda p: self.vocab.from_disk(p), "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 881e98785..3bb449b4d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe): DOCS: https://nightly.spacy.io/api/entitylinker#to_disk """ serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) @@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe): deserialize["kb"] = lambda p: self.kb.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) - for s in self.kb._added_strings: - self.vocab.strings.add(s) return self def rehearse(self, examples, *, sgd=None, losses=None, **config): diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 7f5370753..9be596868 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -281,6 +281,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk """ serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["lookups"] = lambda p: self.lookups.to_disk(p) util.to_disk(path, serialize, exclude) @@ -296,6 +297,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk """ deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["lookups"] = lambda p: self.lookups.from_disk(p) util.from_disk(path, deserialize, exclude) self._validate_tables() @@ -310,6 +312,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes """ serialize = {} + serialize["vocab"] = self.vocab.to_bytes serialize["lookups"] = self.lookups.to_bytes return util.to_bytes(serialize, exclude) @@ -325,6 +328,7 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes """ deserialize = {} + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) util.from_bytes(bytes_data, deserialize, exclude) self._validate_tables() diff --git a/spacy/pipeline/morphologizer.pyx 
b/spacy/pipeline/morphologizer.pyx index 00188a762..ac111f28b 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -95,7 +95,6 @@ class Morphologizer(Tagger): # add mappings for empty morph self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] - self._added_strings = set() @property def labels(self): @@ -129,7 +128,6 @@ class Morphologizer(Tagger): label_dict.pop(self.POS_FEAT) # normalize morph string and add to morphology table norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)] - self.add_string(norm_morph) # add label mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = norm_morph @@ -161,7 +159,6 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] - self.add_string(norm_label) # add label->morph and label->POS mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = morph @@ -179,7 +176,6 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] - self.add_string(norm_label) gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels]) doc_sample.append(example.x) label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) @@ -238,7 +234,6 @@ class Morphologizer(Tagger): if pos: label_dict[self.POS_FEAT] = pos label = self.vocab.strings[self.vocab.morphology.add(label_dict)] - self.add_string(label) eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 8ea4ed1b3..15a21902a 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger): self.name = name self._rehearsal_model = None self.cfg = {} - self._added_strings = set() @property def labels(self): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 535b71270..1b0f79cea 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -78,7 +78,6 @@ class Tagger(TrainablePipe): self._rehearsal_model = None cfg = {"labels": labels or []} self.cfg = dict(sorted(cfg.items())) - self._added_strings = set() @property def labels(self): @@ -313,7 +312,7 @@ class Tagger(TrainablePipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) - self.add_string(label) + self.vocab.strings.add(label) return 1 def score(self, examples, **kwargs): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e57954184..5ebe0e104 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) - self._added_strings = set() @property def labels(self) -> Tuple[str]: @@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) - self.add_string(label) + self.vocab.strings.add(label) return 1 def initialize( diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index b4625291b..0ad875035 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe): self.name = name self.listeners = [] self.cfg = {} - 
self._added_strings = set()
 
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index 8df5cb775..d5cdbb511 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
-    cdef public set _added_strings
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 07a308953..88e50e7c6 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -13,6 +13,7 @@ from ..vocab import Vocab
 from ..language import Language
 from ..training import Example
 
+
 cdef class TrainablePipe(Pipe):
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
@@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
         self.model = model
         self.name = name
         self.cfg = dict(cfg)
-        self._added_strings = set()
 
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
@@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
         """
        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     @property
     def is_trainable(self) -> bool:
         return True
@@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
         """
         self.model.finish_update(sgd)
 
+    def _validate_serialization_attrs(self):
+        """Check that the pipe implements the required attributes. If a subclass
+        implements a custom __init__ method but doesn't set these attributes,
+        they currently default to None, so we need to perform additional checks.
+        """
+        if not hasattr(self, "vocab") or self.vocab is None:
+            raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
+        if not hasattr(self, "model") or self.model is None:
+            raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
+
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.
@@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#to_bytes """ + self._validate_serialization_attrs() serialize = {} - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = self.vocab.to_bytes serialize["model"] = self.model.to_bytes - serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, *, exclude=tuple()): @@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#from_bytes """ + self._validate_serialization_attrs() def load_model(b): try: @@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe): raise ValueError(Errors.E149) from None deserialize = {} - deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)] - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["model"] = load_model util.from_bytes(bytes_data, deserialize, exclude) return self @@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#to_disk """ + self._validate_serialization_attrs() serialize = {} - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) @@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe): DOCS: https://nightly.spacy.io/api/pipe#from_disk """ + self._validate_serialization_attrs() def load_model(p): try: @@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe): raise ValueError(Errors.E149) from None deserialize = {} - deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] - if hasattr(self, "cfg"): + if hasattr(self, "cfg") and self.cfg is not None: deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 3743e1018..63a8595cc 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe): self.add_multitask_objective(multitask) self._rehearsal_model = None - self._added_strings = set() def __getnewargs_ex__(self): """This allows pickling the Parser and its keyword-only init arguments""" @@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe): resized = True if resized: self._resize() - self.add_string(label) + self.vocab.strings.add(label) return 1 return 0 @@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe): def to_disk(self, path, exclude=tuple()): serializers = { - 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), - 'strings.json': lambda p: srsly.write_json(p, self._added_strings), - 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), - 'cfg': lambda p: srsly.write_json(p, self.cfg) + "model": lambda p: (self.model.to_disk(p) if self.model is not True else True), + "vocab": lambda p: 
self.vocab.to_disk(p), + "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]), + "cfg": lambda p: srsly.write_json(p, self.cfg) } util.to_disk(path, serializers, exclude) def from_disk(self, path, exclude=tuple()): deserializers = { - 'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)], - 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), - 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), - 'model': lambda p: None, + "vocab": lambda p: self.vocab.from_disk(p), + "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]), + "cfg": lambda p: self.cfg.update(srsly.read_json(p)), + "model": lambda p: None, } util.from_disk(path, deserializers, exclude) - if 'model' not in exclude: + if "model" not in exclude: path = util.ensure_path(path) - with (path / 'model').open('rb') as file_: + with (path / "model").open("rb") as file_: bytes_data = file_.read() try: self._resize() @@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe): def to_bytes(self, exclude=tuple()): serializers = { "model": lambda: (self.model.to_bytes()), - "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)), + "vocab": lambda: self.vocab.to_bytes(), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) } @@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe): def from_bytes(self, bytes_data, exclude=tuple()): deserializers = { - "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], + "vocab": lambda b: self.vocab.from_bytes(b), "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: None, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 71496327b..ff2e33fc7 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -121,9 +121,7 @@ def test_kb_default(nlp): def test_kb_custom_length(nlp): """Test that the default (empty) KB can be configured with a custom entity length""" - entity_linker = nlp.add_pipe( - "entity_linker", config={"entity_vector_length": 35} - ) + entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35}) assert len(entity_linker.kb) == 0 assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 @@ -213,16 +211,11 @@ def test_el_pipe_configuration(nlp): kb = KnowledgeBase(vocab, entity_vector_length=1) kb.add_entity(entity="Q2", freq=12, entity_vector=[2]) kb.add_entity(entity="Q3", freq=5, entity_vector=[3]) - kb.add_alias( - alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1] - ) + kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) return kb # run an EL pipe without a trained context encoder, to check the candidate generation step only - entity_linker = nlp.add_pipe( - "entity_linker", - config={"incl_context": False}, - ) + entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},) entity_linker.set_kb(create_kb) # With the default get_candidates function, matching is case-sensitive text = "Douglas and douglas are not the same." 
@@ -453,14 +446,10 @@ def test_overfitting_IO(): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe( - "entity_linker", - last=True, - ) + entity_linker = nlp.add_pipe("entity_linker", last=True,) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings - assert "Q2146908" in entity_linker.kb._added_strings # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index ce9c0fa54..fd7aa05be 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -101,4 +101,3 @@ def test_overfitting_IO(): doc2 = nlp2(test_text) assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags - assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 472216512..c9722e5de 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -80,4 +80,3 @@ def test_overfitting_IO(): nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts - assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 590c22233..b9db76cdf 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -98,7 +98,6 @@ def test_overfitting_IO(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["tagger"] < 0.00001 - assert tagger._added_strings == {"J", "N", "V"} # test the trained model test_text = "I like blue eggs" @@ -117,7 +116,6 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" - assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"} def test_tagger_requires_labels(): diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 7eb7ff658..dd2f1070b 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -146,7 +146,6 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 - assert textcat._added_strings == {"NEGATIVE", "POSITIVE"} for i in range(50): losses = {} @@ -168,7 +167,6 @@ def test_overfitting_IO(): cats2 = doc2.cats assert cats2["POSITIVE"] > 0.9 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001) - assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"} # Test scoring scores = nlp.evaluate(train_examples) diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 02d0c70dd..a00b2a688 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.vocab import Vocab from ..util import make_tempdir @@ -50,8 +51,9 @@ def custom_pipe(): else: self.cfg = None self.model = 
SerializableDummy() + self.vocab = vocab - return MyPipe(None) + return MyPipe(Vocab()) def tagger(): diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index dfd7f6bd4..951dd3035 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,13 +1,13 @@ import pytest -import srsly from spacy import registry, Vocab from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import TextCategorizer, SentenceRecognizer +from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.lang.en import English +from thinc.api import Linear import spacy from ..util import make_tempdir @@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser): assert label not in vocab2.strings parser2 = Parser(vocab2, model, **config) parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) - assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"} assert label in parser2.vocab.strings @@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): # check that custom labels are serialized as part of the component's strings.jsonl tagger.add_label(label) assert label in tagger.vocab.strings - assert tagger._added_strings == {label} file_path = d / "tagger1" tagger.to_disk(file_path) - strings = srsly.read_json(file_path / "strings.json") - assert strings == ["SomeWeirdLabel"] # ensure that the custom strings are loaded back in when using the tagger in another pipeline cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.resolve(cfg, validate=True)["model"] tagger2 = Tagger(de_vocab, model).from_disk(file_path) assert label in tagger2.vocab.strings - assert tagger2._added_strings == {label} def test_serialize_textcat_empty(en_vocab): @@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable(): assert nlp5.pipe_names == ["ner"] assert nlp5.component_names == ["ner"] assert nlp5.disabled == [] + + +def test_serialize_custom_trainable_pipe(): + class BadCustomPipe1(TrainablePipe): + def __init__(self, vocab): + pass + + class BadCustomPipe2(TrainablePipe): + def __init__(self, vocab): + self.vocab = vocab + self.model = None + + class CustomPipe(TrainablePipe): + def __init__(self, vocab, model): + self.vocab = vocab + self.model = model + + pipe = BadCustomPipe1(Vocab()) + with pytest.raises(ValueError): + pipe.to_bytes() + with make_tempdir() as d: + with pytest.raises(ValueError): + pipe.to_disk(d) + pipe = BadCustomPipe2(Vocab()) + with pytest.raises(ValueError): + pipe.to_bytes() + with make_tempdir() as d: + with pytest.raises(ValueError): + pipe.to_disk(d) + pipe = CustomPipe(Vocab(), Linear()) + pipe_bytes = pipe.to_bytes() + new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes) + assert new_pipe.to_bytes() == pipe_bytes + with make_tempdir() as d: + pipe.to_disk(d) + new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d) + assert new_pipe.to_bytes() == pipe_bytes diff --git a/spacy/util.py b/spacy/util.py index 47fbcce1c..58f951f86 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str: obj (Any): The Python object, typically a function or class. RETURNS (str): A human-readable name. 
""" - if hasattr(obj, "name"): + if hasattr(obj, "name") and obj.name is not None: return obj.name if hasattr(obj, "__name__"): return obj.__name__ From 74972744e589969af8d0ebc83259d92c9e9a5f2f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 19:08:57 +0200 Subject: [PATCH 16/29] Update Thinc --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d48886e0c..c175ded66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a43,<8.0.0a50", + "thinc>=8.0.0a44,<8.0.0a50", "blis>=0.4.0,<0.8.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 3f3886a60..d6b6267a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a43,<8.0.0a50 +thinc>=8.0.0a44,<8.0.0a50 blis>=0.4.0,<0.8.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 69d4e6347..d9414a4f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a43,<8.0.0a50 + thinc>=8.0.0a44,<8.0.0a50 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a43,<8.0.0a50 + thinc>=8.0.0a44,<8.0.0a50 blis>=0.4.0,<0.8.0 wasabi>=0.8.0,<1.1.0 srsly>=2.3.0,<3.0.0 From 539b0c10daef8bb5d6f7e4f230a02452c6569996 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 10 Oct 2020 19:14:48 +0200 Subject: [PATCH 17/29] Tidy up and auto-format --- spacy/lang/tr/lex_attrs.py | 5 +++-- spacy/lang/tr/syntax_iterators.py | 7 +++---- spacy/language.py | 6 ++++-- spacy/tests/conftest.py | 2 ++ spacy/tests/lang/tr/test_parser.py | 19 ++++++++++++------- spacy/tests/lang/tr/test_text.py | 5 ++--- spacy/tests/pipeline/test_entity_linker.py | 2 +- spacy/tests/regression/test_issue6207.py | 4 ++-- spacy/tests/test_models.py | 13 +++---------- 9 files changed, 32 insertions(+), 31 deletions(-) diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py index 3615f4b4c..d9e12c4aa 100644 --- a/spacy/lang/tr/lex_attrs.py +++ b/spacy/lang/tr/lex_attrs.py @@ -62,6 +62,7 @@ _ordinal_words = [ _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü") + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -75,11 +76,11 @@ def like_num(text): text_lower = text.lower() - #Check cardinal number + # Check cardinal number if text_lower in _num_words: return True - #Check ordinal number + # Check ordinal number if text_lower in _ordinal_words: return True if text_lower.endswith(_ordinal_endings): diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py index 665ccb590..d9b342949 100644 --- a/spacy/lang/tr/syntax_iterators.py +++ b/spacy/lang/tr/syntax_iterators.py @@ -49,11 +49,10 @@ def noun_chunks(doclike): prev_end = word.left_edge.i yield word.left_edge.i, extend_right(word), np_label elif word.dep == conj: - cc_token = word.left_edge + cc_token = word.left_edge prev_end = cc_token.i - yield cc_token.right_edge.i + 1, extend_right(word), np_label # Shave off cc tokens from the NP - - + # Shave off cc tokens from the NP + yield cc_token.right_edge.i + 1, extend_right(word), np_label SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/language.py 
b/spacy/language.py index 24e593043..dd790e85f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,5 @@ from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern -from typing import Tuple, Iterator +from typing import Tuple from dataclasses import dataclass import random import itertools @@ -1197,7 +1197,9 @@ class Language: doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(method="Language.initialize", obj=type(get_examples)) + err = Errors.E930.format( + method="Language.initialize", obj=type(get_examples) + ) raise TypeError(err) # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7f8ab6768..3b0de899b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -239,10 +239,12 @@ def th_tokenizer(): def tr_tokenizer(): return get_lang_class("tr")().tokenizer + @pytest.fixture(scope="session") def tr_vocab(): return get_lang_class("tr").Defaults.create_vocab() + @pytest.fixture(scope="session") def tt_tokenizer(): return get_lang_class("tt")().tokenizer diff --git a/spacy/tests/lang/tr/test_parser.py b/spacy/tests/lang/tr/test_parser.py index ff71ac3d4..b23d0869c 100644 --- a/spacy/tests/lang/tr/test_parser.py +++ b/spacy/tests/lang/tr/test_parser.py @@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer): assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı " -def test_tr_noun_chunks_acl_nmod(tr_tokenizer): +def test_tr_noun_chunks_acl_nmod2(tr_tokenizer): text = "bildiğim bir turizm şirketi" heads = [3, 3, 3, 3] deps = ["acl", "det", "nmod", "ROOT"] @@ -308,7 +308,7 @@ def test_tr_noun_chunks_np_recursive_four_nouns(tr_tokenizer): assert len(chunks) == 1 assert chunks[0].text_with_ws == "kızına piyano dersi verdiğim hanım " - + def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer): text = "içine birkaç çiçek konmuş olan bir vazo" heads = [3, 2, 3, 6, 3, 6, 6] @@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer): def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer): text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo" heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9] - deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"] + deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"] pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"] tokens = tr_tokenizer(text) doc = Doc( @@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer): ) chunks = list(doc.noun_chunks) assert len(chunks) == 1 - assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo " + assert ( + chunks[0].text_with_ws + == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo " + ) def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): @@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): assert len(chunks) == 1 assert chunks[0].text_with_ws == "kız ve erkek çocuklar " -def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer): + +def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer): text = "tatlı ve gürbüz çocuklar" heads = [3, 2, 0, 3] deps = ["amod", "cc", "conj", "ROOT"] @@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer): assert 
chunks[0].text_with_ws == "ben " assert chunks[1].text_with_ws == "Sen " + def test_tr_noun_chunks_conj_three(tr_tokenizer): text = "sen, ben ve ondan" heads = [0, 2, 0, 4, 0] @@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer): assert chunks[2].text_with_ws == "sen " -def test_tr_noun_chunks_conj_three(tr_tokenizer): +def test_tr_noun_chunks_conj_three2(tr_tokenizer): text = "ben ya da sen ya da onlar" heads = [0, 3, 1, 0, 6, 4, 3] deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"] @@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer): assert chunks[0].text_with_ws == "Gazi Mustafa Kemal " -def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer): +def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer): text = "Ahmet Vefik Paşa" heads = [2, 0, 2] deps = ["nmod", "flat", "ROOT"] diff --git a/spacy/tests/lang/tr/test_text.py b/spacy/tests/lang/tr/test_text.py index 01e279d76..ed7dbb805 100644 --- a/spacy/tests/lang/tr/test_text.py +++ b/spacy/tests/lang/tr/test_text.py @@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num "üçüncü", "beşinci", "100üncü", - "8inci" - ] + "8inci", + ], ) def test_tr_lex_attrs_like_number_cardinal_ordinal(word): assert like_num(word) @@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word): def test_tr_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) - diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ff2e33fc7..e0c63d09e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -446,7 +446,7 @@ def test_overfitting_IO(): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True,) + entity_linker = nlp.add_pipe("entity_linker", last=True) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py index 47e3803e9..9d8b047bf 100644 --- a/spacy/tests/regression/test_issue6207.py +++ b/spacy/tests/regression/test_issue6207.py @@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer): # Make spans s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 result = filter_spans((s1, s2, s3)) assert s1 in result diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 8ca7f8b66..e8884e6b2 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -1,10 +1,8 @@ from typing import List - import pytest from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy - from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.ml.staticvectors import StaticVectors @@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): assert_array_equal(get_all_params(model1), get_all_params(model2)) -@pytest.mark.parametrize( - "model_func,kwargs", - [ - (StaticVectors, {"nO": 128, "nM": 300}), - ] -) +@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})]) def test_empty_docs(model_func, kwargs): nlp = English() model = 
model_func(**kwargs).initialize()
@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)

From 68d79796c65d83b934e785bb3d8ffbea16fe832f Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sat, 10 Oct 2020 20:59:48 +0200
Subject: [PATCH 18/29] add test for vocab after serializing KB

---
 spacy/pipeline/trainable_pipe.pyx          |  2 +-
 spacy/tests/pipeline/test_entity_linker.py | 28 +++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 88e50e7c6..07cb01059 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -243,7 +243,7 @@ cdef class TrainablePipe(Pipe):
     def _validate_serialization_attrs(self):
         """Check that the pipe implements the required attributes. If a subclass
         implements a custom __init__ method but doesn't set these attributes,
-        the currently default to None, so we need to perform additonal checks.
+        they currently default to None, so we need to perform additional checks.
         """
         if not hasattr(self, "vocab") or self.vocab is None:
             raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e0c63d09e..673a354dd 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -5,6 +5,7 @@ from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab

 from spacy import util, registry
+from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
@@ -215,7 +216,7 @@ def test_el_pipe_configuration(nlp):
         return kb

     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},)
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False})
     entity_linker.set_kb(create_kb)

     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -496,6 +497,31 @@ def test_overfitting_IO(): assert predictions == GOLD_entities +def test_kb_serialization(): + # Test that the KB can be used in a pipeline with a different vocab + vector_length = 3 + with make_tempdir() as tmp_dir: + kb_dir = tmp_dir / "kb" + nlp1 = English() + assert "Q2146908" not in nlp1.vocab.strings + mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + assert "Q2146908" in nlp1.vocab.strings + mykb.to_disk(kb_dir) + + nlp2 = English() + nlp2.vocab.strings.add("RandomWord") + assert "RandomWord" in nlp2.vocab.strings + assert "Q2146908" not in nlp2.vocab.strings + + # Create the Entity Linker component with the KB from file, and check the final vocab + entity_linker = nlp2.add_pipe("entity_linker", last=True) + entity_linker.set_kb(load_kb(kb_dir)) + assert "Q2146908" in nlp2.vocab.strings + assert "RandomWord" in nlp2.vocab.strings + + def test_scorer_links(): train_examples = [] nlp = English() From 3a505e7e14acf70e82910ca285b762259f20d5d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 10 Oct 2020 21:05:28 +0200 Subject: [PATCH 19/29] small edit to ensure the new word was indeed new --- spacy/tests/pipeline/test_entity_linker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 673a354dd..f2e6defcb 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -511,6 +511,7 @@ def test_kb_serialization(): mykb.to_disk(kb_dir) nlp2 = English() + assert "RandomWord" not in nlp2.vocab.strings nlp2.vocab.strings.add("RandomWord") assert "RandomWord" in nlp2.vocab.strings assert "Q2146908" not in nlp2.vocab.strings From 99606e46fe90a8cb813a10d62d2d234ebdf4540f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 12:30:57 +0200 Subject: [PATCH 20/29] Relax meta.json schema [ci skip] --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 07d17d193..f3664acff 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers") + performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on From ab890a35f9b54c625d423930cf81e75a27bfa69d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 12:55:46 +0200 Subject: [PATCH 21/29] Make console logger table more compact --- spacy/training/loggers.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index b431ecf06..79459a89b 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,11 +11,25 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 +def setup_table( + *, cols: List[str], widths: List[int], max_width: int = 13 +) -> Tuple[List[str], List[int], 
List[str]]: + final_cols = [] + final_widths = [] + for col, width in zip(cols, widths): + if len(col) > max_width: + col = col[: max_width - 3] + "..." # shorten column if too long + final_cols.append(col.upper()) + final_widths.append(max(len(col), width)) + return final_cols, final_widths, ["r" for _ in final_widths] + + @registry.loggers("spacy.ConsoleLogger.v1") def console_logger(progress_bar: bool = False): def setup_printer( nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: + write = lambda text: stdout.write(f"{text}\n") msg = Printer(no_print=True) # ensure that only trainable components are logged logged_pipes = [ @@ -26,15 +40,14 @@ def console_logger(progress_bar: bool = False): eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] - score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] - loss_widths = [max(len(col), 8) for col in loss_cols] - table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] - table_header = [col.upper() for col in table_header] - table_widths = [3, 6] + loss_widths + score_widths + [6] - table_aligns = ["r" for _ in table_widths] - stdout.write(msg.row(table_header, widths=table_widths) + "\n") - stdout.write(msg.row(["-" * width for width in table_widths]) + "\n") + spacing = 2 + table_header, table_widths, table_aligns = setup_table( + cols=["E", "#"] + loss_cols + score_cols + ["Score"], + widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6], + ) + write(msg.row(table_header, widths=table_widths, spacing=spacing)) + write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None def log_step(info: Optional[Dict[str, Any]]) -> None: @@ -70,7 +83,9 @@ def console_logger(progress_bar: bool = False): ) if progress is not None: progress.close() - stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n") + write( + msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing) + ) if progress_bar: # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( From 4fa967ea843c2b1db0147a2b4d303266e5563f73 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 11 Oct 2020 13:10:58 +0200 Subject: [PATCH 22/29] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index efdfd26c0..38efce3e9 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a38" +__version__ = "3.0.0a39" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 40276fd3be231be6969f8c51889c13e77a726fa8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 12 Oct 2020 11:41:27 +0200 Subject: [PATCH 23/29] update NEL docs after latest refactor --- spacy/ml/models/entity_linker.py | 3 +- website/docs/api/architectures.md | 19 +++---- website/docs/api/entitylinker.md | 84 ++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index d945e5fba..f37203b1b 100644 --- 
a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Optional, Callable, Iterable from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear @@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: @registry.misc.register("spacy.KBFromFile.v1") -def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: +def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]: def kb_from_file(vocab): kb = KnowledgeBase(vocab, entity_vector_length=1) kb.from_disk(kb_path) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5246a3ed6..3157c261a 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -637,13 +637,6 @@ into the "real world". This requires 3 main components: > window_size = 1 > maxout_pieces = 3 > subword_features = true -> -> [kb_loader] -> @misc = "spacy.EmptyKB.v1" -> entity_vector_length = 64 -> -> [get_candidates] -> @misc = "spacy.CandidateGenerator.v1" > ``` The `EntityLinker` model architecture is a Thinc `Model` with a @@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a ### spacy.EmptyKB.v1 {#EmptyKB} -A function that creates a default, empty `KnowledgeBase` from a -[`Vocab`](/api/vocab) instance. +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.KBFromFile.v1 {#KBFromFile} + +A function that reads an existing `KnowledgeBase` from file. + +| Name | Description | +| --------- | -------------------------------------------------------- | +| `kb_path` | The location of the KB that was stored to file. ~~Path~~ | + ### spacy.CandidateGenerator.v1 {#CandidateGenerator} A function that takes as input a [`KnowledgeBase`](/api/kb) and a diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 169a175e2..0904bbf72 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters. > "incl_prior": True, > "incl_context": True, > "model": DEFAULT_NEL_MODEL, -> "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64}, +> "entity_vector_length": 64, > "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'}, > } > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| Setting | Description | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters. > config = {"model": {"@architectures": "my_el.v1"}} > entity_linker = nlp.add_pipe("entity_linker", config=config) > -> # Construction via add_pipe with custom KB and candidate generation -> config = {"kb": {"@misc": "my_kb.v1"}} -> entity_linker = nlp.add_pipe("entity_linker", config=config) -> > # Construction from class > from spacy.pipeline import EntityLinker > entity_linker = EntityLinker(nlp.vocab, model) @@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters. Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal -`KnowledgeBase` as well as the Candidate generator can be customized by -providing custom registered functions. +[`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. 
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~  |
+| `labels_discard`       | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                    |
+| `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                   |
+| `incl_context`         | Whether or not to include the local context in the model. ~~bool~~                                                                 |

 ## EntityLinker.\_\_call\_\_ {#call tag="method"}

@@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~                     |

+## EntityLinker.set_kb {#set_kb tag="method" new="3"}
+
+Define the component's `KnowledgeBase`. The `kb_loader` should be a function
+that takes a `Vocab` instance and creates the `KnowledgeBase`, ensuring that
+the strings of the knowledge base are synced with the current vocab.
+
+> #### Example
+>
+> ```python
+> def create_kb(vocab):
+>     kb = KnowledgeBase(vocab, entity_vector_length=128)
+>     kb.add_entity(...)
+>     kb.add_alias(...)
+>     return kb
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.set_kb(create_kb)
+> ```
+
+| Name        | Description                                                                                                       |
+| ----------- | ----------------------------------------------------------------------------------------------------------------- |
+| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
+
 ## EntityLinker.initialize {#initialize tag="method" new="3"}

 Initialize the component for training. `get_examples` should be a function that
@@ -150,6 +172,11 @@ network, setting up the label scheme based on the data. This method is
 typically called by [`Language.initialize`](/api/language#initialize).

+Optionally, a `kb_loader` argument may be specified to change the internal
+knowledge base. This argument should be a function that takes a `Vocab` instance
+and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
+are synced with the current vocab.
+
 This method was previously called `begin_training`.

 > #### Example
> > ```python > entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.initialize(lambda: [], nlp=nlp) +> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb) > ``` | Name | Description | @@ -168,6 +195,7 @@ This method was previously called `begin_training`. | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | ## EntityLinker.predict {#predict tag="method"} From 1f465bea185d6aff3f4320b84f6a006b72b71917 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 13 Oct 2020 09:27:19 +0200 Subject: [PATCH 24/29] if-else --- spacy/util.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 58f951f86..8335a4fcc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1361,11 +1361,12 @@ def check_bool_env_var(env_var: str) -> bool: def _pipe(docs, proc, kwargs): if hasattr(proc, "pipe"): yield from proc.pipe(docs, **kwargs) - # We added some args for pipe that __call__ doesn't expect. - kwargs = dict(kwargs) - for arg in ["batch_size"]: - if arg in kwargs: - kwargs.pop(arg) - for doc in docs: - doc = proc(doc, **kwargs) - yield doc + else: + # We added some args for pipe that __call__ doesn't expect. + kwargs = dict(kwargs) + for arg in ["batch_size"]: + if arg in kwargs: + kwargs.pop(arg) + for doc in docs: + doc = proc(doc, **kwargs) + yield doc From a0e12c136b5864e7c0390a70902b2b158118d9b8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 10:00:53 +0200 Subject: [PATCH 25/29] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 38efce3e9..2aeef3c8d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a39" +__version__ = "3.0.0a40" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 4d99d2b94a73d7d950f92526efc5a5f6f9b98121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 11:38:52 +0200 Subject: [PATCH 26/29] Update docs [ci skip] --- website/docs/api/entitylinker.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 0904bbf72..683927b1c 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -46,7 +46,7 @@ architectures and their arguments and hyperparameters. | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. 
~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python From 86d648740fc4f1fea9ac5c779c2d578c2431cafe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 11:39:03 +0200 Subject: [PATCH 27/29] Fix morph representation in Doc.to_json --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4a57e4c83..abc82030d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1419,7 +1419,7 @@ cdef class Doc: if include_annotation["POS"]: token_data["pos"] = token.pos_ if include_annotation["MORPH"]: - token_data["morph"] = token.morph + token_data["morph"] = token.morph.to_json() if include_annotation["LEMMA"]: token_data["lemma"] = token.lemma_ if include_annotation["DEP"]: From f8a1c1afd6fff111b4434e6d19a2b1aec5b55501 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 13 Oct 2020 14:39:59 +0200 Subject: [PATCH 28/29] avoid dropout at runtime (#6247) --- spacy/about.py | 2 +- spacy/ml/staticvectors.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 2aeef3c8d..9c5dd0b4f 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a40" +__version__ = "3.0.0a41" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index da731dadb..f0213a9b8 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -39,7 +39,6 @@ def forward( key_attr = model.attrs["key_attr"] W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) V = cast(Floats2d, docs[0].vocab.vectors.data) - mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) rows = model.ops.flatten( [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] ) @@ -47,8 +46,11 @@ def forward( model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True), model.ops.asarray([len(doc) for doc in docs], dtype="i"), ) - if mask is not None: - output.data *= mask + mask = None + if is_train: + mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) + if mask is not None: + output.data *= mask def backprop(d_output: Ragged) -> List[Doc]: if mask is not None: From 1f4930086209128876e2804ae070ded54471e6f2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Oct 2020 15:41:17 +0200 Subject: [PATCH 29/29] Update transformer recommendations [ci skip] --- .../quickstart_training_recommendations.yml | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 206e69954..54aec2e31 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -32,10 +32,10 @@ es: word_vectors: null transformer: efficiency: - name: mrm8488/RuPERTa-base + name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 accuracy: - name: 
mrm8488/RuPERTa-base + name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 sv: word_vectors: null @@ -101,3 +101,21 @@ pl: accuracy: name: dkleczek/bert-base-polish-cased-v1 size_factor: 3 +nl: + word_vectors: null + transformer: + efficiency: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 + accuracy: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 +pt: + word_vectors: null + transformer: + efficiency: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 + accuracy: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3