From ecbb9c4b9f89120ba04642852780d592c024b6ef Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 11:50:42 +0100 Subject: [PATCH 01/41] load Underscore state when multiprocessing --- spacy/language.py | 11 ++++++++--- spacy/tokens/underscore.py | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 5544b6341..71180a65d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -15,6 +15,7 @@ import multiprocessing as mp from itertools import chain, cycle from .tokenizer import Tokenizer +from .tokens.underscore import Underscore from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups @@ -852,7 +853,10 @@ class Language(object): sender.send() procs = [ - mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch)) + mp.Process( + target=_apply_pipes, + args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] for proc in procs: @@ -1107,7 +1111,7 @@ def _pipe(docs, proc, kwargs): yield doc -def _apply_pipes(make_doc, pipes, reciever, sender): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1115,8 +1119,9 @@ def _apply_pipes(make_doc, pipes, reciever, sender): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` """ + Underscore.load_state(underscore_state) while True: - texts = reciever.get() + texts = receiver.get() docs = (make_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b36fe9294..8dac8526e 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -79,6 +79,14 @@ class Underscore(object): def _get_key(self, name): return ("._.", name, self._start, self._end) + @classmethod + def get_state(cls): + return cls.token_extensions, cls.span_extensions, cls.doc_extensions + + @classmethod + def load_state(cls, state): + cls.token_extensions, cls.span_extensions, cls.doc_extensions = state + def get_ext_args(**kwargs): """Validate and convert arguments. 
Reused in Doc, Token and Span.""" From 05dedaa2cf2e57469ac860fbd0af638c27c02148 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 12:00:13 +0100 Subject: [PATCH 02/41] add unit test --- spacy/tests/regression/test_issue4903.py | 40 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4924.py | 2 +- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue4903.py diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py new file mode 100644 index 000000000..97293aec7 --- /dev/null +++ b/spacy/tests/regression/test_issue4903.py @@ -0,0 +1,40 @@ +# coding: utf8 +from __future__ import unicode_literals + +import spacy +from spacy.tokens import Span, Doc + + +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +def test_issue4903(): + # ensures that this runs correctly and doesn't hang or crash on Windows / macOS + nlp = spacy.load("en_core_web_sm") + custom_component = CustomPipe() + nlp.add_pipe(custom_component, after="parser") + + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + # works without 'n_process' + for doc in nlp.pipe(text, n_process=2): + print(doc) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 8aea2c3d5..0e45291a9 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -11,6 +11,6 @@ def nlp(): return spacy.blank("en") -def test_evaluate(nlp): +def test_issue4924(nlp): docs_golds = [("", {})] nlp.evaluate(docs_golds) From 65f5b48b5db0e8e11e73e505469ccdb38e8f07af Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Feb 2020 12:06:27 +0100 Subject: [PATCH 03/41] add comment --- spacy/language.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language.py b/spacy/language.py index 71180a65d..737e0bf3c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1118,6 +1118,7 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): created by `multiprocessing.Pipe()` sender (multiprocessing.Connection): Pipe to send doc. 
Usually created by `multiprocessing.Pipe()`
+    underscore_state (tuple): The data in the Underscore class of the parent
     """
     Underscore.load_state(underscore_state)
     while True:

From 51d37033c8b2f280cfc0ddf2b1ecf0537f347532 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:10:05 +0100
Subject: [PATCH 04/41] remove old comment

---
 spacy/tests/regression/test_issue4903.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 97293aec7..d09b32849 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,6 +35,5 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    # works without 'n_process'
     for doc in nlp.pipe(text, n_process=2):
         print(doc)

From 46628d88903edaa2c3614339a0d464b9fcdcc690 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:12:52 +0100
Subject: [PATCH 05/41] add some asserts

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index d09b32849..0a255d9a8 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -35,5 +35,7 @@ def test_issue4903():
     nlp.add_pipe(custom_component, after="parser")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    for doc in nlp.pipe(text, n_process=2):
-        print(doc)
+    docs = list(nlp.pipe(text, n_process=2))
+    assert docs[0].text == "I like bananas."
+    assert docs[1].text == "Do you like them?"
+    assert docs[2].text == "No, I prefer wasabi."

From 7939c6388656e1abb932b2deb1af90928c297aa2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 12:26:27 +0100
Subject: [PATCH 06/41] use English instead of model

---
 spacy/tests/regression/test_issue4903.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py
index 0a255d9a8..82e21b79f 100644
--- a/spacy/tests/regression/test_issue4903.py
+++ b/spacy/tests/regression/test_issue4903.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import spacy
+from spacy.lang.en import English
 from spacy.tokens import Span, Doc
 
 
@@ -30,9 +31,10 @@ class CustomPipe:
 
 def test_issue4903():
     # ensures that this runs correctly and doesn't hang or crash on Windows / macOS
-    nlp = spacy.load("en_core_web_sm")
+    nlp = English()
     custom_component = CustomPipe()
-    nlp.add_pipe(custom_component, after="parser")
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+    nlp.add_pipe(custom_component, after="sentencizer")
 
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]

From 6e717c62ed2d0407b37ae0e19c033964425419cc Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 12 Feb 2020 13:21:31 +0100
Subject: [PATCH 07/41] avoid the tests interacting with each other through the global Underscore variable

---
 spacy/tests/regression/test_issue4849.py | 6 ++++++
 spacy/tests/regression/test_issue4903.py | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py
index 834219773..7e58243bc 100644
--- a/spacy/tests/regression/test_issue4849.py
+++ b/spacy/tests/regression/test_issue4849.py
@@ -3,11 +3,17 @@ from __future__ import 
unicode_literals from spacy.lang.en import English from spacy.pipeline import EntityRuler +from spacy.tokens.underscore import Underscore def test_issue4849(): nlp = English() + # reset the Underscore object because test_underscore has a lambda function that can't be pickled + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + ruler = EntityRuler( nlp, patterns=[ {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 82e21b79f..156845558 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import spacy from spacy.lang.en import English from spacy.tokens import Span, Doc +from spacy.tokens.underscore import Underscore class CustomPipe: @@ -31,6 +32,12 @@ class CustomPipe: def test_issue4903(): # ensures that this runs correctly and doesn't hang or crash on Windows / macOS + + # reset the Underscore object because test_underscore has a lambda function that can't be pickled + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + nlp = English() custom_component = CustomPipe() nlp.add_pipe(nlp.create_pipe("sentencizer")) From d1f0b397b5a8cba5e59dd5448a831932055c7f45 Mon Sep 17 00:00:00 2001 From: questoph Date: Thu, 13 Feb 2020 22:18:51 +0100 Subject: [PATCH 08/41] Update punctuation.py --- spacy/lang/lb/punctuation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py index 1571e13d7..2a4587856 100644 --- a/spacy/lang/lb/punctuation.py +++ b/spacy/lang/lb/punctuation.py @@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_ ELISION = " ' ’ ".strip().replace(" ", "") +abbrev = ("d", "D") + _infixes = ( LIST_ELLIPSES + LIST_ICONS + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION), + r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), From 5352fc8fc3f06e99a2d5159da7d4c226be1b82c1 Mon Sep 17 00:00:00 2001 From: questoph Date: Fri, 14 Feb 2020 12:02:15 +0100 Subject: [PATCH 09/41] Update tokenizer_exceptions.py --- spacy/lang/lb/tokenizer_exceptions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index b32daa58c..1c9b2dde3 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -10,6 +10,8 @@ _exc = {} # translate / delete what is not necessary for exc_data in [ + {ORTH: "’t", LEMMA: "et", NORM: "et"}, + {ORTH: "’T", LEMMA: "et", NORM: "et"}, {ORTH: "'t", LEMMA: "et", NORM: "et"}, {ORTH: "'T", LEMMA: "et", NORM: "et"}, {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, From 3853d385faad420f94f39223da148265113149e1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Feb 2020 13:41:24 +0100 Subject: [PATCH 10/41] Fix formatting in Token API --- website/docs/api/token.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 68402d1b4..c30c01c20 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -437,8 +437,8 @@ The L2 norm of the token's vector 
representation. | `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | | `lower` | int | Lowercase form of the token. | | `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | | `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. 
| From b49a3afd0cde67debd2128b2cf2c816322c6d0d7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 23 Feb 2020 15:49:20 +0100 Subject: [PATCH 11/41] use clean_underscore fixture --- spacy/tests/doc/test_underscore.py | 9 +++++++++ spacy/tests/matcher/test_matcher_api.py | 2 ++ spacy/tests/regression/test_issue4849.py | 5 ----- spacy/tests/regression/test_issue4903.py | 5 ----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 2877bfeea..c1eff2c20 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -7,6 +7,15 @@ from spacy.tokens import Doc, Span, Token from spacy.tokens.underscore import Underscore +@pytest.fixture(scope="function", autouse=True) +def clean_underscore(): + # reset the Underscore object after the test, to avoid having state copied across tests + yield + Underscore.doc_extensions = {} + Underscore.span_extensions = {} + Underscore.token_extensions = {} + + def test_create_doc_underscore(): doc = Mock() doc.doc = doc diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e4584d03a..a826a0a0e 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,6 +6,7 @@ import re from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token +from ..doc.test_underscore import clean_underscore @pytest.fixture @@ -200,6 +201,7 @@ def test_matcher_any_token_operator(en_vocab): assert matches[2] == "test hello world" +@pytest.mark.usefixtures("clean_underscore") def test_matcher_extension_attribute(en_vocab): matcher = Matcher(en_vocab) get_is_fruit = lambda token: token.text in ("apple", "banana") diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 7e58243bc..85d03fe9a 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -9,11 +9,6 @@ from spacy.tokens.underscore import Underscore def test_issue4849(): nlp = English() - # reset the Underscore object because test_underscore has a lambda function that can't be pickled - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - ruler = EntityRuler( nlp, patterns=[ {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 156845558..9a3c10d61 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -33,11 +33,6 @@ class CustomPipe: def test_issue4903(): # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - # reset the Underscore object because test_underscore has a lambda function that can't be pickled - Underscore.doc_extensions = {} - Underscore.span_extensions = {} - Underscore.token_extensions = {} - nlp = English() custom_component = CustomPipe() nlp.add_pipe(nlp.create_pipe("sentencizer")) From 54d8665ff74239c42a0fb6f457c26a50bc269079 Mon Sep 17 00:00:00 2001 From: Santiago Castro Date: Mon, 24 Feb 2020 16:15:28 -0500 Subject: [PATCH 12/41] Add missing comma in a dependency specification Conda is complaining that it can't parse that line otherwise. 
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 55396e011..12d7a2e63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -59,7 +59,7 @@ install_requires = [options.extras_require] lookups = - spacy_lookups_data>=0.0.5<0.2.0 + spacy_lookups_data>=0.0.5,<0.2.0 cuda = cupy>=5.0.0b4 cuda80 = From d848a68340ad3e57212384e4c25e45da02b31990 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Feb 2020 12:07:42 +0100 Subject: [PATCH 13/41] thinc 7.4.0.dev2 --- requirements.txt | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4f0579313..4ceb3a838 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev0 +thinc==7.4.0.dev2 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 55396e011..78d5be7f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev0 + thinc==7.4.0.dev2 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev0 + thinc==7.4.0.dev2 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.1,<1.1.0 From dc36ec98a4f57b6f0e9e7d508b3152cb53e67da7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Feb 2020 16:46:14 +0100 Subject: [PATCH 14/41] Update pyproject.toml --- pyproject.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fed528d4a..8a6ababf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,11 @@ [build-system] -requires = ["setuptools"] +requires = [ + "setuptools", + "wheel", + "cython>=0.25", + "cymem>=2.0.2,<2.1.0", + "preshed>=3.0.2,<3.1.0", + "murmurhash>=0.28.0,<1.1.0", + "thinc==7.4.0.dev0", +] build-backend = "setuptools.build_meta" From 62406a951374ab18753153d9cea0d0faf9e070d9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Feb 2020 10:30:35 +0100 Subject: [PATCH 15/41] update from thinc 7.4.0.dev2 to 7.4.0 --- requirements.txt | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4ceb3a838..e908e25f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc==7.4.0.dev2 +thinc==7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 78d5be7f5..ac19f7bac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,13 +38,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc==7.4.0.dev2 + thinc==7.4.0 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc==7.4.0.dev2 + thinc==7.4.0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 srsly>=1.0.1,<1.1.0 From 18ff97589d3f1adccd9aa451959dbfe97f67e29a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Feb 2020 10:50:05 +0100 Subject: [PATCH 16/41] update spacy to 2.2.4.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a1880fb54..365c2adbb 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "2.2.3" +__version__ = "2.2.4.dev0" __release__ = True __download_url__ = 
"https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From ff184b7a9c64d954a7c7445e00c7505ed1d930f0 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 26 Feb 2020 12:10:38 +0100 Subject: [PATCH 17/41] Add tag_map argument to CLI debug-data and train (#4750) (#5038) Add an argument for a path to a JSON-formatted tag map, which is used to update and extend the default language tag map. --- spacy/cli/debug_data.py | 10 +++++++++- spacy/cli/train.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 4b12052c3..0e12a594c 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -26,6 +26,7 @@ BLANK_MODEL_THRESHOLD = 2000 lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), pipeline=( "Comma-separated names of pipeline components to train", @@ -41,6 +42,7 @@ def debug_data( lang, train_path, dev_path, + tag_map_path=None, base_model=None, pipeline="tagger,parser,ner", ignore_warnings=False, @@ -60,6 +62,10 @@ def debug_data( if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: @@ -67,6 +73,8 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) msg.divider("Data format validation") @@ -344,7 +352,7 @@ def debug_data( if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] - tag_map = nlp.Defaults.tag_map + tag_map = nlp.vocab.morphology.tag_map msg.info( "{} {} in data ({} {} in tag map)".format( len(labels), diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 5af93a8f3..968a009f6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -57,6 +57,7 @@ from .. import about textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool), textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -95,6 +96,7 @@ def train( textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, + tag_map_path=None, verbose=False, debug=False, ): @@ -132,6 +134,9 @@ def train( output_path.mkdir() msg.good("Created output directory: {}".format(output_path)) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. 
# Batch size starts at 1 and grows, so that we make updates quickly @@ -238,6 +243,9 @@ def train( pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) From 54da6a2a0717bcbba737e67a9f7ca201f62c6ef3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 26 Feb 2020 12:51:53 +0100 Subject: [PATCH 18/41] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8a6ababf3..827e2a797 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.4.0.dev0", + "thinc==7.4.0", ] build-backend = "setuptools.build_meta" From d1f703d78d1fa20078787d8655addd4a31c7c6a4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Feb 2020 13:06:52 +0100 Subject: [PATCH 19/41] Improve German tokenization Improve German tokenization with respect to Tiger. --- spacy/lang/de/__init__.py | 3 +++ spacy/lang/de/punctuation.py | 27 ++++++++++++++++++++++++++- spacy/lang/de/tokenizer_exceptions.py | 11 +++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1412f033a..dee1841c8 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP from .stop_words import STOP_WORDS @@ -22,6 +23,8 @@ class GermanDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index 7dfa61bd4..c376ce597 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -1,10 +1,32 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES +from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import _prefixes, _suffixes +_prefixes = ["``",] + list(_prefixes) + +_suffixes = ( + ["''", "/"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) + _quotes = CONCAT_QUOTES.replace("'", "") _infixes = ( @@ -15,6 +37,7 @@ _infixes = ( r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[0-9{a}])\/(?=[0-9{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), 
r"(?<=[0-9])-(?=[0-9])", @@ -22,4 +45,6 @@ _infixes = ( ) +TOKENIZER_PREFIXES = _prefixes +TOKENIZER_SUFFIXES = _suffixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 5b09a0b89..ebbbfba8c 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -160,6 +160,8 @@ for exc_data in [ for orth in [ + "``", + "''", "A.C.", "a.D.", "A.D.", @@ -175,10 +177,13 @@ for orth in [ "biol.", "Biol.", "ca.", + "CDU/CSU", "Chr.", "Cie.", + "c/o", "co.", "Co.", + "d'", "D.C.", "Dipl.-Ing.", "Dipl.", @@ -203,12 +208,18 @@ for orth in [ "i.G.", "i.Tr.", "i.V.", + "I.", + "II.", + "III.", + "IV.", + "Inc.", "Ing.", "jr.", "Jr.", "jun.", "jur.", "K.O.", + "L'", "L.A.", "lat.", "M.A.", From b4e0d2bf50fe6c654886eccb0395e47ccfbc3bef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Feb 2020 20:59:10 +0100 Subject: [PATCH 20/41] Improve Makefile (#5067) * Improve pex making * Update gitignore --- .gitignore | 2 ++ Makefile | 42 +++++++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index c4ad59fc7..828258603 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ __pycache__/ .~env/ .venv venv/ +env3.*/ .dev .denv .pypyenv @@ -56,6 +57,7 @@ lib64/ parts/ sdist/ var/ +wheelhouse/ *.egg-info/ pip-wheel-metadata/ Pipfile.lock diff --git a/Makefile b/Makefile index 5d15bccec..1be1c9794 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,36 @@ SHELL := /bin/bash -sha = $(shell "git" "rev-parse" "--short" "HEAD") +WHEELHOUSE := ./wheelhouse +PYVER := 3.6 +VENV := ./env$(PYVER) + version = $(shell "bin/get-version.sh") -wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl -dist/spacy.pex : dist/spacy-$(sha).pex - cp dist/spacy-$(sha).pex dist/spacy.pex - chmod a+rx dist/spacy.pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl + pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema + chmod a+rx dist/spacy-$(version).pex -dist/spacy-$(sha).pex : dist/$(wheel) - env3.6/bin/python -m pip install pex==1.5.3 - env3.6/bin/pex pytest dist/$(wheel) spacy_lookups_data -e spacy -o dist/spacy-$(sha).pex +dist/pytest.pex : wheelhouse/pytest-*.whl + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock + chmod a+rx dist/pytest.pex -dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py* - python3.6 -m venv env3.6 - source env3.6/bin/activate - env3.6/bin/pip install wheel - env3.6/bin/pip install -r requirements.txt --no-cache-dir - env3.6/bin/python setup.py build_ext --inplace - env3.6/bin/python setup.py sdist - env3.6/bin/python setup.py bdist_wheel +wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* + $(VENV)/bin/pip wheel . 
-w ./wheelhouse + $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse -.PHONY : clean +wheelhouse/pytest-%.whl : $(VENV)/bin/pex + $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse + +$(VENV) : + python$(PYVER) -m venv $(VENV) + $(VENV)/bin/python -m pip install pex wheel + +.PHONY : clean test + +test : dist/spacy-$(version).pex dist/pytest.pex + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x clean : setup.py source env3.6/bin/activate rm -rf dist/* + rm -rf ./wheelhouse python setup.py clean --all From 65d7bab10f540d3acd09da9c1cece5a166670a21 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 27 Feb 2020 18:43:00 +0100 Subject: [PATCH 21/41] Initialize all values in a2b/b2a in new align (#5063) --- spacy/gold.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 3884e1cba..07fd3bdd0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -151,6 +151,8 @@ def align(tokens_a, tokens_b): cost = 0 a2b = numpy.empty(len(tokens_a), dtype="i") b2a = numpy.empty(len(tokens_b), dtype="i") + a2b.fill(-1) + b2a.fill(-1) a2b_multi = {} b2a_multi = {} i = 0 @@ -160,7 +162,6 @@ def align(tokens_a, tokens_b): while i < len(tokens_a) and j < len(tokens_b): a = tokens_a[i][offset_a:] b = tokens_b[j][offset_b:] - a2b[i] = b2a[j] = -1 if a == b: if offset_a == offset_b == 0: a2b[i] = j From c6b12ab02adcdfe760bc10e249924553cb826410 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Mar 2020 11:49:28 +0100 Subject: [PATCH 22/41] Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method --- spacy/errors.py | 4 ++ spacy/pipeline/pipes.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 9 +-- spacy/tests/doc/test_token_api.py | 2 +- spacy/tests/parser/test_parse_navigate.py | 32 +++++----- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue4590.py | 2 +- spacy/tests/regression/test_issue5048.py | 35 +++++++++++ spacy/tests/test_displacy.py | 10 ++-- spacy/tests/util.py | 58 ++++++++++++++----- spacy/tokens/doc.pyx | 4 +- 12 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 spacy/tests/regression/test_issue5048.py diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..5957c5ecd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -107,6 +107,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Doc.from_array was called with a vector of type '{type}', " + "but is expecting one of type 'uint64' instead. This may result " + "in problems with the vocab further on in the pipeline.") @@ -541,6 +544,7 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + E189 = ("Each argument to `get_doc` should be of equal length.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3b190debe..a20c9b6df 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -367,7 +367,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 52f856d3e..19d908529 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", - "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", - "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", - "ROOT", "amod", "dobj"] + deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det", + "amod", "pobj", "acl", "prep", "prep", "pobj", + "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) @@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + # fmt: off deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + # fmt: on doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..b7522bb98 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab): def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] - deps = ["nsubj", "ROOT", "cc", "conj"] + deps = ["nsubj", "ROOT", "cc", "conj", "dep"] doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..41524d45e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, - -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6, - 9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1, - 2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2, - 1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, + return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, + -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, + -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, + 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, + 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, + 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, + 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, + 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, + -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, + -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, + -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, + 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, + 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, + -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, - -1, -8, -9, -1] + 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, + -1, 0, -1, -1] # fmt: on diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..01f0f905c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -48,7 +48,7 @@ def test_issue2203(en_vocab): tag_ids = [en_vocab.strings.add(tag) for tag in tags] lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] doc = Doc(en_vocab, words=words) - # Work around lemma corrpution problem and set lemmas after tags + # Work around lemma corruption problem and set lemmas after tags doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) assert [t.tag_ for t in doc] == tags diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..1f5e44499 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -124,7 +124,7 @@ def test_issue2772(en_vocab): words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
- heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1] + heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is None diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..3d01cd487 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -27,7 +27,7 @@ def test_issue4590(en_vocab): text = "The quick brown fox jumped over the lazy fox" heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py new file mode 100644 index 000000000..228322493 --- /dev/null +++ b/spacy/tests/regression/test_issue5048.py @@ -0,0 +1,35 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from spacy.tokens import Doc +from spacy.attrs import DEP, POS, TAG + +from ..util import get_doc + + +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + + strings = en_vocab.strings + + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + + doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index d04c0506f..539714e0c 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab): deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ - {"lemma": None, "text": "This", "tag": "DET"}, - {"lemma": None, "text": "is", "tag": "AUX"}, - {"lemma": None, "text": "a", "tag": "DET"}, - {"lemma": None, "text": "sentence", "tag": "NOUN"}, + {"lemma": None, "text": words[0], "tag": pos[0]}, + {"lemma": None, "text": words[1], "tag": pos[1]}, + {"lemma": None, "text": words[2], "tag": pos[2]}, + {"lemma": None, "text": words[3], "tag": pos[3]}, ] assert deps["arcs"] == [ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, @@ -75,7 +75,7 @@ def test_displacy_rtl(): deps = ["foo", "bar", "foo", "baz"] heads = [1, 0, 1, -2] nlp = Persian() - doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps) + doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps) doc.ents = [Span(doc, 1, 3, label="TEST")] html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 9ee5b89f8..52768dd41 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -7,8 +7,10 @@ import shutil import contextlib import srsly from pathlib import Path + +from spacy import Errors from spacy.tokens import Doc, Span -from 
spacy.attrs import POS, HEAD, DEP +from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA from spacy.compat import path2str @@ -26,30 +28,54 @@ def make_tempdir(): shutil.rmtree(path2str(d)) -def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): +def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None): """Create Doc object from given vocab, words and annotations.""" - pos = pos or [""] * len(words) - tags = tags or [""] * len(words) - heads = heads or [0] * len(words) - deps = deps or [""] * len(words) - for value in deps + tags + pos: + if deps and not heads: + heads = [0] * len(deps) + headings = [] + values = [] + annotations = [pos, heads, deps, lemmas, tags] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG] + for a, annot in enumerate(annotations): + if annot is not None: + if len(annot) != len(words): + raise ValueError(Errors.E189) + headings.append(possible_headings[a]) + if annot is not heads: + values.extend(annot) + for value in values: vocab.strings.add(value) doc = Doc(vocab, words=words) - attrs = doc.to_array([POS, HEAD, DEP]) - for i, (p, head, dep) in enumerate(zip(pos, heads, deps)): - attrs[i, 0] = doc.vocab.strings[p] - attrs[i, 1] = head - attrs[i, 2] = doc.vocab.strings[dep] - doc.from_array([POS, HEAD, DEP], attrs) + + # if there are any other annotations, set them + if headings: + attrs = doc.to_array(headings) + + j = 0 + for annot in annotations: + if annot: + if annot is heads: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = heads[i] + else: + attrs[i,j] = heads[i] + else: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = doc.vocab.strings[annot[i]] + else: + attrs[i, j] = doc.vocab.strings[annot[i]] + j += 1 + doc.from_array(headings, attrs) + + # finally, set the entities if ents: doc.ents = [ Span(doc, start, end, label=doc.vocab.strings[label]) for start, end, label in ents ] - if tags: - for token in doc: - token.tag_ = tags[token.i] return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 63495ec86..11f1ddf5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -785,6 +785,8 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] + if array.dtype != numpy.uint64: + user_warning(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -872,7 +874,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID] # TODO: ENT_KB_ID ? + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ? 
if self.is_tagged: array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute From 2281c4708cc3dfa68ffcdff5554c18d8fae0c9de Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 2 Mar 2020 11:55:02 +0100 Subject: [PATCH 23/41] Restore empty tokenizer properties (#5026) * Restore empty tokenizer properties * Check for types in tokenizer.from_bytes() * Add test for setting empty tokenizer rules --- spacy/tests/serialize/test_serialize_tokenizer.py | 11 +++++++++-- spacy/tokenizer.pyx | 14 +++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..0e0816a55 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -15,12 +15,19 @@ def load_tokenizer(b): def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): - """Test that custom tokenizer with not all functions defined can be - serialized and deserialized correctly (see #2494).""" + """Test that custom tokenizer with not all functions defined or empty + properties can be serialized and deserialized correctly (see #2494, + #4991).""" tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search) tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]}) + tokenizer.rules = {} + tokenizer_bytes = tokenizer.to_bytes() + tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + assert tokenizer_reloaded.rules == {} + @pytest.mark.skip(reason="Currently unreliable across platforms") @pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"]) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 230f41921..12c7b73af 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -14,7 +14,7 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode +from .compat import unescape_unicode, basestring_ from .attrs import intify_attrs from .symbols import ORTH @@ -568,22 +568,22 @@ cdef class Tokenizer: for key in ["prefix_search", "suffix_search", "infix_finditer"]: if key in data: data[key] = unescape_unicode(data[key]) - if data.get("prefix_search"): + if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): self.prefix_search = re.compile(data["prefix_search"]).search - if data.get("suffix_search"): + if "suffix_search" in data and isinstance(data["suffix_search"], basestring_): self.suffix_search = re.compile(data["suffix_search"]).search - if data.get("infix_finditer"): + if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_): self.infix_finditer = re.compile(data["infix_finditer"]).finditer - if data.get("token_match"): + if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if data.get("rules"): + if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} self._reset_cache([key for key in self._cache]) self._reset_specials() self._cache = PreshMap() self._specials = PreshMap() - self._load_special_tokenization(data.get("rules", {})) + self._load_special_tokenization(data["rules"]) return self From 697bec764de41e39582caadc14608607c2af8d09 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:22:39 +0100 
Subject: [PATCH 24/41] Normalize IS_SENT_START to SENT_START for Matcher (#5080) --- spacy/matcher/_schemas.py | 4 ++++ spacy/matcher/matcher.pyx | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 1b10f0dd5..4ef7ae49a 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -170,6 +170,10 @@ TOKEN_PATTERN_SCHEMA = { "title": "Token is the first in a sentence", "$ref": "#/definitions/boolean_value", }, + "SENT_START": { + "title": "Token is the first in a sentence", + "$ref": "#/definitions/boolean_value", + }, "LIKE_NUM": { "title": "Token resembles a number", "$ref": "#/definitions/boolean_value", diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..11461afb8 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -670,6 +670,8 @@ def _get_attr_values(spec, string_store): continue if attr == "TEXT": attr = "ORTH" + if attr == "IS_SENT_START": + attr = "SENT_START" if attr not in TOKEN_PATTERN_SCHEMA["items"]["properties"]: raise ValueError(Errors.E152.format(attr=attr)) attr = IDS.get(attr) diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..c536698d0 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -34,6 +34,8 @@ TEST_PATTERNS = [ ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0), ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0), ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0), + ([{"IS_SENT_START": True}], 0, 0), + ([{"SENT_START": True}], 0, 0), ] XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)] From d078b47c81acdce5ece828f2f7d6e193bb3840ce Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 12:29:05 +0100 Subject: [PATCH 25/41] Break out of infinite loop as intended (#5077) --- spacy/tokens/doc.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 11f1ddf5f..5997be804 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1175,6 +1175,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) if loop_count > 10: user_warning(Warnings.W026) + break loop_count += 1 # Set sentence starts for i in range(length): From d307e9ca58c84dc24e6717fccafe7b55c604ee7c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:22 +0100 Subject: [PATCH 26/41] take care of global vectors in multiprocessing (#5081) * restore load_nlp.VECTORS in the child process * add unit test * fix test * remove unnecessary import * add utf8 encoding * import unicode_literals --- spacy/_ml.py | 3 +-- spacy/language.py | 9 ++++++-- spacy/tests/regression/test_issue4725.py | 26 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4849.py | 1 - spacy/tests/regression/test_issue4903.py | 2 -- 5 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 spacy/tests/regression/test_issue4725.py diff --git a/spacy/_ml.py b/spacy/_ml.py index 8695a88cc..fb7d39255 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -296,8 +296,7 @@ def link_vectors_to_models(vocab): key = (ops.device, vectors.name) if key in thinc.extra.load_nlp.VECTORS: if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - # This is a hack to avoid the problem in #3853. 
Maybe we should - # print a warning as well? + # This is a hack to avoid the problem in #3853. old_name = vectors.name new_name = vectors.name + "_%d" % data.shape[0] user_warning(Warnings.W019.format(old=old_name, new=new_name)) diff --git a/spacy/language.py b/spacy/language.py index 16aa4967e..28fddfebb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -3,6 +3,9 @@ from __future__ import absolute_import, unicode_literals import random import itertools + +from thinc.extra import load_nlp + from spacy.util import minibatch import weakref import functools @@ -856,7 +859,7 @@ class Language(object): procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] @@ -1112,7 +1115,7 @@ def _pipe(docs, proc, kwargs): yield doc -def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): +def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state, vectors): """Worker for Language.pipe receiver (multiprocessing.Connection): Pipe to receive text. Usually @@ -1120,8 +1123,10 @@ def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state): sender (multiprocessing.Connection): Pipe to send doc. Usually created by `multiprocessing.Pipe()` underscore_state (tuple): The data in the Underscore class of the parent + vectors (dict): The global vectors data, copied from the parent """ Underscore.load_state(underscore_state) + load_nlp.VECTORS = vectors while True: texts = receiver.get() docs = (make_doc(text) for text in texts) diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py new file mode 100644 index 000000000..f80f19852 --- /dev/null +++ b/spacy/tests/regression/test_issue4725.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy + +from spacy.lang.en import English +from spacy.vocab import Vocab + + +def test_issue4725(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py index 85d03fe9a..834219773 100644 --- a/spacy/tests/regression/test_issue4849.py +++ b/spacy/tests/regression/test_issue4849.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from spacy.lang.en import English from spacy.pipeline import EntityRuler -from spacy.tokens.underscore import Underscore def test_issue4849(): diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py index 9a3c10d61..d467b1cd6 100644 --- a/spacy/tests/regression/test_issue4903.py +++ b/spacy/tests/regression/test_issue4903.py @@ -1,10 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -import spacy from spacy.lang.en import English from spacy.tokens import Span, Doc -from spacy.tokens.underscore import Underscore class CustomPipe: From a0998868ffe6d0d8d1a610374f537a4f41eda83e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 3 Mar 2020 13:58:56 +0100 Subject: 
[PATCH 27/41] prevent updating cfg if the Model was already defined (#5078) --- spacy/syntax/nn_parser.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 153ca67cd..d5c6bf2a8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -606,7 +606,6 @@ cdef class Parser: if not hasattr(get_gold_tuples, '__call__'): gold_tuples = get_gold_tuples get_gold_tuples = lambda: gold_tuples - cfg.setdefault('min_action_freq', 30) actions = self.moves.get_actions(gold_parses=get_gold_tuples(), min_freq=cfg.get('min_action_freq', 30), learn_tokens=self.cfg.get("learn_tokens", False)) @@ -616,8 +615,9 @@ cdef class Parser: if label not in actions[action]: actions[action][label] = freq self.moves.initialize_actions(actions) - cfg.setdefault('token_vector_width', 96) if self.model is True: + cfg.setdefault('min_action_freq', 30) + cfg.setdefault('token_vector_width', 96) self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() @@ -633,11 +633,11 @@ cdef class Parser: if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) + self.cfg.update(cfg) else: if sgd is None: sgd = self.create_optimizer() self.model.begin_training([]) - self.cfg.update(cfg) return sgd def to_disk(self, path, exclude=tuple(), **kwargs): From 8c20dae6f7b1d5ac056402e0057269ce80dba0fa Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:43:25 +0100 Subject: [PATCH 28/41] Fix model-final/model-best meta from train CLI (#5093) * Fix model-final/model-best meta * include speed and accuracy from final iteration * combine with speeds from base model if necessary * Include token_acc metric for all components --- spacy/cli/train.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 968a009f6..59b0f2225 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -554,7 +554,30 @@ def train( with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) - final_meta = srsly.read_json(output_path / "model-final" / "meta.json") + meta_loc = output_path / "model-final" / "meta.json" + final_meta = srsly.read_json(meta_loc) + final_meta.setdefault("accuracy", {}) + final_meta["accuracy"].update(meta.get("accuracy", {})) + final_meta.setdefault("speed", {}) + final_meta["speed"].setdefault("cpu", None) + final_meta["speed"].setdefault("gpu", None) + # combine cpu and gpu speeds with the base model speeds + if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: + speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) + final_meta["speed"]["cpu"] = speed + if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: + speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) + final_meta["speed"]["gpu"] = speed + # if there were no speeds to update, overwrite with meta + if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None: + final_meta["speed"].update(meta["speed"]) + # note: beam speeds are not combined with the base model + if has_beam_widths: + final_meta.setdefault("beam_accuracy", {}) + final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) + final_meta.setdefault("beam_speed", {}) + final_meta["beam_speed"].update(meta.get("beam_speed", {})) + srsly.write_json(meta_loc, final_meta) 
msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(final_meta, output_path, best_pipes) @@ -649,11 +672,11 @@ def _get_metrics(component): if component == "parser": return ("las", "uas", "las_per_type", "token_acc") elif component == "tagger": - return ("tags_acc",) + return ("tags_acc", "token_acc") elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type") + return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") elif component == "textcat": - return ("textcat_score",) + return ("textcat_score", "token_acc") return ("token_acc",) @@ -709,3 +732,12 @@ def _get_progress( if beam_width is not None: result.insert(1, beam_width) return result + + +def _get_total_speed(speeds): + seconds_per_word = 0.0 + for words_per_second in speeds: + if words_per_second is None: + return None + seconds_per_word += 1.0 / words_per_second + return 1.0 / seconds_per_word From 9be90dbca3a75ebbaa85ec14dd02fe3ab87291be Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 3 Mar 2020 21:44:51 +0100 Subject: [PATCH 29/41] Improve token head verification (#5079) * Improve token head verification Improve the verification for valid token heads when heads are set: * in `Token.head`: heads come from the same document * in `Doc.from_array()`: head indices are within the bounds of the document * Improve error message --- spacy/errors.py | 7 +++++++ spacy/tests/doc/test_array.py | 27 +++++++++++++++++++++++++++ spacy/tests/doc/test_token_api.py | 5 +++++ spacy/tokens/doc.pyx | 10 +++++++++- spacy/tokens/token.pyx | 3 +++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5957c5ecd..b43b8487f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -545,6 +545,13 @@ class Errors(object): "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") E189 = ("Each argument to `get_doc` should be of equal length.") + E190 = ("Token head out of range in `Doc.from_array()` for token index " + "'{index}' with value '{value}' (equivalent to relative head " + "index: '{rel_head_index}'). 
The head indices should be relative " + "to the current token index rather than absolute indices in the " + "array.") + E191 = ("Invalid head: the head token must be from the same doc as the " + "token itself.") @add_codes diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index aa0d37eca..1c0c79f6e 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -77,3 +77,30 @@ def test_doc_array_idx(en_vocab): assert offsets[0] == 0 assert offsets[1] == 3 assert offsets[2] == 11 + + +def test_doc_from_array_heads_in_bounds(en_vocab): + """Test that Doc.from_array doesn't set heads that are out of bounds.""" + words = ["This", "is", "a", "sentence", "."] + doc = Doc(en_vocab, words=words) + for token in doc: + token.head = doc[0] + + # correct + arr = doc.to_array(["HEAD"]) + doc_from_array = Doc(en_vocab, words=words) + doc_from_array.from_array(["HEAD"], arr) + + # head before start + arr = doc.to_array(["HEAD"]) + arr[0] = -1 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) + + # head after end + arr = doc.to_array(["HEAD"]) + arr[0] = 5 + doc_from_array = Doc(en_vocab, words=words) + with pytest.raises(ValueError): + doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index b7522bb98..8c749b26d 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -167,6 +167,11 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 + # head token must be from the same document + doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + with pytest.raises(ValueError): + doc[0].head = doc2[0] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. This is another.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5997be804..0c90929c3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -790,7 +790,7 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col + cdef int i, col, abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) @@ -804,6 +804,14 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + # Check that all heads are within the document bounds + if HEAD in attrs: + col = attrs.index(HEAD) + for i in range(length): + # cast index to signed int + abs_head_index = numpy.int32(array[i, col]) + i + if abs_head_index < 0 or abs_head_index >= length: + raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..8019e3b4f 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -623,6 +623,9 @@ cdef class Token: # This function sets the head of self to new_head and updates the # counters for left/right dependents and left/right corner for the # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return From 03376c9d9bea0dd850bd2612521843f6c8f580ba Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 11:58:56 +0500 Subject: [PATCH 30/41] Basque language added and tested. --- spacy/lang/eu/__init__.py | 30 +++++++++ spacy/lang/eu/examples.py | 16 +++++ spacy/lang/eu/lex_attrs.py | 80 +++++++++++++++++++++++ spacy/lang/eu/punctuation.py | 7 ++ spacy/lang/eu/stop_words.py | 108 +++++++++++++++++++++++++++++++ spacy/lang/eu/tag_map.py | 71 ++++++++++++++++++++ spacy/tests/conftest.py | 5 ++ spacy/tests/lang/eu/test_text.py | 16 +++++ 8 files changed, 333 insertions(+) create mode 100644 spacy/lang/eu/__init__.py create mode 100644 spacy/lang/eu/examples.py create mode 100644 spacy/lang/eu/lex_attrs.py create mode 100644 spacy/lang/eu/punctuation.py create mode 100644 spacy/lang/eu/stop_words.py create mode 100644 spacy/lang/eu/tag_map.py create mode 100644 spacy/tests/lang/eu/test_text.py diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py new file mode 100644 index 000000000..4f3338c1d --- /dev/null +++ b/spacy/lang/eu/__init__.py @@ -0,0 +1,30 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_SUFFIXES +from .tag_map import TAG_MAP + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG + + +class BasqueDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "eu" + + tokenizer_exceptions = BASE_EXCEPTIONS + tag_map = TAG_MAP + stop_words = STOP_WORDS + suffixes = TOKENIZER_SUFFIXES + + +class Basque(Language): + lang = "eu" + Defaults = BasqueDefaults + + +__all__ = ["Basque"] diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py new file mode 100644 index 000000000..ec9f0dd06 --- /dev/null +++ b/spacy/lang/eu/examples.py @@ -0,0 +1,16 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
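
A minimal usage sketch for the Basque class added above (a sketch only; the
example sentence and its token count mirror spacy/tests/lang/eu/test_text.py):

>>> import spacy
>>> nlp = spacy.blank("eu")
>>> [t.text for t in nlp("ta nere guitarra estrenatu ondoren")]
['ta', 'nere', 'guitarra', 'estrenatu', 'ondoren']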
+ +>>> from spacy.lang.eu.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "", + "" +] diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py new file mode 100644 index 000000000..c11e913db --- /dev/null +++ b/spacy/lang/eu/lex_attrs.py @@ -0,0 +1,80 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +# Source http://mylanguages.org/basque_numbers.php + + +_num_words = """ +bat +bi +hiru +lau +bost +sei +zazpi +zortzi +bederatzi +hamar +hamaika +hamabi +hamahiru +hamalau +hamabost +hamasei +hamazazpi +Hemezortzi +hemeretzi +hogei +ehun +mila +milioi +""".split() + +# source https://www.google.com/intl/ur/inputtools/try/ + +_ordinal_words = """ +lehen +bigarren +hirugarren +laugarren +bosgarren +seigarren +zazpigarren +zortzigarren +bederatzigarren +hamargarren +hamaikagarren +hamabigarren +hamahirugarren +hamalaugarren +hamabosgarren +hamaseigarren +hamazazpigarren +hamazortzigarren +hemeretzigarren +hogeigarren +behin +""".split() + + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + if text in _ordinal_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py new file mode 100644 index 000000000..b8b1a1c83 --- /dev/null +++ b/spacy/lang/eu/punctuation.py @@ -0,0 +1,7 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_SUFFIXES + + +_suffixes = TOKENIZER_SUFFIXES diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py new file mode 100644 index 000000000..208238961 --- /dev/null +++ b/spacy/lang/eu/stop_words.py @@ -0,0 +1,108 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Source: https://github.com/stopwords-iso/stopwords-eu +# https://www.ranks.nl/stopwords/basque +# https://www.mustgo.com/worldlanguages/basque/ +STOP_WORDS = set( +""" +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten +""".split() +) diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py new file mode 100644 index 000000000..2499d7e3e --- /dev/null +++ b/spacy/lang/eu/tag_map.py @@ -0,0 +1,71 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + +TAG_MAP = { + ".": {POS: PUNCT, "PunctType": "peri"}, + ",": {POS: PUNCT, "PunctType": "comm"}, + "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, + "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, + "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": 
"ini"}, + '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + ":": {POS: PUNCT}, + "$": {POS: SYM, "Other": {"SymType": "currency"}}, + "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "AFX": {POS: ADJ, "Hyph": "yes"}, + "CC": {POS: CCONJ, "ConjType": "coor"}, + "CD": {POS: NUM, "NumType": "card"}, + "DT": {POS: DET}, + "EX": {POS: ADV, "AdvType": "ex"}, + "FW": {POS: X, "Foreign": "yes"}, + "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "IN": {POS: ADP}, + "JJ": {POS: ADJ, "Degree": "pos"}, + "JJR": {POS: ADJ, "Degree": "comp"}, + "JJS": {POS: ADJ, "Degree": "sup"}, + "LS": {POS: PUNCT, "NumType": "ord"}, + "MD": {POS: VERB, "VerbType": "mod"}, + "NIL": {POS: ""}, + "NN": {POS: NOUN, "Number": "sing"}, + "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, + "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, + "NNS": {POS: NOUN, "Number": "plur"}, + "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "POS": {POS: PART, "Poss": "yes"}, + "PRP": {POS: PRON, "PronType": "prs"}, + "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "RB": {POS: ADV, "Degree": "pos"}, + "RBR": {POS: ADV, "Degree": "comp"}, + "RBS": {POS: ADV, "Degree": "sup"}, + "RP": {POS: PART}, + "SP": {POS: SPACE}, + "SYM": {POS: SYM}, + "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "UH": {POS: INTJ}, + "VB": {POS: VERB, "VerbForm": "inf"}, + "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, + "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, + "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, + "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, + "VBZ": { + POS: VERB, + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": 3, + }, + "WDT": {POS: ADJ, "PronType": "int|rel"}, + "WP": {POS: NOUN, "PronType": "int|rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WRB": {POS: ADV, "PronType": "int|rel"}, + "ADD": {POS: X}, + "NFP": {POS: PUNCT}, + "GW": {POS: X}, + "XX": {POS: X}, + "BES": {POS: VERB}, + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..fc89c2658 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -83,6 +83,11 @@ def es_tokenizer(): return get_lang_class("es").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def eu_tokenizer(): + return get_lang_class("eu").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def fi_tokenizer(): return get_lang_class("fi").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py new file mode 100644 index 000000000..e73917ffa --- /dev/null +++ b/spacy/tests/lang/eu/test_text.py @@ -0,0 +1,16 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_eu_tokenizer_handles_long_text(eu_tokenizer): + text = """ta nere guitarra estrenatu ondoren""" + tokens = eu_tokenizer(text) + assert len(tokens) == 5 + + +@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)]) +def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): + tokens = eu_tokenizer(text) + assert len(tokens) == length From 224a7f8e94721a7af10e366773ce2c012a5b8f62 Mon Sep 17 00:00:00 2001 From: Muhammad Irfan Date: Wed, 4 Mar 2020 15:49:06 +0500 Subject: [PATCH 31/41] examples --- spacy/lang/eu/examples.py | 6 ++---- 1 file 
changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py index ec9f0dd06..f2d325d78 100644 --- a/spacy/lang/eu/examples.py +++ b/spacy/lang/eu/examples.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. @@ -9,8 +8,7 @@ Example sentences to test spaCy and its language models. >>> docs = nlp.pipe(sentences) """ - sentences = [ - "", - "" + "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", + "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira" ] From 4d655b1d45577ceeb0113616f6cc7590568e5a2b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 4 Mar 2020 13:50:37 +0100 Subject: [PATCH 32/41] Require srsly >=1.0.2 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e908e25f8..ec30efc16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ thinc==7.4.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.4.0,<1.1.0 -srsly>=1.0.1,<1.1.0 +srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 1429c77ac..e44e32bb2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = thinc==7.4.0 blis>=0.4.0,<0.5.0 wasabi>=0.4.0,<1.1.0 - srsly>=1.0.1,<1.1.0 + srsly>=1.0.2,<1.1.0 catalogue>=0.0.7,<1.1.0 # Third-party dependencies tqdm>=4.38.0,<5.0.0 From 3440a72ecb188850bf4b08244c2041ac0d8109a7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Mar 2020 19:28:16 +0100 Subject: [PATCH 33/41] Update Makefile (#5099) --- Makefile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 1be1c9794..13c9026b7 100644 --- a/Makefile +++ b/Makefile @@ -1,36 +1,37 @@ SHELL := /bin/bash -WHEELHOUSE := ./wheelhouse PYVER := 3.6 VENV := ./env$(PYVER) -version = $(shell "bin/get-version.sh") +version := $(shell "bin/get-version.sh") -dist/spacy-$(version).pex : wheelhouse/spacy-$(version)-*.whl - pex -f ./wheelhouse --no-index --disable-cache -m spacy -o dist/spacy-$(version).pex spacy==$(version) jsonschema - chmod a+rx dist/spacy-$(version).pex +dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema + chmod a+rx $@ dist/pytest.pex : wheelhouse/pytest-*.whl - $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o dist/pytest.pex pytest pytest-timeout mock - chmod a+rx dist/pytest.pex + $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock + chmod a+rx $@ -wheelhouse/spacy-$(version)-%.whl : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* +wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* $(VENV)/bin/pip wheel . -w ./wheelhouse $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse + touch $@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex $(VENV)/bin/pip wheel pytest pytest-timeout mock -w ./wheelhouse -$(VENV) : +$(VENV)/bin/pex : python$(PYVER) -m venv $(VENV) $(VENV)/bin/python -m pip install pex wheel .PHONY : clean test test : dist/spacy-$(version).pex dist/pytest.pex - PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x + ( . 
$(VENV)/bin/activate ; \ + PEX_PATH=dist/spacy-$(version).pex ./dist/pytest.pex --pyargs spacy -x ; ) clean : setup.py - source env3.6/bin/activate rm -rf dist/* rm -rf ./wheelhouse + rm -rf $(VENV) python setup.py clean --all From 80004930ed098ec5b6bf9ecd081b96b1e7e7080f Mon Sep 17 00:00:00 2001 From: David Pollack Date: Thu, 5 Mar 2020 15:48:41 +0100 Subject: [PATCH 34/41] fix typo in svg file --- .github/contributors/dhpollack.md | 106 +++++++++++++++++++++++++++ website/src/images/logos/allenai.svg | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/dhpollack.md diff --git a/.github/contributors/dhpollack.md b/.github/contributors/dhpollack.md new file mode 100644 index 000000000..444d97d42 --- /dev/null +++ b/.github/contributors/dhpollack.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | David Pollack | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | Mar 5. 
2020 | +| GitHub username | dhpollack | +| Website (optional) | | diff --git a/website/src/images/logos/allenai.svg b/website/src/images/logos/allenai.svg index 2879bef60..c00569bf8 100644 --- a/website/src/images/logos/allenai.svg +++ b/website/src/images/logos/allenai.svg @@ -1,6 +1,6 @@ - + From 1a2b8fc264efdc384c5497b97ee4b1f55675a3ec Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 6 Mar 2020 14:45:28 +0100 Subject: [PATCH 35/41] set vector of merged entity (#5085) * merge_entities sets the vector in the vocab for the merged token * add unit test * import unicode_literals * move code to _merge function * only set vector if vocab has non-zero vectors --- spacy/tests/regression/test_issue5082.py | 46 ++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 4 +++ 2 files changed, 50 insertions(+) create mode 100644 spacy/tests/regression/test_issue5082.py diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py new file mode 100644 index 000000000..efa5d39f2 --- /dev/null +++ b/spacy/tests/regression/test_issue5082.py @@ -0,0 +1,46 @@ +# coding: utf8 +from __future__ import unicode_literals + +import numpy as np +from spacy.lang.en import English +from spacy.pipeline import EntityRuler + + +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32) + array2 = np.asarray([-0.2, -0.6, -0.9], dtype=np.float32) + array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32) + array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32) + array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32) + + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + + text = "I like David Bowie" + ruler = EntityRuler(nlp) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + np.testing.assert_array_equal(parsed_vectors_1[0], array1) + np.testing.assert_array_equal(parsed_vectors_1[1], array2) + np.testing.assert_array_equal(parsed_vectors_1[2], array3) + np.testing.assert_array_equal(parsed_vectors_1[3], array4) + + merge_ents = nlp.create_pipe("merge_entities") + nlp.add_pipe(merge_ents) + + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + np.testing.assert_array_equal(parsed_vectors_2[0], array1) + np.testing.assert_array_equal(parsed_vectors_2[1], array2) + np.testing.assert_array_equal(parsed_vectors_2[2], array34) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a5d06491a..512ad73bc 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -213,6 +213,10 @@ def _merge(Doc doc, merges): new_orth = ''.join([t.text_with_ws for t in spans[token_index]]) if spans[token_index][-1].whitespace_: new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)] + # add the vector of the (merged) entity to the vocab + if not doc.vocab.get_vector(new_orth).any(): + if doc.vocab.vectors_length > 0: + doc.vocab.set_vector(new_orth, span.vector) token = tokens[token_index] lex = doc.vocab.get(doc.mem, new_orth) token.lex = lex From 993758c58fba9d4611223f5dd6dcdb203cf67bba Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sun, 8 Mar 2020 13:22:25 +0100 Subject: [PATCH 
36/41] Remove unnecessary iterator in Language.pipe (#5101)

Remove iterator over `raw_texts` with `itertools.tee()` in `Language.pipe`
that is never consumed and consumes memory unnecessarily.
---
 spacy/language.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 28fddfebb..f0928b1f9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -757,8 +757,6 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#pipe
         """
-        # raw_texts will be used later to stop iterator.
-        texts, raw_texts = itertools.tee(texts)
         if is_python2 and n_process != 1:
             user_warning(Warnings.W023)
             n_process = 1

From 9dd98a4b2759f5231fcc3b2a09d16f27b79ab13b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sun, 8 Mar 2020 13:24:19 +0100
Subject: [PATCH 37/41] Improve Makefile (#5105)

* Explicitly upgrade pip

* Include spacy-lookups-data in pex
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 13c9026b7..cf96d6294 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")
 
 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
 	chmod a+rx $@
 
 dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -22,7 +22,7 @@ wheelhouse/pytest-%.whl : $(VENV)/bin/pex
 
 $(VENV)/bin/pex :
 	python$(PYVER) -m venv $(VENV)
-	$(VENV)/bin/python -m pip install pex wheel
+	$(VENV)/bin/pip install -U pip setuptools pex wheel
 
 .PHONY : clean test

From 31755630a7b33bc9c621c1e82cc0c09da84720d4 Mon Sep 17 00:00:00 2001
From: Yohei Tamura
Date: Sun, 8 Mar 2020 21:24:38 +0900
Subject: [PATCH 38/41] fix typo (#5106)

---
 bin/wiki_entity_linking/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
index 56d0c1415..4e4af5c21 100644
--- a/bin/wiki_entity_linking/README.md
+++ b/bin/wiki_entity_linking/README.md
@@ -2,7 +2,7 @@
 
 ### Step 1: Create a Knowledge Base (KB) and training data
 
-Run `wikipedia_pretrain_kb.py`
+Run `wikidata_pretrain_kb.py`
 * This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
 * WikiData: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
 * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)

From 0345135167c882575e006bf434c9f8d8d81f9e12 Mon Sep 17 00:00:00 2001
From: Mark Abraham
Date: Sun, 8 Mar 2020 13:25:56 +0100
Subject: [PATCH 39/41] Tokenizer to_disk and from_disk now ensure paths (#5116)

* Tokenizer to_disk and from_disk now ensure strings are converted to paths

Fixes #5115

* Sign contributor agreement
---
 .github/contributors/mabraham.md | 89 ++++++++++++++++++++++++++++
 spacy/tokenizer.pyx | 2 +
 2 files changed, 91 insertions(+)
 create mode 100644 .github/contributors/mabraham.md

diff --git a/.github/contributors/mabraham.md b/.github/contributors/mabraham.md
new file mode 100644
index 000000000..c91c950a3
--- /dev/null
+++ b/.github/contributors/mabraham.md
@@ -0,0 +1,89 @@
+
+
+## Contributor Agreement
+
+1.
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | | +| GitHub username | | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 12c7b73af..4da081259 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -508,6 +508,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#to_disk """ + path = util.ensure_path(path) with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) @@ -521,6 +522,7 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#from_disk """ + path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() self.from_bytes(bytes_data, **kwargs) From 5f680042647ef7d0c71a5041f33558bf81e656d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:05:00 +0100 Subject: [PATCH 40/41] Port over gitignore changes from develop Prevents stale files when switching branches --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 828258603..edcbba4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ corpora/ keys/ *.json.gz +# Tests +spacy/tests/package/setup.cfg +spacy/tests/package/pyproject.toml +spacy/tests/package/requirements.txt + # Website website/.cache/ website/public/ From 1d6aec805d5c03ad8a039466e98ed3a619e650c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Mar 2020 11:17:20 +0100 Subject: [PATCH 41/41] Fix formatting and update docs for v2.2.4 --- spacy/cli/debug_data.py | 25 ++++++++++++++++--------- website/docs/api/cli.md | 30 ++++++++++++++++++++---------- website/docs/api/doc.md | 22 ++++++++++++---------- website/docs/api/span.md | 30 ++++++++++++++++++++++++++---- website/docs/api/top-level.md | 32 ++++++++++++++++---------------- website/meta/languages.json | 2 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 0e12a594c..c5e1ff6cf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -23,20 +23,17 @@ BLANK_MODEL_THRESHOLD = 2000 @plac.annotations( + # fmt: off lang=("model language", "positional", None, str), train_path=("location of JSON-formatted training data", "positional", None, Path), dev_path=("location of JSON-formatted development data", "positional", None, Path), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), base_model=("name of model to update (optional)", "option", "b", str), - pipeline=( - "Comma-separated names of pipeline components to train", - "option", - "p", - str, - ), + pipeline=("Comma-separated names of pipeline components to train", "option", "p", str), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), verbose=("Print additional information and explanations", "flag", "V", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool), + # fmt: on ) def debug_data( lang, @@ -235,13 +232,17 @@ def debug_data( if gold_train_data["ws_ents"]: msg.fail( - "{} invalid whitespace entity 
span(s)".format(gold_train_data["ws_ents"]) + "{} invalid whitespace entity span(s)".format( + gold_train_data["ws_ents"] + ) ) has_ws_ents_error = True if gold_train_data["punct_ents"]: msg.warn( - "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"]) + "{} entity span(s) with punctuation".format( + gold_train_data["punct_ents"] + ) ) has_punct_ents_warning = True @@ -592,7 +593,13 @@ def _compile_gold(train_docs, pipeline): if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 - if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]: + if label.startswith(("B-", "U-", "L-")) and doc[i].text in [ + ".", + "'", + "!", + "?", + ",", + ]: # punctuation entity: could be replaced by whitespace when training with noise, # so add a warning to alert the user to this unexpected side effect. data["punct_ents"] += 1 diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 2f7346491..e47695efb 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -184,16 +184,17 @@ low data labels and more. $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | +| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | +| `--tag-map-path`, `-tm` 2.2.3 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -368,6 +369,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. 
| | `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | | `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--replace-components`, `-R` | flag | Replace components from the base model. | | `--vectors`, `-v` | option | Model to load vectors from. | | `--n-iter`, `-n` | option | Number of iterations (default: `30`). | | `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | @@ -378,6 +380,13 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | | `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | | `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | +| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | +| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | +| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | +| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | +| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | +| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | +| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. | @@ -385,6 +394,7 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). | | `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | | `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | | `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 4f948e425..87b854a8c 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -7,9 +7,10 @@ source: spacy/tokens/doc.pyx A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to -compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs. -The Python-level `Token` and [`Span`](/api/span) objects are views of this -array, i.e. they don't own the data themselves. +compressed binary strings. The `Doc` object holds an array of +[`TokenC`](/api/cython-structs#tokenc) structs. 
The Python-level `Token` and +[`Span`](/api/span) objects are views of this array, i.e. they don't own the +data themselves. ## Doc.\_\_init\_\_ {#init tag="method"} @@ -197,13 +198,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the Span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Type | Description | +| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 64b77b89d..3833bbca9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -172,6 +172,28 @@ Remove a previously registered extension. | `name` | unicode | Name of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +## Span.char_span {#char_span tag="method" new="2.2.4"} + +Create a `Span` object from the slice `span.text[start:end]`. Returns `None` if +the character indices don't map to a valid span. + +> #### Example +> +> ```python +> doc = nlp("I like New York") +> span = doc[1:4].char_span(5, 13, label="GPE") +> assert span.text == "New York" +> ``` + +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `start` | int | The index of the first character of the span. | +| `end` | int | The index of the last character after the span. | +| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object or `None`. | + ## Span.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -293,10 +315,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ----------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. 
| +| Name | Type | Description | +| ---------------- | ----- | ---------------------------------------------------- | +| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | +| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | ## Span.root {#root tag="property" model="parser"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 266df87f0..217c51794 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` | bool | Print the lemma's in a separate row below the token texts in the `dep` visualisation. | `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | unicode | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Type | Description | Default | +| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | +| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` | +| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | +| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | +| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | +| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | +| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | +| `font` | unicode | Font name or font family for all text. | `'Arial'` | +| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | +| `arrow_stroke` | int | Width of arrow path in px. | `2` | +| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | +| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. 
| `20` / `12` (compact) | +| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | +| `distance` | int | Distance between words in px. | `175` / `150` (compact) | #### Named Entity Visualizer options {#displacy_options-ent} diff --git a/website/meta/languages.json b/website/meta/languages.json index c22ddad69..8834aaddc 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -95,6 +95,8 @@ "has_examples": true }, { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
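
Taken together, the v2.2.4 docs changes above describe two small user-facing
additions. A minimal usage sketch (a sketch only; it assumes an installed
`en_core_web_sm` model, and any pipeline with a parser works for the displacy
part):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like New York")

# Span.char_span (new in v2.2.4): character offsets are relative to the
# span's own text, mirroring Doc.char_span ("like New York"[5:13]).
span = doc[1:4].char_span(5, 13, label="GPE")
assert span.text == "New York"

# displacy's add_lemma option (new in v2.2.4) prints the lemmas in a
# separate row below the token texts in the dependency visualizer.
html = displacy.render(doc, style="dep", options={"add_lemma": True})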