From a30bc774156de38a6450fe67aa99e3949e59109e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 1 Jul 2020 21:00:47 +0200
Subject: [PATCH 01/11] fix prune_vectors and vectors_loc handling

---
 spacy/cli/init_model.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 37f862ef2..d0d876aed 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -37,7 +37,7 @@ def init_model_cli(
     clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
     jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
     vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
+    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
     truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
@@ -56,6 +56,7 @@ def init_model_cli(
         freqs_loc=freqs_loc,
         clusters_loc=clusters_loc,
         jsonl_loc=jsonl_loc,
+        vectors_loc=vectors_loc,
         prune_vectors=prune_vectors,
         truncate_vectors=truncate_vectors,
         vectors_name=vectors_name,
@@ -228,7 +229,7 @@ def add_vectors(
     else:
         if vectors_loc:
             with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(msg, vectors_loc)
+                vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
             msg.good(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
@@ -247,7 +248,7 @@ def add_vectors(
         nlp.vocab.prune_vectors(prune_vectors)
 
 
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
+def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
     if truncate_vectors >= 1:

From 60c2695131f09ecc4f21aada71a3ccc54cd3a979 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 1 Jul 2020 22:33:39 +0200
Subject: [PATCH 02/11] Remove deprecated methods

---
 spacy/tokens/doc.pyx  | 44 -------------------------------------------
 spacy/tokens/span.pyx | 12 ------------
 2 files changed, 56 deletions(-)
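
Note for downstream users: the removed `Doc.merge` and `Span.merge` simply wrapped the
retokenizer, so existing calls can be ported to the `Doc.retokenize` context manager.
A minimal migration sketch (the pipeline and attrs below are illustrative, not part of
this patch):

```python
import spacy

nlp = spacy.blank("en")  # any pipeline works; a blank one keeps the sketch small
doc = nlp("New York is busy")

# Instead of the removed doc.merge(start_idx, end_idx) / span.merge(),
# merge the span through the retokenizer, as the deleted methods did internally.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"LEMMA": "New York"})

assert [t.text for t in doc] == ["New York", "is", "busy"]
```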

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 28590e91e..8fe922af9 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1123,50 +1123,6 @@ cdef class Doc:
                 remove_label_if_necessary(attributes[i])
                 retokenizer.merge(span, attributes[i])
 
-    def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at
-        `doc.text[start_idx : end_idx]` is merged into a single token. If
-        `start_idx` and `end_idx `do not mark start and end token boundaries,
-        the document remains unchanged.
-
-        start_idx (int): Character index of the start of the slice to merge.
-        end_idx (int): Character index after the end of the slice to merge.
-        **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root of the span.
-        RETURNS (Token): The newly merged token, or `None` if the start and end
-            indices did not fall at token boundaries.
-        """
-        cdef unicode tag, lemma, ent_type
-        warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
-        # TODO: ENT_KB_ID ?
-        if len(args) == 3:
-            warnings.warn(Warnings.W003, DeprecationWarning)
-            tag, lemma, ent_type = args
-            attributes[TAG] = tag
-            attributes[LEMMA] = lemma
-            attributes[ENT_TYPE] = ent_type
-        elif not args:
-            fix_attributes(self, attributes)
-        elif args:
-            raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
-                                                kwargs=repr(attributes)))
-        remove_label_if_necessary(attributes)
-        attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
-        cdef int start = token_by_start(self.c, self.length, start_idx)
-        if start == -1:
-            return None
-        cdef int end = token_by_end(self.c, self.length, end_idx)
-        if end == -1:
-            return None
-        # Currently we have the token index, we want the range-end index
-        end += 1
-        with self.retokenize() as retokenizer:
-            retokenizer.merge(self[start:end], attrs=attributes)
-        return self[start]
-
-    def print_tree(self, light=False, flat=False):
-        raise ValueError(Errors.E105)
-
     def to_json(self, underscore=None):
         """Convert a Doc to JSON. The format it produces will be the new format
         for the `spacy train` command (not implemented yet).
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index b8f79f8a6..902d46f5a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -280,18 +280,6 @@ cdef class Span:
 
         return array
 
-    def merge(self, *args, **attributes):
-        """Retokenize the document, such that the span is merged into a single
-        token.
-
-        **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root token of the span.
-        RETURNS (Token): The newly merged token.
-        """
-        warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
-        return self.doc.merge(self.start_char, self.end_char, *args,
-                              **attributes)
-
     def get_lca_matrix(self):
         """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
         `Span`, where LCA[i, j] is the index of the lowest common ancestor among

From f503817623627b0e7a8e7bdacdcda412fc9318c0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 2 Jul 2020 13:48:11 +0200
Subject: [PATCH 03/11] fix parsing entity links in new gold format

---
 spacy/errors.py        |  2 --
 spacy/gold/example.pyx | 16 +++-------------
 2 files changed, 3 insertions(+), 15 deletions(-)
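
For reference, the 'links' annotation in the new gold format maps character offsets to a
dict of KB identifiers, with exactly one identifier set to 1.0; after this change it no
longer needs a parallel 'entities' entry. A minimal sketch (KB IDs are illustrative):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example

words = ["I", "like", "New", "York", "and", "Berlin", "."]
predicted = Doc(Vocab(), words=words)

# "New York" covers characters 7-15; one candidate KB ID gets 1.0, the rest 0.0
annots = {
    "words": words,
    "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}},
}
example = Example.from_dict(predicted, annots)
```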

diff --git a/spacy/errors.py b/spacy/errors.py
index 66a3c61da..6e7ec49ae 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -556,8 +556,6 @@ class Errors(object):
     E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
-    E981 = ("The offsets of the annotations for 'links' need to refer exactly "
-            "to the offsets of the 'entities' annotations.")
     E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 5e36156a9..841b233c4 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             if key == "entities":
                 pass
             elif key == "links":
-                entities = doc_annot.get("entities", {})
-                if not entities:
-                    raise ValueError(Errors.E981)
-                ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
+                ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
                 tok_annot["ENT_KB_ID"] = ent_kb_ids
             elif key == "cats":
                 pass
@@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
                 ent_types.append("")
     return ent_iobs, ent_types
 
-def _parse_links(vocab, words, links, entities):
-    reference = Doc(vocab, words=words)
+def _parse_links(vocab, words, spaces, links):
+    reference = Doc(vocab, words=words, spaces=spaces)
     starts = {token.idx: token.i for token in reference}
     ends = {token.idx + len(token): token.i for token in reference}
     ent_kb_ids = ["" for _ in reference]
-    entity_map = [(ent[0], ent[1]) for ent in entities]
-
-    # links annotations need to refer 1-1 to entity annotations - throw error otherwise
-    for index, annot_dict in links.items():
-        start_char, end_char = index
-        if (start_char, end_char) not in entity_map:
-            raise ValueError(Errors.E981)
 
     for index, annot_dict in links.items():
         true_kb_ids = []

From 04ed4d60a84b9fdcd87476e3f8db5a7d4b7a8889 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 2 Jul 2020 13:57:35 +0200
Subject: [PATCH 04/11] raise error when links are not aligned to tokens

---
 spacy/errors.py                 | 2 ++
 spacy/gold/example.pyx          | 2 ++
 spacy/tests/test_new_example.py | 3 +--
 3 files changed, 5 insertions(+), 2 deletions(-)
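
A sketch of the new failure mode, mirroring the updated test (offsets and KB IDs as in
the test data): link offsets that do not land on token boundaries now raise E981 as a
ValueError.

```python
import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example

words = ["I", "like", "New", "York", "and", "Berlin", "."]
predicted = Doc(Vocab(), words=words)

# (7, 14) ends inside the token "York" (which ends at character 15),
# so the annotation cannot be aligned to token boundaries.
annots = {"words": words, "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}}}
with pytest.raises(ValueError):
    Example.from_dict(predicted, annots)
```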

diff --git a/spacy/errors.py b/spacy/errors.py
index 6e7ec49ae..61ff5a037 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -556,6 +556,8 @@ class Errors(object):
     E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
+    E981 = ("The offsets of the annotations for 'links' could not be aligned "
+            "to token boundaries.")
     E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 841b233c4..2ecee1821 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -396,6 +396,8 @@ def _parse_links(vocab, words, spaces, links):
             start_char, end_char = index
             start_token = starts.get(start_char)
             end_token = ends.get(end_char)
+            if start_token is None or end_token is None:
+                raise ValueError(Errors.E981)
             for i in range(start_token, end_token+1):
                 ent_kb_ids[i] = true_kb_ids[0]
 
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py
index b89654554..58eab4a54 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/test_new_example.py
@@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
     [
         {
             "words": ["I", "like", "New", "York", "and", "Berlin", "."],
-            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
-            "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
+            "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
         }
     ],
 )

From b5268955d7c35dfa86f8d9ae23caf42569c6e098 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 2 Jul 2020 15:39:45 +0200
Subject: [PATCH 05/11] Update matcher usage examples [ci skip]

---
 website/docs/api/matcher.md                | 68 +++++++++-------------
 website/docs/api/phrasematcher.md          | 34 +++++------
 website/docs/usage/processing-pipelines.md |  2 +-
 website/docs/usage/rule-based-matching.md  | 47 ++++++++-------
 website/docs/usage/spacy-101.md            |  8 +--
 5 files changed, 70 insertions(+), 89 deletions(-)
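
For quick reference, the v3 calling convention these examples now use, in one runnable
sketch (the pipeline choice is illustrative): `Matcher.add` takes a list of patterns as
its second argument and the callback moves to the `on_match` keyword.

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    print("Matched!", matches[i])

# v3 API: list of patterns as the second argument, callback as a keyword argument
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
]
matcher.add("HelloWorld", patterns, on_match=on_match)

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
```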

diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 8210f7094..636354496 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
 
 ## Matcher.\_\_call\_\_ {#call tag="method"}
 
-Find all token sequences matching the supplied patterns on the `Doc`. As of
-spaCy v2.3, the `Matcher` can also be called on `Span` objects.
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 
 > #### Example
 >
@@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
 >
 > matcher = Matcher(nlp.vocab)
 > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
-> matcher.add("HelloWorld", None, pattern)
+> matcher.add("HelloWorld", [pattern])
 > doc = nlp("hello world!")
 > matches = matcher(doc)
 > ```
 
 | Name        | Type         | Description                                                                                                                                                              |
 | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `doclike`   | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).                                                                                                                     |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over.                                                                                                                                       |
 | **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
 
-<Infobox title="Important note" variant="warning">
-
-By default, the matcher **does not perform any action** on matches, like tagging
-matched phrases with entity types. Instead, actions need to be specified when
-**adding patterns or entities**, by passing in a callback function as the
-`on_match` argument on [`add`](/api/matcher#add). This allows you to define
-custom actions per pattern within the same matcher. For example, you might only
-want to merge some entity types, and set custom flags for other matched
-patterns. For more details and examples, see the usage guide on
-[rule-based matching](/usage/rule-based-matching).
-
-</Infobox>
-
 ## Matcher.pipe {#pipe tag="method"}
 
 Match a stream of documents, yielding them in turn.
@@ -92,7 +78,7 @@ patterns.
 > ```python
 > matcher = Matcher(nlp.vocab)
 > assert len(matcher) == 0
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert len(matcher) == 1
 > ```
 
@@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
 >
 > ```python
 > matcher = Matcher(nlp.vocab)
-> assert 'Rule' not in matcher
-> matcher.add('Rule', None, [{'ORTH': 'test'}])
-> assert 'Rule' in matcher
+> assert "Rule" not in matcher
+> matcher.add("Rule", [[{'ORTH': 'test'}]])
+> assert "Rule" in matcher
 > ```
 
 | Name        | Type | Description                                           |
@@ -129,39 +115,39 @@ overwritten.
 > #### Example
 >
 > ```python
->   def on_match(matcher, doc, id, matches):
->       print('Matched!', matches)
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
 >
->   matcher = Matcher(nlp.vocab)
->   matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
->   matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
->   doc = nlp("HELLO WORLD on Google Maps.")
->   matches = matcher(doc)
+> matcher = Matcher(nlp.vocab)
+> patterns = [
+>    [{"LOWER": "hello"}, {"LOWER": "world"}],
+>    [{"ORTH": "Google"}, {"ORTH": "Maps"}]
+> ]
+> matcher.add("TEST_PATTERNS", patterns)
+> doc = nlp("HELLO WORLD on Google Maps.")
+> matches = matcher(doc)
 > ```
 
-| Name        | Type               | Description                                                                                   |
-| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id`  | str                | An ID for the thing you're matching.                                                          |
-| `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      |
+<Infobox title="Changed in v3.0" variant="warning">
 
-<Infobox title="Changed in v2.2.2" variant="warning">
-
-As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
-the default in the future. The patterns are now the second argument and a list
+As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
 (instead of a variable number of arguments). The `on_match` callback becomes an
 optional keyword argument.
 
 ```diff
 patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-- matcher.add("GoogleNow", None, *patterns)
-+ matcher.add("GoogleNow", patterns)
 - matcher.add("GoogleNow", on_match, *patterns)
 + matcher.add("GoogleNow", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description                                                                                   |
+| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str                | An ID for the thing you're matching.                                                          |
+| `patterns` | list               | The match patterns. Each pattern consists of a list of dicts, where each dict describes a token.      |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## Matcher.remove {#remove tag="method" new="2"}
 
 Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@@ -170,7 +156,7 @@ exist.
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > assert "Rule" in matcher
 > matcher.remove("Rule")
 > assert "Rule" not in matcher
@@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
 > #### Example
 >
 > ```python
-> matcher.add("Rule", None, [{"ORTH": "test"}])
+> matcher.add("Rule", [[{"ORTH": "test"}]])
 > on_match, patterns = matcher.get("Rule")
 > ```
 
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index f02d81de9..9c722297d 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > from spacy.matcher import PhraseMatcher
 >
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
 > matches = matcher(doc)
 > ```
@@ -104,7 +104,7 @@ patterns.
 > ```python
 >   matcher = PhraseMatcher(nlp.vocab)
 >   assert len(matcher) == 0
->   matcher.add("OBAMA", None, nlp("Barack Obama"))
+>   matcher.add("OBAMA", [nlp("Barack Obama")])
 >   assert len(matcher) == 1
 > ```
 
@@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
 > ```python
 >   matcher = PhraseMatcher(nlp.vocab)
 >   assert "OBAMA" not in matcher
->   matcher.add("OBAMA", None, nlp("Barack Obama"))
+>   matcher.add("OBAMA", [nlp("Barack Obama")])
 >   assert "OBAMA" in matcher
 > ```
 
@@ -145,36 +145,32 @@ overwritten.
 >       print('Matched!', matches)
 >
 >   matcher = PhraseMatcher(nlp.vocab)
->   matcher.add("OBAMA", on_match, nlp("Barack Obama"))
->   matcher.add("HEALTH", on_match, nlp("health care reform"),
->                                   nlp("healthcare reform"))
+>   matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
+>   matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
 >   doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
 >   matches = matcher(doc)
 > ```
 
-| Name       | Type               | Description                                                                                   |
-| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | str                | An ID for the thing you're matching.                                                          |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*docs`    | `Doc`              | `Doc` objects of the phrases to match.                                                        |
+<Infobox title="Changed in v3.0" variant="warning">
 
-<Infobox title="Changed in v2.2.2" variant="warning">
-
-As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
-become the default in the future. The `Doc` patterns are now the second argument
-and a list (instead of a variable number of arguments). The `on_match` callback
+As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
+argument (instead of a variable number of arguments). The `on_match` callback
 becomes an optional keyword argument.
 
 ```diff
 patterns = [nlp("health care reform"), nlp("healthcare reform")]
-- matcher.add("HEALTH", None, *patterns)
-+ matcher.add("HEALTH", patterns)
 - matcher.add("HEALTH", on_match, *patterns)
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
 </Infobox>
 
+| Name       | Type               | Description                                                                                   |
+| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str                | An ID for the thing you're matching.                                                          |
+| `docs`     | list               | `Doc` objects of the phrases to match.                                                        |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
 ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
 
 Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@@ -184,7 +180,7 @@ does not exist.
 >
 > ```python
 > matcher = PhraseMatcher(nlp.vocab)
-> matcher.add("OBAMA", None, nlp("Barack Obama"))
+> matcher.add("OBAMA", [nlp("Barack Obama")])
 > assert "OBAMA" in matcher
 > matcher.remove("OBAMA")
 > assert "OBAMA" not in matcher
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 0ead27a49..6b32dc422 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -407,7 +407,7 @@ class EntityMatcher(object):
     def __init__(self, nlp, terms, label):
         patterns = [nlp.make_doc(text) for text in terms]
         self.matcher = PhraseMatcher(nlp.vocab)
-        self.matcher.add(label, None, *patterns)
+        self.matcher.add(label, patterns)
 
     def __call__(self, doc):
         matches = self.matcher(doc)
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 9a8f3da7b..d0ee44e49 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -98,9 +98,7 @@ print([token.text for token in doc])
 
 First, we initialize the `Matcher` with a vocab. The matcher must always share
 the same vocab with the documents it will operate on. We can now call
-[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
-second argument lets you pass in an optional callback function to invoke on a
-successful match. For now, we set it to `None`.
+[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
 
 ```python
 ### {executable="true"}
@@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 # Add match ID "HelloWorld" with no callback and one pattern
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 
 doc = nlp("Hello, world! Hello world!")
 matches = matcher(doc)
@@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
 also match sequences without punctuation between "hello" and "world":
 
 ```python
-matcher.add("HelloWorld", None,
-            [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
-            [{"LOWER": "hello"}, {"LOWER": "world"}])
+patterns = [
+    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
+    [{"LOWER": "hello"}, {"LOWER": "world"}]
+]
+matcher.add("HelloWorld", patterns)
 ```
 
 By default, the matcher will only return the matches and **not do anything
@@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab, validate=True)
 # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
 pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
-matcher.add("HelloWorld", None, pattern)
+matcher.add("HelloWorld", [pattern])
 # 🚨 Raises an error:
 # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
 # Pattern 0:
@@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
     print(entity.text)
 
 pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-matcher.add("GoogleIO", add_event_ent, pattern)
+matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
 doc = nlp("This is a text about Google I/O")
 matches = matcher(doc)
 ```
@@ -509,19 +509,18 @@ import spacy
 from spacy.matcher import Matcher
 from spacy.tokens import Token
 
-# We're using a class because the component needs to be initialised with
+# We're using a class because the component needs to be initialized with
 # the shared vocab via the nlp object
 class BadHTMLMerger(object):
     def __init__(self, nlp):
+        patterns = [
+            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
+            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
+        ]
         # Register a new token extension to flag bad HTML
         Token.set_extension("bad_html", default=False)
         self.matcher = Matcher(nlp.vocab)
-        self.matcher.add(
-            "BAD_HTML",
-            None,
-            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
-            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
-        )
+        self.matcher.add("BAD_HTML", patterns)
 
     def __call__(self, doc):
         # This method is invoked when the component is called on a Doc
@@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
 
 pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
            {"POS": "ADJ"}]
-matcher.add("FacebookIs", collect_sents, pattern)  # add pattern
+matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern
 doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
 matches = matcher(doc)
 
@@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
-matcher.add("PHONE_NUMBER", None, pattern)
+matcher.add("PHONE_NUMBER", [pattern])
 
 doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
 print([t.text for t in doc])
@@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
     elif doc.vocab.strings[match_id] == "SAD":
         doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
 
-matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern
-matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern
+matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern
+matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern
 
 # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
 
 doc = nlp("Hello world 😀 #MondayMotivation")
 matches = matcher(doc)
@@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
 terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
 # Only run nlp.make_doc to speed things up
 patterns = [nlp.make_doc(text) for text in terms]
-matcher.add("TerminologyList", None, *patterns)
+matcher.add("TerminologyList", patterns)
 
 doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
           "converse in the Oval Office inside the White House in Washington, D.C.")
@@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
 patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
-matcher.add("Names", None, *patterns)
+matcher.add("Names", patterns)
 
 doc = nlp("angela merkel and us president barack Obama")
 for match_id, start, end in matcher(doc):
@@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
 
 nlp = English()
 matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
-matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
+matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
 
 doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
 for match_id, start, end in matcher(doc):
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 39d732724..aa8aa59af 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
 def set_sentiment(matcher, doc, i, matches):
     doc.sentiment += 0.1
 
-pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
-pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
-matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o"
-matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji
+pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
+patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
+matcher.add("GoogleIO", patterns1)  # Match "Google I/O" or "Google i/o"
+matcher.add("HAPPY", patterns2, on_match=set_sentiment)  # Match one or more happy emoji
 
 doc = nlp("A text about Google I/O 😀😀")
 matches = matcher(doc)

From a723fa02a1a23fdcd4eb16603f11fa78e001b337 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 2 Jul 2020 17:41:50 +0200
Subject: [PATCH 06/11] DocBin: add version number, missing attributes and
 strings (#5685)

* Add version number to DocBin

Add a version number to DocBin for future use.

* Add POS to all attributes in DocBin

* Add morph string to strings in DocBin

* Update DocBin API

* Add string for ENT_KB_ID in DocBin
---
 spacy/tokens/_serialize.py | 7 ++++++-
 website/docs/api/docbin.md | 5 +++--
 2 files changed, 9 insertions(+), 3 deletions(-)
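
A short round-trip sketch of the behavior this patch extends (texts and attribute
selection are illustrative): POS is now part of the default attribute set, and
`to_bytes` writes the new "version" field automatically.

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # a tokenizer-only pipeline is enough for a round trip
doc_bin = DocBin(attrs=["LEMMA", "POS", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID"])
doc_bin.add(nlp("Serialize me."))
doc_bin.add(nlp("And me too."))

data = doc_bin.to_bytes()            # msgpack payload now carries "version"
restored = DocBin().from_bytes(data)
docs = list(restored.get_docs(nlp.vocab))
assert len(docs) == 2
```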

diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index d16515a57..a56900988 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 
 
-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
 
 
 class DocBin(object):
@@ -31,6 +31,7 @@ class DocBin(object):
         "spaces": bytes, # Serialized numpy boolean array with spaces data
         "lengths": bytes, # Serialized numpy int32 array with the doc lengths
         "strings": List[unicode] # List of unique strings in the token data
+        "version": str, # DocBin version number
     }
 
     Strings for the words, tags, labels etc are represented by 64-bit hashes in
@@ -53,6 +54,7 @@ class DocBin(object):
         DOCS: https://spacy.io/api/docbin#init
         """
         attrs = sorted([intify_attr(attr) for attr in attrs])
+        self.version = "0.1"
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
         self.tokens = []
@@ -87,8 +89,10 @@ class DocBin(object):
             self.strings.add(token.text)
             self.strings.add(token.tag_)
             self.strings.add(token.lemma_)
+            self.strings.add(token.morph_)
             self.strings.add(token.dep_)
             self.strings.add(token.ent_type_)
+            self.strings.add(token.ent_kb_id_)
         self.cats.append(doc.cats)
         if self.store_user_data:
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@@ -147,6 +151,7 @@ class DocBin(object):
         spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
 
         msg = {
+            "version": self.version,
             "attrs": self.attrs,
             "tokens": tokens.tobytes("C"),
             "spaces": spaces.tobytes("C"),
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index fe8c359f7..07f95f91d 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
 the msgpack object has the following structure:
 
 ```python
-### msgpack object strcutrue
+### msgpack object structure
 {
+    "version": str,           # DocBin version number
     "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
     "tokens": bytes,          # Serialized numpy uint64 array with the token data
     "spaces": bytes,          # Serialized numpy boolean array with spaces data
@@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
 
 | Argument          | Type     | Description                                                                                                                                                                                |
 | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
+| `attrs`           | list     | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
 | `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 |
 | **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              |
 

From 41b65fd0f86c82d3e388cfb16c2a8580a839abeb Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 2 Jul 2020 21:48:01 +0200
Subject: [PATCH 07/11] fix to pretrain script (#5699)

* fix to pretrain script

* remove unnecessary import
---
 spacy/cli/pretrain.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 18c429c60..d0684dcff 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model
 from ..tokens import Doc
 from ..attrs import ID, HEAD
 from .. import util
-from ..gold import Example
 
 
 @app.command("pretrain")
@@ -183,7 +182,7 @@ def pretrain(
         for batch_id, batch in enumerate(batches):
             docs, count = make_docs(
                 nlp,
-                [ex.doc for ex in batch],
+                batch,
                 max_length=pretrain_config["max_length"],
                 min_length=pretrain_config["min_length"],
             )

From e4dcac4a4bb01fabd24d8ee7be93bf12d9bee1bd Mon Sep 17 00:00:00 2001
From: Jan Jessewitsch <61113983+Jan-711@users.noreply.github.com>
Date: Fri, 3 Jul 2020 11:32:42 +0200
Subject: [PATCH 08/11] Merging multiple docs into one (#5032)

* Add static method to Doc to allow merging of multiple docs.

* Add error description for the error that occurs if docs with different
vocabs (from different languages) are merged in Doc.from_docs().

* Add test for Doc.from_docs() implementation.

* Fix using numpy's concatenate in Doc.from_docs.

* Replace typing's type annotations in from_docs.

* Simply remove type annotations in from_docs.

* Add documentation for Doc.from_docs to api.

* Simplify from_docs, its test and the api doc for codebase consistency.

* Fix merging of Doc objects that end with whitespaces (Achieved by simply not setting the SPACY attribute on whitespace tokens). Remove two unnecessary imports of attributes.

* Add merging of user data from Doc objects in from_docs. Add user data test case to corresponding test. Add applicable warning messages.

* Fix incorrect setting of tokens idx by using concatenated spaces (again). Add test case to corresponding test.

* Add MORPH to attrs

* Update warnings calls

* Remove out-dated error from merge

* Rename space_delimiter to ensure_whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/errors.py                 |  6 ++-
 spacy/tests/doc/test_doc_api.py | 54 +++++++++++++++++++++
 spacy/tokens/doc.pyx            | 86 ++++++++++++++++++++++++++++++++-
 website/docs/api/doc.md         | 27 +++++++++++
 4 files changed, 170 insertions(+), 3 deletions(-)
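
Beyond the basic example in the API docs below, a sketch of the two keyword arguments,
along the lines of the added test (texts are illustrative):

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()  # a tokenizer-only pipeline is enough here
docs = [nlp("Merging the docs is fun."), nlp("They don't think alike.")]

merged = Doc.from_docs(docs)
assert merged.text == "Merging the docs is fun. They don't think alike."

# ensure_whitespace=False concatenates the texts without inserting a space
merged_tight = Doc.from_docs(docs, ensure_whitespace=False)
assert merged_tight.text == "Merging the docs is fun.They don't think alike."

# attrs restricts which token attributes are copied into the merged Doc
merged_lemmas = Doc.from_docs(docs, attrs=["LEMMA"])
```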

diff --git a/spacy/errors.py b/spacy/errors.py
index 61ff5a037..07cf7bbdf 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -159,6 +159,8 @@ class Warnings(object):
     W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
+    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
+    W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
 
 
 @add_codes
@@ -593,7 +595,9 @@ class Errors(object):
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
- 
+    E999 = ("Unable to merge the `Doc` objects because they do not all share "
+            "the same `Vocab`.")
+
 
 @add_codes
 class TempErrors(object):
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 018830d37..38e6114de 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
     assert new_doc.is_parsed
 
 
+def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
+    en_texts = ["Merging the docs is fun.", "They don't think alike."]
+    de_text = "Wie war die Frage?"
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    docs_idx = en_texts[0].index('docs')
+    de_doc = de_tokenizer(de_text)
+    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
+
+    assert Doc.from_docs([]) is None
+
+    assert de_doc is not Doc.from_docs([de_doc])
+    assert str(de_doc) == str(Doc.from_docs([de_doc]))
+
+    with pytest.raises(ValueError):
+        Doc.from_docs(en_docs + [de_doc])
+
+    m_doc = Doc.from_docs(en_docs)
+    assert len(en_docs) == len(list(m_doc.sents))
+    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert str(m_doc) == " ".join(en_texts)
+    p_token = m_doc[len(en_docs[0])-1]
+    assert p_token.text == "." and bool(p_token.whitespace_)
+    en_docs_tokens = [t for doc in en_docs for t in doc]
+    assert len(m_doc) == len(en_docs_tokens)
+    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
+    assert m_doc[9].idx == think_idx
+    with pytest.raises(AttributeError):
+        not_available = m_doc[2]._.is_ambiguous             # raises an error, because the extension was not registered via set_extension
+    assert len(m_doc.user_data) == len(en_docs[0].user_data)    # but it's there
+
+    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
+    assert len(en_docs) == len(list(m_doc.sents))
+    assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
+    assert str(m_doc) == "".join(en_texts)
+    p_token = m_doc[len(en_docs[0]) - 1]
+    assert p_token.text == "." and not bool(p_token.whitespace_)
+    en_docs_tokens = [t for doc in en_docs for t in doc]
+    assert len(m_doc) == len(en_docs_tokens)
+    think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
+    assert m_doc[9].idx == think_idx
+
+    m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
+    with pytest.raises(ValueError):                 # important attributes from sentencizer or parser are missing
+        assert list(m_doc.sents)
+    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert str(m_doc) == " ".join(en_texts)         # space delimiter considered, although spacy attribute was missing
+    p_token = m_doc[len(en_docs[0]) - 1]
+    assert p_token.text == "." and bool(p_token.whitespace_)
+    en_docs_tokens = [t for doc in en_docs for t in doc]
+    assert len(m_doc) == len(en_docs_tokens)
+    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
+    assert m_doc[9].idx == think_idx
+
+
 def test_doc_lang(en_vocab):
     doc = Doc(en_vocab, words=["Hello", "world"])
     assert doc.lang_ == "en"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8fe922af9..f69a6811d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
 
+import copy
 from collections import Counter
 import numpy
 import numpy.linalg
@@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 
-from ..attrs import intify_attrs, IDS
+from ..attrs import intify_attr, intify_attrs, IDS
 from ..util import normalize_slice
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
@@ -882,6 +883,87 @@ cdef class Doc:
             set_children_from_heads(self.c, length)
         return self
 
+    @staticmethod
+    def from_docs(docs, ensure_whitespace=True, attrs=None):
+        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
+        the same `Vocab`.
+
+        docs (list): A list of Doc objects.
+        ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
+        attrs (list): Optional list of attribute ID ints or attribute name strings.
+        RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
+
+        DOCS: https://spacy.io/api/doc#from_docs
+        """
+        if not docs:
+            return None
+
+        vocab = {doc.vocab for doc in docs}
+        if len(vocab) > 1:
+            raise ValueError(Errors.E999)
+        (vocab,) = vocab
+
+        if attrs is None:
+            attrs = [LEMMA, NORM]
+            if all(doc.is_nered for doc in docs):
+                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
+            # TODO: separate for is_morphed?
+            if all(doc.is_tagged for doc in docs):
+                attrs.extend([TAG, POS, MORPH])
+            if all(doc.is_parsed for doc in docs):
+                attrs.extend([HEAD, DEP])
+            else:
+                attrs.append(SENT_START)
+        else:
+            if any(isinstance(attr, str) for attr in attrs):     # resolve attribute names
+                attrs = [intify_attr(attr) for attr in attrs]    # intify_attr returns None for invalid attrs
+            attrs = list(attr for attr in set(attrs) if attr)    # filter duplicates, remove None if present
+        if SPACY not in attrs:
+            attrs.append(SPACY)
+
+        concat_words = []
+        concat_spaces = []
+        concat_user_data = {}
+        char_offset = 0
+        for doc in docs:
+            concat_words.extend(t.text for t in doc)
+            concat_spaces.extend(bool(t.whitespace_) for t in doc)
+
+            for key, value in doc.user_data.items():
+                if isinstance(key, tuple) and len(key) == 4:
+                    data_type, name, start, end = key
+                    if start is not None or end is not None:
+                        start += char_offset
+                        if end is not None:
+                            end += char_offset
+                        concat_user_data[(data_type, name, start, end)] = copy.copy(value)
+                    else:
+                        warnings.warn(Warnings.W101.format(name=name))
+                else:
+                    warnings.warn(Warnings.W102.format(key=key, value=value))
+            char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
+
+        arrays = [doc.to_array(attrs) for doc in docs]
+
+        if ensure_whitespace:
+            spacy_index = attrs.index(SPACY)
+            for i, array in enumerate(arrays[:-1]):
+                if len(array) > 0 and not docs[i][-1].is_space:
+                    array[-1][spacy_index] = 1
+            token_offset = -1
+            for doc in docs[:-1]:
+                token_offset += len(doc)
+                if not doc[-1].is_space:
+                    concat_spaces[token_offset] = True
+
+        concat_array = numpy.concatenate(arrays)
+
+        concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
+
+        concat_doc.from_array(attrs, concat_array)
+
+        return concat_doc
+
     def get_lca_matrix(self):
         """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
         `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index d585cbd25..3b31b2c80 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -349,6 +349,33 @@ array of attributes.
 | `exclude`   | list                                   | String names of [serialization fields](#serialization-fields) to exclude. |
 | **RETURNS** | `Doc`                                  | Itself.                                                                   |
 
+
+## Doc.from_docs {#from_docs tag="staticmethod"}
+
+Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import Doc
+> texts = ["London is the capital of the United Kingdom.",
+>          "The River Thames flows through London.",
+>          "The famous Tower Bridge crosses the River Thames."]
+> docs = list(nlp.pipe(texts))
+> c_doc = Doc.from_docs(docs)
+> assert str(c_doc) == " ".join(texts)
+> assert len(list(c_doc.sents)) == len(docs)
+> assert [str(ent) for ent in c_doc.ents] == \
+>        [str(ent) for doc in docs for ent in doc.ents]
+> ```
+
+| Name                | Type  | Description                                                                                     |
+| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
+| `docs`              | list  | A list of `Doc` objects.                                                                        |
+| `ensure_whitespace` | bool  | Insert a space between two adjacent docs whenever the first doc does not end in whitespace.     |
+| `attrs`             | list  | Optional list of attribute ID ints or attribute name strings.                                   |
+| **RETURNS**         | `Doc` | The new `Doc` object that contains the concatenated docs, or `None` if `docs` is empty or `None`. |
+
 ## Doc.to_disk {#to_disk tag="method" new="2"}
 
 Save the current state to a directory.

From abad56db7d0032d6e75e08c8cb1cf763bef5129d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 3 Jul 2020 12:54:32 +0200
Subject: [PATCH 09/11] Add conllu2docs converter (#5704)

Add conllu2docs converter adapted from conllu2json converter
---
 spacy/cli/convert.py                          |   8 +-
 spacy/gold/converters/__init__.py             |   4 +-
 .../{conllu2json.py => conllu2docs.py}        | 104 ++++++------------
 spacy/tests/test_cli.py                       |  40 ++++---
 4 files changed, 62 insertions(+), 94 deletions(-)
 rename spacy/gold/converters/{conllu2json.py => conllu2docs.py} (79%)
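
A usage sketch of the new converter (the CoNLL-U sentence below is an illustrative
three-token example, not the NorNE test data): it returns `Doc` objects, which
`docs_to_json` can still turn into the JSON training format.

```python
from spacy.gold import docs_to_json
from spacy.gold.converters import conllu2docs

# One illustrative sentence with the ten CoNLL-U columns
# (ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC).
conllu_data = "\n".join([
    "1\tHello\thello\tINTJ\t_\t_\t2\tdiscourse\t_\tO",
    "2\tworld\tworld\tNOUN\t_\t_\t0\troot\t_\tO",
    "3\t!\t!\tPUNCT\t_\t_\t2\tpunct\t_\tO",
])

docs = conllu2docs(conllu_data, n_sents=1)  # one merged Doc per n_sents sentences
json_data = docs_to_json(docs)              # back to the JSON training format if needed
```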

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index b008e2f93..976fe7910 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,7 @@ import sys
 from ._app import app, Arg, Opt
 from ..gold import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
@@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 # imported from /converters.
 
 CONVERTERS = {
-    # "conllubio": conllu2docs, TODO
-    # "conllu": conllu2docs, TODO
-    # "conll": conllu2docs, TODO
+    "conllubio": conllu2docs,
+    "conllu": conllu2docs,
+    "conll": conllu2docs,
     "ner": conll_ner2docs,
     "iob": iob2docs,
     "json": json2docs,
diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py
index 3e366933a..63d52ad9d 100644
--- a/spacy/gold/converters/__init__.py
+++ b/spacy/gold/converters/__init__.py
@@ -1,6 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
 from .json2docs import json2docs
-
-# TODO: Update this one
-# from .conllu2docs import conllu2docs  # noqa: F401
+from .conllu2docs import conllu2docs  # noqa: F401
diff --git a/spacy/gold/converters/conllu2json.py b/spacy/gold/converters/conllu2docs.py
similarity index 79%
rename from spacy/gold/converters/conllu2json.py
rename to spacy/gold/converters/conllu2docs.py
index 73fdf57e7..b591d3218 100644
--- a/spacy/gold/converters/conllu2json.py
+++ b/spacy/gold/converters/conllu2docs.py
@@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
 from ...language import Language
-from ...tokens import Doc, Token
+from ...tokens import Doc, Token, Span
 from wasabi import Printer
 
 
-def conllu2json(
+def conllu2docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -28,34 +28,22 @@ def conllu2json(
     MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
     msg = Printer(no_print=no_print)
     n_sents_info(msg, n_sents)
-    docs = []
-    raw = ""
-    sentences = []
-    conll_data = read_conllx(
+    sent_docs = read_conllx(
         input_data,
         append_morphology=append_morphology,
         ner_tag_pattern=MISC_NER_PATTERN,
         ner_map=ner_map,
         merge_subtokens=merge_subtokens,
     )
-    has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
-    for i, example in enumerate(conll_data):
-        raw += example.text
-        sentences.append(
-            generate_sentence(
-                example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
-            )
-        )
-        # Real-sized documents could be extracted using the comments on the
-        # conllu document
-        if len(sentences) % n_sents == 0:
-            doc = create_json_doc(raw, sentences, i)
-            docs.append(doc)
-            raw = ""
-            sentences = []
-    if sentences:
-        doc = create_json_doc(raw, sentences, i)
-        docs.append(doc)
+    docs = []
+    sent_docs_to_merge = []
+    for sent_doc in sent_docs:
+        sent_docs_to_merge.append(sent_doc)
+        if len(sent_docs_to_merge) % n_sents == 0:
+            docs.append(Doc.from_docs(sent_docs_to_merge))
+            sent_docs_to_merge = []
+    if sent_docs_to_merge:
+        docs.append(Doc.from_docs(sent_docs_to_merge))
     return docs
 
 
@@ -84,14 +72,14 @@ def read_conllx(
     ner_tag_pattern="",
     ner_map=None,
 ):
-    """ Yield examples, one for each sentence """
+    """ Yield docs, one for each sentence """
     vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            example = example_from_conllu_sentence(
+            doc = doc_from_conllu_sentence(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -99,7 +87,7 @@ def read_conllx(
                 append_morphology=append_morphology,
                 ner_map=ner_map,
             )
-            yield example
+            yield doc
 
 
 def get_entities(lines, tag_pattern, ner_map=None):
@@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
     return iob_to_biluo(iob)
 
 
-def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
-    sentence = {}
-    tokens = []
-    token_annotation = example_dict["token_annotation"]
-    for i, id_ in enumerate(token_annotation["ids"]):
-        token = {}
-        token["id"] = id_
-        token["orth"] = token_annotation["words"][i]
-        token["tag"] = token_annotation["tags"][i]
-        token["pos"] = token_annotation["pos"][i]
-        token["lemma"] = token_annotation["lemmas"][i]
-        token["morph"] = token_annotation["morphs"][i]
-        token["head"] = token_annotation["heads"][i] - i
-        token["dep"] = token_annotation["deps"][i]
-        if has_ner_tags:
-            token["ner"] = example_dict["doc_annotation"]["entities"][i]
-        tokens.append(token)
-    sentence["tokens"] = tokens
-    return sentence
-
-
-def create_json_doc(raw, sentences, id_):
-    doc = {}
-    paragraph = {}
-    doc["id"] = id_
-    doc["paragraphs"] = []
-    paragraph["raw"] = raw.strip()
-    paragraph["sentences"] = sentences
-    doc["paragraphs"].append(paragraph)
-    return doc
-
-
-def example_from_conllu_sentence(
+def doc_from_conllu_sentence(
     vocab,
     lines,
     ner_tag_pattern,
@@ -263,8 +219,9 @@ def example_from_conllu_sentence(
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
 
-    # create Example from custom Doc annotation
-    words, spaces, tags, morphs, lemmas = [], [], [], [], []
+    # create final Doc from custom Doc annotation
+    words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
+    heads, deps = [], []
     for i, t in enumerate(doc):
         words.append(t._.merged_orth)
         lemmas.append(t._.merged_lemma)
@@ -274,16 +231,23 @@ def example_from_conllu_sentence(
             tags.append(t.tag_ + "__" + t._.merged_morph)
         else:
             tags.append(t.tag_)
+        poses.append(t.pos_)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
 
     doc_x = Doc(vocab, words=words, spaces=spaces)
-    ref_dict = Example(doc_x, reference=doc).to_dict()
-    ref_dict["words"] = words
-    ref_dict["lemmas"] = lemmas
-    ref_dict["spaces"] = spaces
-    ref_dict["tags"] = tags
-    ref_dict["morphs"] = morphs
-    example = Example.from_dict(doc_x, ref_dict)
-    return example
+    for i in range(len(doc)):
+        doc_x[i].tag_ = tags[i]
+        doc_x[i].morph_ = morphs[i]
+        doc_x[i].lemma_ = lemmas[i]
+        doc_x[i].pos_ = poses[i]
+        doc_x[i].dep_ = deps[i]
+        doc_x[i].head = doc_x[heads[i]]
+    doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
+    doc_x.is_parsed = True
+    doc_x.is_tagged = True
+
+    return doc_x
 
 
 def merge_conllu_subtokens(lines, doc):
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index ca0f3710f..e8928f33a 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,14 +1,10 @@
 import pytest
 
-from spacy.gold import docs_to_json
-from spacy.gold.converters import iob2docs, conll_ner2docs
-from spacy.gold.converters.conllu2json import conllu2json
+from spacy.gold import docs_to_json, biluo_tags_from_offsets
+from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.lang.en import English
 from spacy.cli.pretrain import make_docs
 
-# TODO
-# from spacy.gold.converters import conllu2docs
-
 
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1)
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1)
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
 
 
 @pytest.mark.parametrize(
@@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
 )
 def test_cli_converters_conllu2json_name_ner_map(lines):
     input_data = "\n".join(lines)
-    converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
-    assert len(converted) == 1
+    converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
-    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
     sent = converted[0]["paragraphs"][0]["sentences"][0]
     assert len(sent["tokens"]) == 5
@@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
 def test_cli_converters_conllu2json_subtokens():
@@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
     ]
     input_data = "\n".join(lines)
-    converted = conllu2json(
+    converted_docs = conllu2docs(
         input_data, n_sents=1, merge_subtokens=True, append_morphology=True
     )
-    assert len(converted) == 1
+    assert len(converted_docs) == 1
+    converted = [docs_to_json(converted_docs)]
+
     assert converted[0]["id"] == 0
     assert len(converted[0]["paragraphs"]) == 1
-    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
+    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
     sent = converted[0]["paragraphs"][0]["sentences"][0]
     assert len(sent["tokens"]) == 4
@@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
     assert [t["head"] for t in tokens] == [1, 1, 0, -1]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
-    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
+    ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
+    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    assert biluo_tags == ["O", "U-PER", "O", "O"]
 
 
 def test_cli_converters_iob2json(en_vocab):

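For reference, the converter flow exercised by the updated tests reduces to a minimal sketch: convert CoNLL-U input with conllu2docs, wrap the resulting Doc objects with docs_to_json, and recover BILUO tags from the paragraph-level entity offsets. The `input_data` variable below is assumed, not defined by this patch; it stands in for any CoNLL-U formatted string such as the NorNE fragment used in the tests.

    # Minimal sketch of the conllu2docs flow checked by the updated tests.
    # `input_data` is assumed to hold a CoNLL-U formatted string.
    from spacy.gold import docs_to_json, biluo_tags_from_offsets
    from spacy.gold.converters import conllu2docs

    docs = conllu2docs(input_data, n_sents=1)   # one Doc per group of n_sents sentences
    json_data = docs_to_json(docs)              # legacy training JSON ("id", "paragraphs", ...)
    para = json_data["paragraphs"][0]
    ent_offsets = [(e[0], e[1], e[2]) for e in para["entities"]]
    biluo_tags = biluo_tags_from_offsets(docs[0], ent_offsets, missing="O")
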
From a902b5f2175012bc57bd326dbfcf43e5ed2a91a5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 3 Jul 2020 12:58:16 +0200
Subject: [PATCH 10/11] Record whether Doc objects are built from known spacing
 (#5697)

* Tell convert CLI to store user data for Doc

* Remove assert

* Add has_unknown_spaces flag on Doc

* Do not tokenize docs with unknown spaces in Corpus

* Handle conversion of unknown spaces in Example

* Fixes

* Fixes

* Draft has_unknown_spaces support in DocBin

* Add test for serialize has_unknown_spaces

* Fix DocBin serialization when has_unknown_spaces

* Use serialization in test
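
The behaviour introduced here can be sketched as follows (a minimal sketch mirroring the new test in spacy/tests/serialize/test_serialize_doc.py; the bare Vocab() is an assumption standing in for the en_vocab fixture):

    from spacy.tokens import Doc, DocBin
    from spacy.vocab import Vocab

    vocab = Vocab()
    # No spaces passed: the Doc records that spacing is unknown.
    doc1 = Doc(vocab, words=["that", "'s"])
    assert doc1.has_unknown_spaces and doc1.text == "that 's "
    # Explicit spaces: the flag stays False.
    doc2 = Doc(vocab, words=["that", "'s"], spaces=[False, False])
    assert not doc2.has_unknown_spaces and doc2.text == "that's"

    # The flag is stored per doc in DocBin "flags" and survives a round-trip.
    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
    re_doc1, re_doc2 = doc_bin.get_docs(vocab)
    assert re_doc1.has_unknown_spaces
    assert not re_doc2.has_unknown_spaces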
---
 spacy/cli/convert.py                        |  4 +-
 spacy/gold/converters/json2docs.py          |  2 -
 spacy/gold/corpus.py                        | 36 +++++++++-------
 spacy/gold/example.pyx                      |  4 +-
 spacy/tests/serialize/test_serialize_doc.py | 16 +++++++
 spacy/tokens/_serialize.py                  | 10 +++++
 spacy/tokens/doc.pxd                        |  3 ++
 spacy/tokens/doc.pyx                        | 46 ++++++++++-----------
 8 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 976fe7910..56f38766a 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
 
 
 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
  
diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
index 50ad16faf..342f94848 100644
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            if json_para.get("raw"):
-                assert tok_dict.get("SPACY")
             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 9a688987c..64f38d21c 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -43,25 +43,36 @@ class Corpus:
                 locs.append(path)
         return locs
 
+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+ 
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)
     
+
     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
             if reference.is_sentenced:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab, 
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg
 
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 2ecee1821..7b629dcd2 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -15,7 +15,7 @@ from ..syntax import nonproj
 
 
 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):
 
 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index 615bb1cd9..85c21f7f9 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index a56900988..edc183e0d 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@ class DocBin(object):
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@ class DocBin(object):
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)
 
@@ -158,6 +166,7 @@ class DocBin(object):
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@ class DocBin(object):
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 42918ab6d..2775aa97e 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -59,11 +59,14 @@ cdef class Doc:
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
 
+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens
 
     cdef int length
     cdef int max_length
 
+
     cdef public object noun_chunks_iterator
 
     cdef object __weakref__
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f69a6811d..723873e1f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -172,8 +172,7 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,28 +214,25 @@ cdef class Doc:
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
         cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
         if self.length == 0:
@@ -1082,6 +1078,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@ cdef class Doc:
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_

From e1b3e8ee114ed74cccd802ed1e7a738fe8e316e5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 3 Jul 2020 13:21:08 +0200
Subject: [PATCH 11/11] Set version to v3.0.0a1

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 818dd1286..5b2a89c61 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a0"
+__version__ = "3.0.0a1"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"