From 31504f59828a17de7e71bfdf9dd0e05ba8d229bc Mon Sep 17 00:00:00 2001
From: Nick Sorros
Date: Tue, 22 Jun 2021 10:41:45 +0300
Subject: [PATCH 1/4] Switch model and data path in prodigy project.yml recipe (#8467)

---
 website/docs/usage/projects.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index fc191824a..d30a50302 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -800,7 +800,7 @@ vars:
 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
       - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
       - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'

From d96c422cfc1c63e7cdf56dcc3608cbda5efcd948 Mon Sep 17 00:00:00 2001
From: themrmax
Date: Tue, 22 Jun 2021 15:34:06 -0700
Subject: [PATCH 2/4] Fix broken link

change /api/registry to /api/top-level#registry
---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5a1293c2e..b05d16da3 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
 pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a
 function called `whitespace_tokenizer` in the
-[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
 construct your tokenizer during training, you can pass in your Python file by
 setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
 

From 393c3c70d7ca1c00d4357faa4cd05450285db160 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 23 Jun 2021 15:51:35 +0200
Subject: [PATCH 3/4] Various fixes for spans in Docs.from_docs (#8487)

* Fix spans offsets if a doc ends in a single space and no space is inserted
* Also include spans key in merged doc for empty spans lists
---
 spacy/tests/doc/test_doc_api.py | 40 ++++++++++++++++++++++++---------
 spacy/tokens/doc.pyx            |  6 ++++-
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 7ca231bc4..6f3f9ae0b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True # docs
-    en_docs[2][3]._.is_ambiguous = True # think
+    en_docs[0][2]._.is_ambiguous = True  # docs
+    en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
 
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 912d8c17d..cd2bd6f6c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
                         span.end_char + char_offset,
                         span.label,
                         span.kb_id,
                         span.text, # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]

From 3e3d87a068019fd66625ef4f1686f0ad616f7ab5 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Jun 2021 12:37:55 +1000
Subject: [PATCH 4/4] Update maintainer info [ci skip]

---
 CONTRIBUTING.md |  6 +-----
 README.md       | 10 +++++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 072981270..682e5134c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,11 +2,7 @@
 
 # Contribute to spaCy
 
-Thanks for your interest in contributing to spaCy 🎉 The project is maintained
-by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**,
-and we'll do our best to help you get started. This page will give you a quick
+Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
 overview of how things are organized and most importantly, how to get involved.
 
 ## Table of contents

diff --git a/README.md b/README.md
index 3bc7ba0f1..61d5449a4 100644
--- a/README.md
+++ b/README.md
@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit from
-it.
+**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
+**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+Please understand that we won't be able to provide individual support via email.
+We also believe that help is much more valuable if it's shared publicly, so that
+more people can benefit from it.
 
 | Type                            | Platforms                                 |
 | ------------------------------- | ----------------------------------------- |
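
Note, not part of the patches above: the linguistic-features.md context shown in PATCH 2/4
refers to a function called whitespace_tokenizer registered in the @tokenizers registry and
passed to spacy train via --code functions.py. As a reminder of what such a registered
function can look like, here is a minimal sketch along the lines of that docs section; the
WhitespaceTokenizer class and the functions.py file name are illustrative assumptions and
are not added by these patches:

    # functions.py -- illustrative sketch only, not part of this patch series
    import spacy
    from spacy.tokens import Doc

    class WhitespaceTokenizer:
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            # naive whitespace splitting; assumes no repeated spaces in the input
            words = text.split(" ")
            return Doc(self.vocab, words=words)

    @spacy.registry.tokenizers("whitespace_tokenizer")
    def create_whitespace_tokenizer():
        def create_tokenizer(nlp):
            return WhitespaceTokenizer(nlp.vocab)
        return create_tokenizer

Passing this file with --code functions.py when running spacy train, as the docs text
describes, makes the registered name resolvable when the [nlp.tokenizer] block of the
config is loaded.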
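
The Doc.from_docs behaviour covered by PATCH 3/4 can also be exercised outside the test
suite. A minimal sketch, assuming spaCy v3.x with that patch applied and nothing more
than a blank English pipeline:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc1 = nlp("They don't think alike. ")  # note the trailing space
    doc2 = nlp("Another doc.")
    doc1.spans["group"] = []                # set, but left empty
    doc2.spans["group"] = [doc2[0:1]]       # the token "Another"

    merged = Doc.from_docs([doc1, doc2])    # ensure_whitespace=True is the default

    # The "group" key is carried over, and the copied span still covers "Another":
    # because doc1 already ends in a space, no extra space is inserted, so the
    # character offsets of spans from later docs are not shifted by one.
    assert "group" in merged.spans
    assert [span.text for span in merged.spans["group"]] == ["Another"]

    # A spans key that is only ever set to empty lists is preserved as well:
    doc1.spans["empty"] = []
    doc2.spans["empty"] = []
    merged = Doc.from_docs([doc1, doc2])
    assert "empty" in merged.spans and len(merged.spans["empty"]) == 0

This mirrors the assertions added to test_doc_api_from_docs above rather than introducing
any behaviour beyond what the patch itself tests.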