From 139f655f344cff20d9535455a4b72f7a76c90748 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 29 Mar 2021 13:34:01 +0200
Subject: [PATCH] Merge doc.spans in Doc.from_docs() (#7497)

Merge data from `doc.spans` in `Doc.from_docs()`.

* Fix internal character offset set when merging empty docs (only affects
  tokens and spans in `user_data` if an empty doc is in the list of docs)
---
 spacy/errors.py                 |  3 +++
 spacy/tests/doc/test_doc_api.py |  9 +++++++++
 spacy/tokens/doc.pyx            | 30 ++++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d8c5cc3a8..289d2cfed 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -497,6 +497,9 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
+            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
+            "issue: https://github.com/explosion/spaCy/issues")
     E874 = ("Could not initialize the tok2vec model from component "
             "'{component}' and layer '{layer}'.")
     E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index c27139d2f..0b915513f 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -352,6 +352,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
+    en_docs[0].spans["group"] = [en_docs[0][1:4]]
+    en_docs[2].spans["group"] = [en_docs[2][1:4]]
+    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
     docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
     expected = (True, None, None, None)
@@ -377,6 +380,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         # not callable, because it was not set via set_extension
         m_doc[2]._.is_ambiguous
     assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
@@ -388,6 +393,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
@@ -399,6 +406,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
 
 def test_doc_api_from_docs_ents(en_tokenizer):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 850036483..69f900297 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -6,7 +6,7 @@ from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
 
 import copy
-from collections import Counter
+from collections import Counter, defaultdict
 from enum import Enum
 import itertools
 import numpy
@@ -1120,6 +1120,7 @@ cdef class Doc:
         concat_words = []
         concat_spaces = []
         concat_user_data = {}
+        concat_spans = defaultdict(list)
         char_offset = 0
         for doc in docs:
             concat_words.extend(t.text for t in doc)
@@ -1137,8 +1138,17 @@ cdef class Doc:
                         warnings.warn(Warnings.W101.format(name=name))
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
+            for key in doc.spans:
+                for span in doc.spans[key]:
+                    concat_spans[key].append((
+                        span.start_char + char_offset,
+                        span.end_char + char_offset,
+                        span.label,
+                        span.kb_id,
+                        span.text, # included as a check
+                    ))
             char_offset += len(doc.text)
-            if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
@@ -1160,6 +1170,22 @@ cdef class Doc:
 
         concat_doc.from_array(attrs, concat_array)
 
+        for key in concat_spans:
+            if key not in concat_doc.spans:
+                concat_doc.spans[key] = []
+            for span_tuple in concat_spans[key]:
+                span = concat_doc.char_span(
+                    span_tuple[0],
+                    span_tuple[1],
+                    label=span_tuple[2],
+                    kb_id=span_tuple[3],
+                )
+                text = span_tuple[4]
+                if span is not None and span.text == text:
+                    concat_doc.spans[key].append(span)
+                else:
+                    raise ValueError(Errors.E873.format(key=key, text=text))
+
        return concat_doc
 
     def get_lca_matrix(self):
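
A minimal usage sketch (not part of the patch) of the behavior this change
adds: span groups stored under `doc.spans` are carried over by
`Doc.from_docs()`, with character offsets remapped for each input doc. The
"group" key mirrors the new tests; the blank pipeline and example texts are
illustrative assumptions.

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc1 = nlp("Merge the docs.")
    doc2 = nlp("They feel merged already.")

    # Store a span group under the same key on each input doc.
    doc1.spans["group"] = [doc1[1:3]]  # "the docs"
    doc2.spans["group"] = [doc2[2:4]]  # "merged already"

    # The merged doc now contains both spans, shifted to the new offsets.
    merged = Doc.from_docs([doc1, doc2])
    assert "group" in merged.spans
    print([span.text for span in merged.spans["group"]])
    # ['the docs', 'merged already']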