Merging multiple docs into one (#5032)

* Add static method to Doc to allow merging of multiple docs.

* Add error description for the error that occurs if docs with different
vocabs (from different languages) are merged in Doc.from_docs().

* Add test for Doc.from_docs() implementation.

* Fix using numpy's concatenate in Doc.from_docs.

* Replace typing's type annotations in from_docs.

* Simply remove type annotations in from_docs.

* Add documentation for Doc.from_docs to api.

* Simplify from_docs, its test and the api doc for codebase consistency.

* Fix merging of Doc objects that end with whitespace (achieved by simply not setting the SPACY attribute on whitespace tokens). Remove two unnecessary imports of attributes.

* Add merging of user data from Doc objects in from_docs. Add user data test case to corresponding test. Add applicable warning messages.

* Fix incorrect setting of token idx by using the concatenated spaces (again). Add test case to corresponding test.

* Add MORPH to attrs

* Update warnings calls

* Remove out-dated error from merge

* Rename space_delimiter to ensure_whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Jan Jessewitsch 2020-07-03 11:32:42 +02:00 committed by GitHub
parent 41b65fd0f8
commit e4dcac4a4b
4 changed files with 170 additions and 3 deletions
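
For orientation, a minimal usage sketch of the API this commit introduces. It assumes a spaCy build containing this change; `spacy.blank("en")` is just a stand-in for any set of docs that share a vocab:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # assumed blank English pipeline (tokenizer only)
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("They don't think alike.")

# Concatenate into a single Doc; by default a space is inserted between
# adjacent docs whenever the first one does not end in whitespace.
merged = Doc.from_docs([doc1, doc2])
assert merged.text == doc1.text + " " + doc2.text

# An empty list returns None; docs from different vocabs raise ValueError (E999).
assert Doc.from_docs([]) is None
```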

spacy/errors.py

@@ -159,6 +159,8 @@ class Warnings(object):
    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
            "string \"Field1=Value1,Value2|Field2=Value3\".")
    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
    W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
@add_codes
@@ -593,7 +595,9 @@ class Errors(object):
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
            "This would map '{chunk}' to '{orth}' given token attributes "
            "'{token_attrs}'.")
    E999 = ("Unable to merge the `Doc` objects because they do not all share "
            "the same `Vocab`.")
@add_codes
class TempErrors(object):

spacy/tests/doc/test_doc_api.py

@@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
    assert new_doc.is_parsed


def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = ["Merging the docs is fun.", "They don't think alike."]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    docs_idx = en_texts[0].index('docs')
    de_doc = de_tokenizer(de_text)
    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)

    assert Doc.from_docs([]) is None

    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    m_doc = Doc.from_docs(en_docs)
    assert len(en_docs) == len(list(m_doc.sents))
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == " ".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx
    with pytest.raises(AttributeError):
        not_available = m_doc[2]._.is_ambiguous  # not callable, because it was not set via set_extension
    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_docs) == len(list(m_doc.sents))
    assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == "".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx

    m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
    with pytest.raises(ValueError):  # important attributes from sentencizer or parser are missing
        assert list(m_doc.sents)
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == " ".join(en_texts)  # space delimiter considered, although spacy attribute was missing
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx


def test_doc_lang(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    assert doc.lang_ == "en"

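A quick check of the `think_idx` arithmetic used in the test above, not part of the diff: with the default `ensure_whitespace=True`, character offsets in the second doc shift by the first doc's length plus the one inserted space; with `ensure_whitespace=False` they shift by the length alone.

```python
en_texts = ["Merging the docs is fun.", "They don't think alike."]

# Default merge: one space is inserted after the first doc.
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
assert think_idx == 36  # 24 + 1 + 11

# ensure_whitespace=False: nothing is inserted between the docs.
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
assert think_idx == 35  # 24 + 0 + 11
```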
spacy/tokens/doc.pyx

@@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter
import numpy
import numpy.linalg
@@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS
from ..attrs import intify_attr, intify_attrs, IDS
from ..util import normalize_slice
from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
@@ -806,7 +807,7 @@ cdef class Doc:
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))
            warnings.warn(Warnings.W101.format(type=array.dtype))
        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
@@ -882,6 +883,87 @@ cdef class Doc:
        set_children_from_heads(self.c, length)
        return self

    @staticmethod
    def from_docs(docs, ensure_whitespace=True, attrs=None):
        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
        the same `Vocab`.

        docs (list): A list of Doc objects.
        ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
        attrs (list): Optional list of attribute ID ints or attribute name strings.
        RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.

        DOCS: https://spacy.io/api/doc#from_docs
        """
        if not docs:
            return None

        vocab = {doc.vocab for doc in docs}
        if len(vocab) > 1:
            raise ValueError(Errors.E999)
        (vocab,) = vocab

        if attrs is None:
            attrs = [LEMMA, NORM]
            if all(doc.is_nered for doc in docs):
                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
            # TODO: separate for is_morphed?
            if all(doc.is_tagged for doc in docs):
                attrs.extend([TAG, POS, MORPH])
            if all(doc.is_parsed for doc in docs):
                attrs.extend([HEAD, DEP])
            else:
                attrs.append(SENT_START)
        else:
            if any(isinstance(attr, str) for attr in attrs):  # resolve attribute names
                attrs = [intify_attr(attr) for attr in attrs]  # intify_attr returns None for invalid attrs
            attrs = list(attr for attr in set(attrs) if attr)  # filter duplicates, remove None if present
        if SPACY not in attrs:
            attrs.append(SPACY)

        concat_words = []
        concat_spaces = []
        concat_user_data = {}
        char_offset = 0
        for doc in docs:
            concat_words.extend(t.text for t in doc)
            concat_spaces.extend(bool(t.whitespace_) for t in doc)

            for key, value in doc.user_data.items():
                if isinstance(key, tuple) and len(key) == 4:
                    data_type, name, start, end = key
                    if start is not None or end is not None:
                        start += char_offset
                        if end is not None:
                            end += char_offset
                        concat_user_data[(data_type, name, start, end)] = copy.copy(value)
                    else:
                        warnings.warn(Warnings.W101.format(name=name))
                else:
                    warnings.warn(Warnings.W102.format(key=key, value=value))

            char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1

        arrays = [doc.to_array(attrs) for doc in docs]

        if ensure_whitespace:
            spacy_index = attrs.index(SPACY)
            for i, array in enumerate(arrays[:-1]):
                if len(array) > 0 and not docs[i][-1].is_space:
                    array[-1][spacy_index] = 1
            token_offset = -1
            for doc in docs[:-1]:
                token_offset += len(doc)
                if not doc[-1].is_space:
                    concat_spaces[token_offset] = True

        concat_array = numpy.concatenate(arrays)

        concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
        concat_doc.from_array(attrs, concat_array)
        return concat_doc

    def get_lca_matrix(self):
        """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
        `Doc`, where LCA[i, j] is the index of the lowest common ancestor among

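The character-offset bookkeeping in the user-data loop above is the subtle part: spaCy stores custom extension values under `(data_type, name, start, end)` keys, so the character positions have to be shifted by the length of all preceding docs (plus one for each inserted space). A rough standalone illustration of that remapping, with hypothetical numbers rather than the diff's test data:

```python
import copy

def shift_user_data_keys(user_data, char_offset):
    """Illustrative re-implementation of the key remapping done in from_docs:
    keys of the form (data_type, name, start, end) get their character offsets
    shifted when a doc is appended after char_offset characters of earlier text."""
    shifted = {}
    for key, value in user_data.items():
        if isinstance(key, tuple) and len(key) == 4:
            data_type, name, start, end = key
            if start is not None or end is not None:
                start += char_offset
                if end is not None:
                    end += char_offset
                shifted[(data_type, name, start, end)] = copy.copy(value)
            # doc-level extensions (start and end both None) are skipped with W101
        # any other key shape is skipped with W102
    return shifted

# e.g. a value anchored at character 12 of the second doc, merged after a
# 25-character first doc plus one inserted space:
data = {("._.", "is_ambiguous", 12, None): (True, None, None, None)}
print(shift_user_data_keys(data, 25 + 1))
# {('._.', 'is_ambiguous', 38, None): (True, None, None, None)}
```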
website/docs/api/doc.md

@@ -349,6 +349,33 @@ array of attributes.
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. |
## Doc.from_docs {#from_docs tag="staticmethod"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
> #### Example
>
> ```python
> from spacy.tokens import Doc
> texts = ["London is the capital of the United Kingdom.",
> "The River Thames flows through London.",
> "The famous Tower Bridge crosses the River Thames."]
> docs = list(nlp.pipe(texts))
> c_doc = Doc.from_docs(docs)
> assert str(c_doc) == " ".join(texts)
> assert len(list(c_doc.sents)) == len(docs)
> assert [str(ent) for ent in c_doc.ents] == \
> [str(ent) for doc in docs for ent in doc.ents]
> ```
| Name | Type | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
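
Beyond the example shown in the diff, the keyword arguments can be exercised roughly like this (a sketch based on the new test cases; `nlp` is assumed to be an English pipeline):

```python
from spacy.tokens import Doc

texts = ["Merging the docs is fun.", "They don't think alike."]
docs = list(nlp.pipe(texts))

# No space is inserted between adjacent docs when ensure_whitespace=False.
merged = Doc.from_docs(docs, ensure_whitespace=False)
assert merged.text == "".join(texts)

# Restrict the copied annotations to an explicit attribute list; names are
# resolved via intify_attr, and SPACY is always added if missing.
merged = Doc.from_docs(docs, attrs=["lemma", "length", "pos"])
```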
## Doc.to_disk {#to_disk tag="method" new="2"}
Save the current state to a directory.