Merging multiple docs into one (#5032)

* Add static method to Doc to allow merging of multiple docs.

* Add error description for the error that occurs if docs with different
vocabs (from different languages) are merged in Doc.from_docs().

* Add test for Doc.from_docs() implementation.

* Fix using numpy's concatenate in Doc.from_docs.

* Replace typing's type annotations in from_docs.

* Simply remove type annotations in from_docs.

* Add documentation for Doc.from_docs to api.

* Simplify from_docs, its test and the api doc for codebase consistency.

* Fix merging of Doc objects that end with whitespace (achieved by simply not setting the SPACY attribute on whitespace tokens). Remove two unnecessary imports of attributes.

* Add merging of user data from Doc objects in from_docs. Add user data test case to corresponding test. Add applicable warning messages.

* Fix incorrect setting of token idx by using the concatenated spaces (again). Add test case to corresponding test.

* Add MORPH to attrs

* Update warnings calls

* Remove out-dated error from merge

* Rename space_delimiter to ensure_whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Jan Jessewitsch 2020-07-03 11:32:42 +02:00 committed by GitHub
parent 41b65fd0f8
commit e4dcac4a4b
4 changed files with 170 additions and 3 deletions
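
For orientation, a minimal usage sketch of the API this commit introduces. It assumes a spaCy build containing this change; `spacy.blank("en")` is just a stand-in for any set of docs that share a vocab:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # assumed blank English pipeline (tokenizer only)
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("They don't think alike.")

# Concatenate into a single Doc; by default a space is inserted between
# adjacent docs whenever the first one does not end in whitespace.
merged = Doc.from_docs([doc1, doc2])
assert merged.text == doc1.text + " " + doc2.text

# An empty list returns None; docs from different vocabs raise ValueError (E999).
assert Doc.from_docs([]) is None
```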

spacy/errors.py

@@ -159,6 +159,8 @@ class Warnings(object):
    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
            "string \"Field1=Value1,Value2|Field2=Value3\".")
    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
    W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
@add_codes
@@ -593,7 +595,9 @@ class Errors(object):
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
            "This would map '{chunk}' to '{orth}' given token attributes "
            "'{token_attrs}'.")
    E999 = ("Unable to merge the `Doc` objects because they do not all share "
            "the same `Vocab`.")
@add_codes
class TempErrors(object):

spacy/tests/doc/test_doc_api.py

@@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
    assert new_doc.is_parsed


def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = ["Merging the docs is fun.", "They don't think alike."]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    docs_idx = en_texts[0].index('docs')
    de_doc = de_tokenizer(de_text)
    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)

    assert Doc.from_docs([]) is None

    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    m_doc = Doc.from_docs(en_docs)
    assert len(en_docs) == len(list(m_doc.sents))
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == " ".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx
    with pytest.raises(AttributeError):
        not_available = m_doc[2]._.is_ambiguous  # not callable, because it was not set via set_extension
    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_docs) == len(list(m_doc.sents))
    assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == "".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx

    m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
    with pytest.raises(ValueError):  # important attributes from sentencizer or parser are missing
        assert list(m_doc.sents)
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == " ".join(en_texts)  # space delimiter considered, although spacy attribute was missing
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
    assert m_doc[9].idx == think_idx


def test_doc_lang(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    assert doc.lang_ == "en"

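A quick check of the `think_idx` arithmetic used in the test above, not part of the diff: with the default `ensure_whitespace=True`, character offsets in the second doc shift by the first doc's length plus the one inserted space; with `ensure_whitespace=False` they shift by the length alone.

```python
en_texts = ["Merging the docs is fun.", "They don't think alike."]

# Default merge: one space is inserted after the first doc.
think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think")
assert think_idx == 36  # 24 + 1 + 11

# ensure_whitespace=False: nothing is inserted between the docs.
think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think")
assert think_idx == 35  # 24 + 0 + 11
```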
spacy/tokens/doc.pyx

@@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter
import numpy
import numpy.linalg
@@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS
from ..attrs import intify_attr, intify_attrs, IDS
from ..util import normalize_slice
from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
@@ -806,7 +807,7 @@ cdef class Doc:
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                 for id_ in attrs]
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))
            warnings.warn(Warnings.W101.format(type=array.dtype))
        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(Errors.E032)
@@ -882,6 +883,87 @@ cdef class Doc:
        set_children_from_heads(self.c, length)
        return self

    @staticmethod
    def from_docs(docs, ensure_whitespace=True, attrs=None):
        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
        the same `Vocab`.

        docs (list): A list of Doc objects.
        ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
        attrs (list): Optional list of attribute ID ints or attribute name strings.
        RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.

        DOCS: https://spacy.io/api/doc#from_docs
        """
        if not docs:
            return None

        vocab = {doc.vocab for doc in docs}
        if len(vocab) > 1:
            raise ValueError(Errors.E999)
        (vocab,) = vocab

        if attrs is None:
            attrs = [LEMMA, NORM]
            if all(doc.is_nered for doc in docs):
                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
            # TODO: separate for is_morphed?
            if all(doc.is_tagged for doc in docs):
                attrs.extend([TAG, POS, MORPH])
            if all(doc.is_parsed for doc in docs):
                attrs.extend([HEAD, DEP])
            else:
                attrs.append(SENT_START)
        else:
            if any(isinstance(attr, str) for attr in attrs):  # resolve attribute names
                attrs = [intify_attr(attr) for attr in attrs]  # intify_attr returns None for invalid attrs
            attrs = list(attr for attr in set(attrs) if attr)  # filter duplicates, remove None if present
        if SPACY not in attrs:
            attrs.append(SPACY)

        concat_words = []
        concat_spaces = []
        concat_user_data = {}
        char_offset = 0
        for doc in docs:
            concat_words.extend(t.text for t in doc)
            concat_spaces.extend(bool(t.whitespace_) for t in doc)

            for key, value in doc.user_data.items():
                if isinstance(key, tuple) and len(key) == 4:
                    data_type, name, start, end = key
                    if start is not None or end is not None:
                        start += char_offset
                        if end is not None:
                            end += char_offset
                        concat_user_data[(data_type, name, start, end)] = copy.copy(value)
                    else:
                        warnings.warn(Warnings.W101.format(name=name))
                else:
                    warnings.warn(Warnings.W102.format(key=key, value=value))

            char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1

        arrays = [doc.to_array(attrs) for doc in docs]

        if ensure_whitespace:
            spacy_index = attrs.index(SPACY)
            for i, array in enumerate(arrays[:-1]):
                if len(array) > 0 and not docs[i][-1].is_space:
                    array[-1][spacy_index] = 1
            token_offset = -1
            for doc in docs[:-1]:
                token_offset += len(doc)
                if not doc[-1].is_space:
                    concat_spaces[token_offset] = True

        concat_array = numpy.concatenate(arrays)

        concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
        concat_doc.from_array(attrs, concat_array)
        return concat_doc

    def get_lca_matrix(self):
        """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
        `Doc`, where LCA[i, j] is the index of the lowest common ancestor among

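The character-offset bookkeeping in the user-data loop above is the subtle part: spaCy stores custom extension values under `(data_type, name, start, end)` keys, so the character positions have to be shifted by the length of all preceding docs (plus one for each inserted space). A rough standalone illustration of that remapping, with hypothetical numbers rather than the diff's test data:

```python
import copy

def shift_user_data_keys(user_data, char_offset):
    """Illustrative re-implementation of the key remapping done in from_docs:
    keys of the form (data_type, name, start, end) get their character offsets
    shifted when a doc is appended after char_offset characters of earlier text."""
    shifted = {}
    for key, value in user_data.items():
        if isinstance(key, tuple) and len(key) == 4:
            data_type, name, start, end = key
            if start is not None or end is not None:
                start += char_offset
                if end is not None:
                    end += char_offset
                shifted[(data_type, name, start, end)] = copy.copy(value)
            # doc-level extensions (start and end both None) are skipped with W101
        # any other key shape is skipped with W102
    return shifted

# e.g. a value anchored at character 12 of the second doc, merged after a
# 25-character first doc plus one inserted space:
data = {("._.", "is_ambiguous", 12, None): (True, None, None, None)}
print(shift_user_data_keys(data, 25 + 1))
# {('._.', 'is_ambiguous', 38, None): (True, None, None, None)}
```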
website/docs/api/doc.md

@@ -349,6 +349,33 @@ array of attributes.
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. |
## Doc.from_docs {#from_docs tag="staticmethod"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
> #### Example
>
> ```python
> from spacy.tokens import Doc
> texts = ["London is the capital of the United Kingdom.",
> "The River Thames flows through London.",
> "The famous Tower Bridge crosses the River Thames."]
> docs = list(nlp.pipe(texts))
> c_doc = Doc.from_docs(docs)
> assert str(c_doc) == " ".join(texts)
> assert len(list(c_doc.sents)) == len(docs)
> assert [str(ent) for ent in c_doc.ents] == \
> [str(ent) for doc in docs for ent in doc.ents]
> ```
| Name | Type | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
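
Beyond the example shown in the diff, the keyword arguments can be exercised roughly like this (a sketch based on the new test cases; `nlp` is assumed to be an English pipeline):

```python
from spacy.tokens import Doc

texts = ["Merging the docs is fun.", "They don't think alike."]
docs = list(nlp.pipe(texts))

# No space is inserted between adjacent docs when ensure_whitespace=False.
merged = Doc.from_docs(docs, ensure_whitespace=False)
assert merged.text == "".join(texts)

# Restrict the copied annotations to an explicit attribute list; names are
# resolved via intify_attr, and SPACY is always added if missing.
merged = Doc.from_docs(docs, attrs=["lemma", "length", "pos"])
```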
## Doc.to_disk {#to_disk tag="method" new="2"}
Save the current state to a directory.