From 139f655f344cff20d9535455a4b72f7a76c90748 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 29 Mar 2021 13:34:01 +0200
Subject: [PATCH] Merge doc.spans in Doc.from_docs() (#7497)

Merge data from `doc.spans` in `Doc.from_docs()`.

* Fix internal character offset set when merging empty docs (only affects
  tokens and spans in `user_data` if an empty doc is in the list of docs)
---
 spacy/errors.py                 |  3 +++
 spacy/tests/doc/test_doc_api.py |  9 +++++++++
 spacy/tokens/doc.pyx            | 30 ++++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d8c5cc3a8..289d2cfed 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -497,6 +497,9 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
+            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
+            "issue: https://github.com/explosion/spaCy/issues")
     E874 = ("Could not initialize the tok2vec model from component "
             "'{component}' and layer '{layer}'.")
     E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index c27139d2f..0b915513f 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -352,6 +352,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
+    en_docs[0].spans["group"] = [en_docs[0][1:4]]
+    en_docs[2].spans["group"] = [en_docs[2][1:4]]
+    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
     docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
     expected = (True, None, None, None)
@@ -377,6 +380,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         # not callable, because it was not set via set_extension
         m_doc[2]._.is_ambiguous
     assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
@@ -388,6 +393,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
@@ -399,6 +406,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
 
 def test_doc_api_from_docs_ents(en_tokenizer):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 850036483..69f900297 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -6,7 +6,7 @@ from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
 
 import copy
-from collections import Counter
+from collections import Counter, defaultdict
 from enum import Enum
 import itertools
 import numpy
@@ -1120,6 +1120,7 @@ cdef class Doc:
         concat_words = []
         concat_spaces = []
         concat_user_data = {}
+        concat_spans = defaultdict(list)
         char_offset = 0
         for doc in docs:
             concat_words.extend(t.text for t in doc)
@@ -1137,8 +1138,17 @@ cdef class Doc:
                         warnings.warn(Warnings.W101.format(name=name))
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
+            for key in doc.spans:
+                for span in doc.spans[key]:
+                    concat_spans[key].append((
+                        span.start_char + char_offset,
+                        span.end_char + char_offset,
+                        span.label,
+                        span.kb_id,
+                        span.text, # included as a check
+                    ))
             char_offset += len(doc.text)
-            if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
@@ -1160,6 +1170,22 @@ cdef class Doc:
 
         concat_doc.from_array(attrs, concat_array)
 
+        for key in concat_spans:
+            if key not in concat_doc.spans:
+                concat_doc.spans[key] = []
+            for span_tuple in concat_spans[key]:
+                span = concat_doc.char_span(
+                    span_tuple[0],
+                    span_tuple[1],
+                    label=span_tuple[2],
+                    kb_id=span_tuple[3],
+                )
+                text = span_tuple[4]
+                if span is not None and span.text == text:
+                    concat_doc.spans[key].append(span)
+                else:
+                    raise ValueError(Errors.E873.format(key=key, text=text))
+
        return concat_doc
 
     def get_lca_matrix(self):
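
A minimal usage sketch (not part of the patch) of the behavior this change
adds: span groups stored under `doc.spans` are carried over by
`Doc.from_docs()`, with character offsets remapped for each input doc. The
"group" key mirrors the new tests; the blank pipeline and example texts are
illustrative assumptions.

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc1 = nlp("Merge the docs.")
    doc2 = nlp("They feel merged already.")

    # Store a span group under the same key on each input doc.
    doc1.spans["group"] = [doc1[1:3]]  # "the docs"
    doc2.spans["group"] = [doc2[2:4]]  # "merged already"

    # The merged doc now contains both spans, shifted to the new offsets.
    merged = Doc.from_docs([doc1, doc2])
    assert "group" in merged.spans
    print([span.text for span in merged.spans["group"]])
    # ['the docs', 'merged already']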