From fbbbda195446be61eb8a3ea4930884e059c28e64 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 2 Aug 2021 19:07:19 +0200 Subject: [PATCH] Fix start/end chars for empty and out-of-bounds spans (#8816) --- spacy/tests/doc/test_span.py | 30 ++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 9 +++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 6e34f2126..01b022b9d 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -357,6 +357,9 @@ def test_span_eq_hash(doc, doc_not_parsed): assert hash(doc[0:2]) != hash(doc[1:3]) assert hash(doc[0:2]) != hash(doc_not_parsed[0:2]) + # check that an out-of-bounds span is not equivalent to the span of the full doc + assert doc[0 : len(doc)] != doc[len(doc) : len(doc) + 1] + def test_span_boundaries(doc): start = 1 @@ -369,6 +372,33 @@ def test_span_boundaries(doc): with pytest.raises(IndexError): span[5] + empty_span_0 = doc[0:0] + assert empty_span_0.text == "" + assert empty_span_0.start == 0 + assert empty_span_0.end == 0 + assert empty_span_0.start_char == 0 + assert empty_span_0.end_char == 0 + + empty_span_1 = doc[1:1] + assert empty_span_1.text == "" + assert empty_span_1.start == 1 + assert empty_span_1.end == 1 + assert empty_span_1.start_char == empty_span_1.end_char + + oob_span_start = doc[-len(doc) - 1 : -len(doc) - 10] + assert oob_span_start.text == "" + assert oob_span_start.start == 0 + assert oob_span_start.end == 0 + assert oob_span_start.start_char == 0 + assert oob_span_start.end_char == 0 + + oob_span_end = doc[len(doc) + 1 : len(doc) + 10] + assert oob_span_end.text == "" + assert oob_span_end.start == len(doc) + assert oob_span_end.end == len(doc) + assert oob_span_end.start_char == len(doc.text) + assert oob_span_end.end_char == len(doc.text) + def test_span_lemma(doc): # span lemmas should have the same number of spaces as the span diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx 
index 093b2a4da..48c6053c1 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -105,13 +105,18 @@ cdef class Span: if label not in doc.vocab.strings: raise ValueError(Errors.E084.format(label=label)) + start_char = doc[start].idx if start < doc.length else len(doc.text) + if start == end: + end_char = start_char + else: + end_char = doc[end - 1].idx + len(doc[end - 1]) self.c = SpanC( label=label, kb_id=kb_id, start=start, end=end, - start_char=doc[start].idx if start < doc.length else 0, - end_char=doc[end - 1].idx + len(doc[end - 1]) if end >= 1 else 0, + start_char=start_char, + end_char=end_char, ) self._vector = vector self._vector_norm = vector_norm