From ac640f5118e2abbdfe03674002328191a58654e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2020 15:24:49 +0200 Subject: [PATCH] Revert "Remove Span._recalculate_indices" This reverts commit 727370c633b37457ddbedc80aecf07e1dc2c967d. --- spacy/tests/doc/test_doc_api.py | 9 ++++++--- spacy/tests/doc/test_retokenize_merge.py | 1 - spacy/tokens/span.pxd | 1 + spacy/tokens/span.pyx | 17 +++++++++++++++++ 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index db8a6d1c4..ea832c136 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -608,11 +608,14 @@ def test_doc_init_iob(): doc = Doc(Vocab(), words=words, ents=ents) -def test_doc_set_ents_invalid_spans(en_tokenizer): +@pytest.mark.xfail +def test_doc_set_ents_spans(en_tokenizer): doc = en_tokenizer("Some text about Colombia and the Czech Republic") spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")] with doc.retokenize() as retokenizer: for span in spans: retokenizer.merge(span) - with pytest.raises(IndexError): - doc.ents = spans + # If this line is uncommented, it works: + # print(spans) + doc.ents = spans + assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"] diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index b483255c8..cb886545a 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -336,7 +336,6 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): attrs = {"lemma": "none", "ent_type": "none"} retokenizer.merge(doc[0:2], attrs=attrs) retokenizer.merge(doc[-2:], attrs=attrs) - sent1, sent2 = list(doc.sents) assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index cc6b908bb..f6f88a23e 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -16,4 
+16,5 @@ cdef class Span: cdef public _vector cdef public _vector_norm + cpdef int _recalculate_indices(self) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 491ba0266..64c3c7df0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -150,6 +150,7 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#len """ + self._recalculate_indices() if self.end < self.start: return 0 return self.end - self.start @@ -166,6 +167,7 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#getitem """ + self._recalculate_indices() if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self.doc, start + self.start, end + self.start) @@ -186,6 +188,7 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#iter """ + self._recalculate_indices() for i in range(self.start, self.end): yield self.doc[i] @@ -336,6 +339,19 @@ cdef class Span: output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature) return output + cpdef int _recalculate_indices(self) except -1: + if self.end > self.doc.length \ + or self.doc.c[self.start].idx != self.start_char \ + or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: + start = token_by_start(self.doc.c, self.doc.length, self.start_char) + if start == -1: + raise IndexError(Errors.E036.format(start=self.start_char)) + end = token_by_end(self.doc.c, self.doc.length, self.end_char) + if end == -1: + raise IndexError(Errors.E037.format(end=self.end_char)) + self.start = start + self.end = end + 1 + @property def vocab(self): """RETURNS (Vocab): The Span's Doc's vocab.""" @@ -504,6 +520,7 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#root """ + self._recalculate_indices() if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called