Merge pull request #6232 from adrianeboyd/feature/remove-span-recalculate

Remove Span._recalculate_indices
2026-03-05 20:31:30 +03:00 · 2020-10-09 15:54:13 +02:00 · 2020-10-09 15:54:13 +02:00 · c23ce1ae71
commit c23ce1ae71
parent 4771a10503 727370c633
4 changed files with 4 additions and 24 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -608,14 +608,11 @@ def test_doc_init_iob():
        doc = Doc(Vocab(), words=words, ents=ents)


-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
        attrs = {"lemma": "none", "ent_type": "none"}
        retokenizer.merge(doc[0:2], attrs=attrs)
        retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
    assert len(sent1) == init_len - 1
    assert len(sent2) == init_len2 - 1

--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -16,5 +16,4 @@ cdef class Span:
    cdef public _vector
    cdef public _vector_norm

-    cpdef int _recalculate_indices(self) except -1
    cpdef np.ndarray to_array(self, object features)
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -150,7 +150,6 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#len
        """
-        self._recalculate_indices()
        if self.end < self.start:
            return 0
        return self.end - self.start
@@ -167,7 +166,6 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#getitem
        """
-        self._recalculate_indices()
        if isinstance(i, slice):
            start, end = normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#iter
        """
-        self._recalculate_indices()
        for i in range(self.start, self.end):
            yield self.doc[i]

@@ -339,19 +336,6 @@ cdef class Span:
                output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
        return output

-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-        or self.doc.c[self.start].idx != self.start_char \
-        or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
    @property
    def vocab(self):
        """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@ cdef class Span:

        DOCS: https://nightly.spacy.io/api/span#root
        """
-        self._recalculate_indices()
        if "root" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["root"](self)
        # This should probably be called 'head', and the other one called