Make Span/Doc.ents more consistent for ent_kb_id and ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view of the root token's `ent_id` annotation
2025-07-15 18:52:29 +03:00 · 2022-08-22 20:28:57 +02:00 · 2022-08-22 20:28:57 +02:00 · bb0e178878
commit bb0e178878
parent 1a5be63715
8 changed files with 94 additions and 36 deletions
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
    assert [t.ent_iob_ for t in doc] == orig_iobs
 def test_ents_clear(en_vocab):
    """Ensure that removing entities clears token attributes.

    Every clearing path (the `doc.ents` setter, and `Doc.set_ents` with the
    "missing" and "blocked" defaults) starts from a doc whose tokens actually
    carry entity annotation, so the reset assertions cannot pass trivially on
    tokens that were already cleared by an earlier step.
    """
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
    # Give the span a KB ID as well as a span ID so that both Token.ent_kb_id
    # and Token.ent_id are non-zero before each reset.
    entity = Span(doc, 0, 4, label=391, kb_id="Q1588", span_id="TEST")

    # Precondition: assigning the entity populates the token-level attributes.
    doc.ents = [entity]
    for token in doc:
        assert token.ent_type == 391
        assert token.ent_id_ == "TEST"
        assert token.ent_kb_id_ == "Q1588"

    # Clearing through the doc.ents setter marks all tokens as outside (O).
    doc.ents = []
    for token in doc:
        assert token.ent_iob == 2
        assert token.ent_type == 0
        assert token.ent_id == 0
        assert token.ent_kb_id == 0

    # Clearing through set_ents: re-apply the entity before each default so
    # there is always annotation to remove.
    for default, expected_iob in (("missing", 0), ("blocked", 3)):
        doc.ents = [entity]
        doc.set_ents([], default=default)
        for token in doc:
            assert token.ent_iob == expected_iob
            assert token.ent_type == 0
            assert token.ent_id == 0
            assert token.ent_kb_id == 0
 def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -692,3 +692,23 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
    """Check that a span's ID reaches Token.ent_id when assigned to Doc.ents."""
    doc = en_tokenizer("a b c d")
    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
    ent = doc.ents[0]
    assert doc[1].ent_id_ == "ID0"
    # Writing through Span.id_ sets Token.ent_id, and Span.ent_id_ is an alias
    # of Span.id_, so both attributes must behave the same once the span is
    # assigned back to doc.ents.
    for attr_name, new_id in (("id_", "ID1"), ("ent_id_", "ID2")):
        setattr(ent, attr_name, new_id)
        doc.ents = [ent]
        assert doc.ents[0].ent_id_ == new_id
        assert doc[1].ent_id_ == new_id
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -808,27 +808,33 @@ cdef class Doc:
                    self.c[i].ent_iob = 1
                self.c[i].ent_type = span.label
                self.c[i].ent_kb_id = span.kb_id
-                # for backwards compatibility in v3, only set ent_id from
+                self.c[i].ent_id = span.id
                # span.id if it's set, otherwise don't override
                self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
        for span in blocked:
            for i in range(span.start, span.end):
                self.c[i].ent_iob = 3
                self.c[i].ent_type = 0
                self.c[i].ent_kb_id = 0
                self.c[i].ent_id = 0
        for span in missing:
            for i in range(span.start, span.end):
                self.c[i].ent_iob = 0
                self.c[i].ent_type = 0
                self.c[i].ent_kb_id = 0
                self.c[i].ent_id = 0
        for span in outside:
            for i in range(span.start, span.end):
                self.c[i].ent_iob = 2
                self.c[i].ent_type = 0
                self.c[i].ent_kb_id = 0
                self.c[i].ent_id = 0
        # Set tokens outside of all provided spans
        if default != SetEntsDefault.unmodified:
            for i in range(self.length):
                if i not in seen_tokens:
                    self.c[i].ent_type = 0
                    self.c[i].ent_kb_id = 0
                    self.c[i].ent_id = 0
                    if default == SetEntsDefault.outside:
                        self.c[i].ent_iob = 2
                    elif default == SetEntsDefault.missing:
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@ -115,17 +115,23 @@ class Span:
    end: int
    start_char: int
    end_char: int
-    label: int
+    @property
-    kb_id: int
+    def label(self) -> int: ...
-    ent_id: int
+    @property
-    ent_id_: str
+    def kb_id(self) -> int: ...
    @property
    def id(self) -> int: ...
    @property
-    def id_(self) -> str: ...
+    def ent_id(self) -> int: ...
    @property
    def orth_(self) -> str: ...
    @property
    def lemma_(self) -> str: ...
-    label_: str
+    @property
-    kb_id_: str
+    def label_(self) -> str: ...
    @property
    def kb_id_(self) -> str: ...
    @property
    def id_(self) -> str: ...
    @property
    def ent_id_(self) -> str: ...
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -802,28 +802,18 @@ cdef class Span:
    property id:
        def __get__(self):
-            cdef SpanC* span_c = self.span_c()
+            return self.span_c().id
            return span_c.id
        def __set__(self, attr_t id):
-            cdef SpanC* span_c = self.span_c()
+            self.span_c().id = id
            span_c.id = id
    property ent_id:
-        """RETURNS (uint64): The entity ID."""
+        """Alias for the span's ID."""
        def __get__(self):
-            return self.root.ent_id
+            return self.id
-        def __set__(self, hash_t key):
+        def __set__(self, attr_t ent_id):
-            raise NotImplementedError(Errors.E200.format(attr="ent_id"))
+            self.id = ent_id
    property ent_id_:
        """RETURNS (str): The (string) entity ID."""
        def __get__(self):
            return self.root.ent_id_
        def __set__(self, str key):
            raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
    @property
    def orth_(self):
@ -839,7 +829,7 @@ cdef class Span:
        return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
    property label_:
-        """RETURNS (str): The span's label."""
+        """The span's label."""
        def __get__(self):
            return self.doc.vocab.strings[self.label]
@ -847,7 +837,7 @@ cdef class Span:
            self.label = self.doc.vocab.strings.add(label_)
    property kb_id_:
-        """RETURNS (str): The span's KB ID."""
+        """The span's KB ID."""
        def __get__(self):
            return self.doc.vocab.strings[self.kb_id]
@ -855,13 +845,22 @@ cdef class Span:
            self.kb_id = self.doc.vocab.strings.add(kb_id_)
    property id_:
-        """RETURNS (str): The span's ID."""
+        """The span's ID."""
        def __get__(self):
            return self.doc.vocab.strings[self.id]
        def __set__(self, str id_):
            self.id = self.doc.vocab.strings.add(id_)
    property ent_id_:
        """Alias for the span's ID.

        Reading and writing this property is forwarded to `id_`; no separate
        value is stored for it.
        """
        def __get__(self):
            # Same value as `id_`: the span's ID hash resolved to its string
            # through the vocab's string store.
            return self.id_
        def __set__(self, str ent_id_):
            # Forward to the `id_` setter, which adds the string to the
            # vocab's string store and stores the resulting hash as `id`.
            self.id_ = ent_id_
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    # Don't allow spaces to be the root, if there are
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -561,8 +561,8 @@ overlaps with will be returned.
 | `lemma_`                                | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~                |
 | `kb_id`                                 | The hash value of the knowledge base ID referred to by the span. ~~int~~                                                      |
 | `kb_id_`                                | The knowledge base ID referred to by the span. ~~str~~                                                                        |
-| `ent_id`                                | The hash value of the named entity the root token is an instance of. ~~int~~                                                  |
+| `ent_id`                                | Alias for `id`: the hash value of the span's ID. ~~int~~                                                                      |
-| `ent_id_`                               | The string ID of the named entity the root token is an instance of. ~~str~~                                                   |
+| `ent_id_`                               | Alias for `id_`: the span's ID. ~~str~~                                                                                       |
 | `id`                                    | The hash value of the span's ID. ~~int~~                                                                                      |
 | `id_`                                   | The span's ID. ~~str~~                                                                                                        |
 | `sentiment`                             | A scalar value indicating the positivity or negativity of the span. ~~float~~                                                 |
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
 | `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                  |
 | `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                           |
 | `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                           |
-| `ent_id`                                     | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                        |
+| `ent_id`                                     | ID of the entity the token is an instance of, if any. ~~int~~                                                                                                                                                                                                        |
-| `ent_id_`                                    | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                        |
+| `ent_id_`                                    | ID of the entity the token is an instance of, if any. ~~str~~                                                                                                                                                                                                        |
 | `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                       |
 | `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                       |
 | `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                   |
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
 ruler.add_patterns(patterns)
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
 doc2 = nlp("Apple is opening its first big office in San Fran.")
-print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
 ```
 If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
-patterns, the `ent_id_` property of the matched entity is set to the `id` given
+patterns, the `id_` property of the matched entity is set to the `id` given
 in the patterns. So in the example above it's easy to identify that "San
 Francisco" and "San Fran" are both the same entity.