Make Span/Doc.ents more consistent for ent_kb_id and ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
This commit is contained in:
Adriane Boyd 2022-08-22 20:28:57 +02:00 committed by GitHub
parent 1a5be63715
commit bb0e178878
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 94 additions and 36 deletions

View File

@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
assert [t.ent_iob_ for t in doc] == orig_iobs
def test_ents_clear(en_vocab):
    """Ensure that removing entities clears token attributes.

    Three ways of clearing are exercised: assigning an empty list to
    ``doc.ents`` (tokens end up with IOB code 2), ``set_ents`` with
    ``default="missing"`` (IOB code 0) and ``set_ents`` with
    ``default="blocked"`` (IOB code 3). In each case the entity type,
    entity ID and KB ID of every token must be reset to 0.
    """
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
    # Give the span an ID *and* a KB ID, otherwise the ``ent_kb_id == 0``
    # checks below would pass even if the KB ID were never reset.
    entity = Span(doc, 0, 4, label=391, kb_id="Q1", span_id="TEST")

    def apply_entity():
        # Re-annotate the tokens before each clearing step so that every
        # step starts from populated attributes.
        doc.ents = [entity]
        for token in doc:
            assert token.ent_type == 391
            assert token.ent_id != 0
            assert token.ent_kb_id != 0

    def assert_cleared(expected_iob):
        for token in doc:
            assert token.ent_iob == expected_iob
            assert token.ent_type == 0
            assert token.ent_id == 0
            assert token.ent_kb_id == 0

    apply_entity()
    doc.ents = []
    assert_cleared(2)

    apply_entity()
    doc.set_ents([], default="missing")
    assert_cleared(0)

    # The entity is applied again here; without it the "blocked" case would
    # only ever see tokens that the previous step had already cleared.
    apply_entity()
    doc.set_ents([], default="blocked")
    assert_cleared(3)
def test_add_overlapping_entities(en_vocab):
text = ["Louisiana", "Office", "of", "Conservation"]
doc = Doc(en_vocab, words=text)

View File

@ -692,3 +692,23 @@ def test_span_group_copy(doc):
assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
    """Check that ``Span.id`` and ``Span.ent_id`` stay in sync with tokens.

    Setting ``doc.ents`` must copy the span's ID to ``Token.ent_id`` for the
    tokens inside the span, and ``Span.ent_id`` / ``Span.ent_id_`` must act
    as writable aliases of ``Span.id`` / ``Span.id_``.
    """
    doc = en_tokenizer("a b c d")
    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
    span = doc.ents[0]
    assert doc[1].ent_id_ == "ID0"
    # every token of the span gets the ID, tokens outside of it get none
    assert [token.ent_id_ for token in doc] == ["", "ID0", "ID0", ""]

    # setting Span.id sets Token.ent_id
    span.id_ = "ID1"
    doc.ents = [span]
    assert doc.ents[0].ent_id_ == "ID1"
    assert doc[1].ent_id_ == "ID1"

    # Span.ent_id is an alias of Span.id
    span.ent_id_ = "ID2"
    # check the alias directly on the span, for both string and hash forms
    assert span.id_ == "ID2"
    assert span.ent_id_ == span.id_
    assert span.ent_id == span.id
    doc.ents = [span]
    assert doc.ents[0].ent_id_ == "ID2"
    assert doc[1].ent_id_ == "ID2"

View File

@ -808,27 +808,33 @@ cdef class Doc:
self.c[i].ent_iob = 1
self.c[i].ent_type = span.label
self.c[i].ent_kb_id = span.kb_id
# for backwards compatibility in v3, only set ent_id from
# span.id if it's set, otherwise don't override
self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
self.c[i].ent_id = span.id
for span in blocked:
for i in range(span.start, span.end):
self.c[i].ent_iob = 3
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in missing:
for i in range(span.start, span.end):
self.c[i].ent_iob = 0
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in outside:
for i in range(span.start, span.end):
self.c[i].ent_iob = 2
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
# Set tokens outside of all provided spans
if default != SetEntsDefault.unmodified:
for i in range(self.length):
if i not in seen_tokens:
self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
if default == SetEntsDefault.outside:
self.c[i].ent_iob = 2
elif default == SetEntsDefault.missing:

View File

@ -115,17 +115,23 @@ class Span:
end: int
start_char: int
end_char: int
label: int
kb_id: int
ent_id: int
ent_id_: str
@property
def label(self) -> int: ...
@property
def kb_id(self) -> int: ...
@property
def id(self) -> int: ...
@property
def id_(self) -> str: ...
def ent_id(self) -> int: ...
@property
def orth_(self) -> str: ...
@property
def lemma_(self) -> str: ...
label_: str
kb_id_: str
@property
def label_(self) -> str: ...
@property
def kb_id_(self) -> str: ...
@property
def id_(self) -> str: ...
@property
def ent_id_(self) -> str: ...

View File

@ -802,28 +802,18 @@ cdef class Span:
property id:
def __get__(self):
cdef SpanC* span_c = self.span_c()
return span_c.id
return self.span_c().id
def __set__(self, attr_t id):
cdef SpanC* span_c = self.span_c()
span_c.id = id
self.span_c().id = id
property ent_id:
"""RETURNS (uint64): The entity ID."""
"""Alias for the span's ID."""
def __get__(self):
return self.root.ent_id
return self.id
def __set__(self, hash_t key):
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
property ent_id_:
"""RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
def __set__(self, attr_t ent_id):
self.id = ent_id
@property
def orth_(self):
@ -839,7 +829,7 @@ cdef class Span:
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_:
"""RETURNS (str): The span's label."""
"""The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@ -847,7 +837,7 @@ cdef class Span:
self.label = self.doc.vocab.strings.add(label_)
property kb_id_:
"""RETURNS (str): The span's KB ID."""
"""The span's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
@ -855,13 +845,22 @@ cdef class Span:
self.kb_id = self.doc.vocab.strings.add(kb_id_)
property id_:
"""RETURNS (str): The span's ID."""
"""The span's ID."""
def __get__(self):
return self.doc.vocab.strings[self.id]
def __set__(self, str id_):
self.id = self.doc.vocab.strings.add(id_)
property ent_id_:
"""Alias for the span's ID."""
def __get__(self):
return self.id_
def __set__(self, str ent_id_):
self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are

View File

@ -561,8 +561,8 @@ overlaps with will be returned.
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ |
| `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ |
| `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
| `id` | The hash value of the span's ID. ~~int~~ |
| `id_` | The span's ID. ~~str~~ |
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |

View File

@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. ~~str~~ |
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |

View File

@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
ruler.add_patterns(patterns)
doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
patterns, the `ent_id_` property of the matched entity is set to the `id` given
patterns, the `id_` property of the matched entity is set to the `id` given
in the patterns. So in the example above it's easy to identify that "San
Francisco" and "San Fran" are both the same entity.