Make Span/Doc.ents more consistent for ent_kb_id and ent_id (#11328)

* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view
of the root token's `ent_id` annotation
This commit is contained in:
Adriane Boyd 2022-08-22 20:28:57 +02:00 committed by GitHub
parent 1a5be63715
commit bb0e178878
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 94 additions and 36 deletions

View File

@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
assert [t.ent_iob_ for t in doc] == orig_iobs assert [t.ent_iob_ for t in doc] == orig_iobs
def test_ents_clear(en_vocab):
    """Ensure that removing entities clears token attributes.

    Three removal paths are exercised: assigning an empty list to
    ``doc.ents``, and calling ``doc.set_ents([])`` with ``default="missing"``
    and with ``default="blocked"``. The entity is re-applied before every
    removal so that each path starts from populated token attributes and
    cannot pass merely because an earlier step already zeroed them.
    """
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
    # Set a KB ID as well as a span ID; otherwise ent_kb_id is 0 from the
    # start and the "cleared" assertions on it would be vacuous.
    entity = Span(doc, 0, 4, label=391, kb_id="KB_TEST", span_id="TEST")

    def apply_entity():
        # Install the entity and confirm the token attributes are populated.
        doc.ents = [entity]
        for token in doc:
            assert token.ent_type == 391
            assert token.ent_id != 0
            assert token.ent_kb_id != 0

    def assert_cleared(expected_iob):
        # Every token must carry the expected IOB code and no entity data.
        for token in doc:
            assert token.ent_iob == expected_iob
            assert token.ent_type == 0
            assert token.ent_id == 0
            assert token.ent_kb_id == 0

    # Assigning an empty list marks all tokens as outside (IOB code 2).
    apply_entity()
    doc.ents = []
    assert_cleared(2)

    # default="missing" resets tokens to the unset IOB code 0.
    apply_entity()
    doc.set_ents([], default="missing")
    assert_cleared(0)

    # default="blocked" marks tokens as blocked (IOB code 3).
    apply_entity()
    doc.set_ents([], default="blocked")
    assert_cleared(3)
def test_add_overlapping_entities(en_vocab): def test_add_overlapping_entities(en_vocab):
text = ["Louisiana", "Office", "of", "Conservation"] text = ["Louisiana", "Office", "of", "Conservation"]
doc = Doc(en_vocab, words=text) doc = Doc(en_vocab, words=text)

View File

@ -692,3 +692,23 @@ def test_span_group_copy(doc):
assert len(doc.spans["test"]) == 3 assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc # check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2 assert len(doc_copy.spans["test"]) == 2
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
    """Check that a span's ID is written to its tokens via ``Doc.ents``.

    ``Span.id_`` is written first, then ``Span.ent_id_`` (its alias). After
    each write the span is re-assigned to ``doc.ents`` and both the entity
    span and the token at index 1 must report the new ID.
    """
    doc = en_tokenizer("a b c d")
    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
    ent = doc.ents[0]
    assert doc[1].ent_id_ == "ID0"

    updates = (
        # setting Span.id sets Token.ent_id
        ("id_", "ID1"),
        # Span.ent_id is an alias of Span.id
        ("ent_id_", "ID2"),
    )
    for attr_name, new_id in updates:
        setattr(ent, attr_name, new_id)
        doc.ents = [ent]
        assert doc.ents[0].ent_id_ == new_id
        assert doc[1].ent_id_ == new_id

View File

@ -808,27 +808,33 @@ cdef class Doc:
self.c[i].ent_iob = 1 self.c[i].ent_iob = 1
self.c[i].ent_type = span.label self.c[i].ent_type = span.label
self.c[i].ent_kb_id = span.kb_id self.c[i].ent_kb_id = span.kb_id
# for backwards compatibility in v3, only set ent_id from self.c[i].ent_id = span.id
# span.id if it's set, otherwise don't override
self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
for span in blocked: for span in blocked:
for i in range(span.start, span.end): for i in range(span.start, span.end):
self.c[i].ent_iob = 3 self.c[i].ent_iob = 3
self.c[i].ent_type = 0 self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in missing: for span in missing:
for i in range(span.start, span.end): for i in range(span.start, span.end):
self.c[i].ent_iob = 0 self.c[i].ent_iob = 0
self.c[i].ent_type = 0 self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
for span in outside: for span in outside:
for i in range(span.start, span.end): for i in range(span.start, span.end):
self.c[i].ent_iob = 2 self.c[i].ent_iob = 2
self.c[i].ent_type = 0 self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
# Set tokens outside of all provided spans # Set tokens outside of all provided spans
if default != SetEntsDefault.unmodified: if default != SetEntsDefault.unmodified:
for i in range(self.length): for i in range(self.length):
if i not in seen_tokens: if i not in seen_tokens:
self.c[i].ent_type = 0 self.c[i].ent_type = 0
self.c[i].ent_kb_id = 0
self.c[i].ent_id = 0
if default == SetEntsDefault.outside: if default == SetEntsDefault.outside:
self.c[i].ent_iob = 2 self.c[i].ent_iob = 2
elif default == SetEntsDefault.missing: elif default == SetEntsDefault.missing:

View File

@ -115,17 +115,23 @@ class Span:
end: int end: int
start_char: int start_char: int
end_char: int end_char: int
label: int @property
kb_id: int def label(self) -> int: ...
ent_id: int @property
ent_id_: str def kb_id(self) -> int: ...
@property @property
def id(self) -> int: ... def id(self) -> int: ...
@property @property
def id_(self) -> str: ... def ent_id(self) -> int: ...
@property @property
def orth_(self) -> str: ... def orth_(self) -> str: ...
@property @property
def lemma_(self) -> str: ... def lemma_(self) -> str: ...
label_: str @property
kb_id_: str def label_(self) -> str: ...
@property
def kb_id_(self) -> str: ...
@property
def id_(self) -> str: ...
@property
def ent_id_(self) -> str: ...

View File

@ -802,28 +802,18 @@ cdef class Span:
property id: property id:
def __get__(self): def __get__(self):
cdef SpanC* span_c = self.span_c() return self.span_c().id
return span_c.id
def __set__(self, attr_t id): def __set__(self, attr_t id):
cdef SpanC* span_c = self.span_c() self.span_c().id = id
span_c.id = id
property ent_id: property ent_id:
"""RETURNS (uint64): The entity ID.""" """Alias for the span's ID."""
def __get__(self): def __get__(self):
return self.root.ent_id return self.id
def __set__(self, hash_t key): def __set__(self, attr_t ent_id):
raise NotImplementedError(Errors.E200.format(attr="ent_id")) self.id = ent_id
property ent_id_:
"""RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
@property @property
def orth_(self): def orth_(self):
@ -839,7 +829,7 @@ cdef class Span:
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_: property label_:
"""RETURNS (str): The span's label.""" """The span's label."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]
@ -847,7 +837,7 @@ cdef class Span:
self.label = self.doc.vocab.strings.add(label_) self.label = self.doc.vocab.strings.add(label_)
property kb_id_: property kb_id_:
"""RETURNS (str): The span's KB ID.""" """The span's KB ID."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.kb_id] return self.doc.vocab.strings[self.kb_id]
@ -855,13 +845,22 @@ cdef class Span:
self.kb_id = self.doc.vocab.strings.add(kb_id_) self.kb_id = self.doc.vocab.strings.add(kb_id_)
property id_: property id_:
"""RETURNS (str): The span's ID.""" """The span's ID."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.id] return self.doc.vocab.strings[self.id]
def __set__(self, str id_): def __set__(self, str id_):
self.id = self.doc.vocab.strings.add(id_) self.id = self.doc.vocab.strings.add(id_)
property ent_id_:
"""Alias for the span's ID."""
def __get__(self):
return self.id_
def __set__(self, str ent_id_):
self.id_ = ent_id_
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
# Don't allow spaces to be the root, if there are # Don't allow spaces to be the root, if there are

View File

@ -561,8 +561,8 @@ overlaps with will be returned.
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ | | `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ | | `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ | | `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ | | `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ |
| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ | | `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
| `id` | The hash value of the span's ID. ~~int~~ | | `id` | The hash value of the span's ID. ~~int~~ |
| `id_` | The span's ID. ~~str~~ | | `id_` | The span's ID. ~~str~~ |
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ | | `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |

View File

@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | | `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | | `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | | `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | | `ent_id` | ID of the entity the token is an instance of, if any. ~~int~~ |
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | | `ent_id_` | ID of the entity the token is an instance of, if any. ~~str~~ |
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | | `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | | `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ | | `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |

View File

@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
doc1 = nlp("Apple is opening its first big office in San Francisco.") doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents]) print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.") doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents]) print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
``` ```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
patterns, the `ent_id_` property of the matched entity is set to the `id` given patterns, the `id_` property of the matched entity is set to the `id` given
in the patterns. So in the example above it's easy to identify that "San in the patterns. So in the example above it's easy to identify that "San
Francisco" and "San Fran" are both the same entity. Francisco" and "San Fran" are both the same entity.