mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Make Span/Doc.ents more consistent for ent_kb_id and ent_id (#11328)
* Map `Span.id` to `Token.ent_id` in all cases when setting `Doc.ents`
* Reset `Token.ent_id` and `Token.ent_kb_id` when setting `Doc.ents`
* Make `Span.ent_id` an alias of `Span.id` rather than a read-only view of the root token's `ent_id` annotation
This commit is contained in:
parent
1a5be63715
commit
bb0e178878
|
@ -45,6 +45,33 @@ def test_ents_reset(en_vocab):
|
||||||
assert [t.ent_iob_ for t in doc] == orig_iobs
|
assert [t.ent_iob_ for t in doc] == orig_iobs
|
||||||
|
|
||||||
|
|
||||||
|
def test_ents_clear(en_vocab):
    """Check that clearing entities resets the token-level entity attributes.

    One entity carrying a label and a span ID is written to the doc and then
    removed in three different ways. After each removal every token must
    report the expected ``ent_iob`` code and zeroed ``ent_type``, ``ent_id``
    and ``ent_kb_id`` values.
    """
    words = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=words)
    ent = Span(doc, 0, 4, label=391, span_id="TEST")

    def check_cleared(expected_iob):
        # Every token must have lost its entity annotation and carry the
        # IOB code that matches the way the entities were removed.
        for tok in doc:
            assert tok.ent_iob == expected_iob
            assert tok.ent_type == 0
            assert tok.ent_id == 0
            assert tok.ent_kb_id == 0

    # Assigning an empty list to Doc.ents: tokens end up with ent_iob == 2.
    doc.ents = [ent]
    doc.ents = []
    check_cleared(2)

    # Clearing through set_ents with default="missing": ent_iob == 0.
    doc.ents = [ent]
    doc.set_ents([], default="missing")
    check_cleared(0)

    # Clearing again with default="blocked" (the entity is deliberately not
    # restored first): ent_iob == 3.
    doc.set_ents([], default="blocked")
    check_cleared(3)
|
||||||
|
|
||||||
|
|
||||||
def test_add_overlapping_entities(en_vocab):
|
def test_add_overlapping_entities(en_vocab):
|
||||||
text = ["Louisiana", "Office", "of", "Conservation"]
|
text = ["Louisiana", "Office", "of", "Conservation"]
|
||||||
doc = Doc(en_vocab, words=text)
|
doc = Doc(en_vocab, words=text)
|
||||||
|
|
|
@ -692,3 +692,23 @@ def test_span_group_copy(doc):
|
||||||
assert len(doc.spans["test"]) == 3
|
assert len(doc.spans["test"]) == 3
|
||||||
# check that the copy spans were not modified and this is an isolated doc
|
# check that the copy spans were not modified and this is an isolated doc
|
||||||
assert len(doc_copy.spans["test"]) == 2
|
assert len(doc_copy.spans["test"]) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
    """Check that a span's ID reaches ``Token.ent_id_`` via ``Doc.ents``.

    The ID is supplied at construction time, then changed through
    ``Span.id_``, then through ``Span.ent_id_``. After each change the span is
    written back to ``doc.ents`` and both the stored entity and the token at
    index 1 must report the new value.
    """
    doc = en_tokenizer("a b c d")
    initial = Span(doc, 1, 3, label="A", span_id="ID0")
    doc.ents = [initial]
    span = doc.ents[0]
    assert doc[1].ent_id_ == "ID0"

    def assert_written_back(expected):
        # Re-assign the (mutated) span and verify both views of the ID.
        doc.ents = [span]
        assert doc.ents[0].ent_id_ == expected
        assert doc[1].ent_id_ == expected

    # setting Span.id sets Token.ent_id
    span.id_ = "ID1"
    assert_written_back("ID1")

    # Span.ent_id is an alias of Span.id
    span.ent_id_ = "ID2"
    assert_written_back("ID2")
|
||||||
|
|
|
@ -808,27 +808,33 @@ cdef class Doc:
|
||||||
self.c[i].ent_iob = 1
|
self.c[i].ent_iob = 1
|
||||||
self.c[i].ent_type = span.label
|
self.c[i].ent_type = span.label
|
||||||
self.c[i].ent_kb_id = span.kb_id
|
self.c[i].ent_kb_id = span.kb_id
|
||||||
# for backwards compatibility in v3, only set ent_id from
|
self.c[i].ent_id = span.id
|
||||||
# span.id if it's set, otherwise don't override
|
|
||||||
self.c[i].ent_id = span.id if span.id else self.c[i].ent_id
|
|
||||||
for span in blocked:
|
for span in blocked:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 3
|
self.c[i].ent_iob = 3
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
for span in missing:
|
for span in missing:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 0
|
self.c[i].ent_iob = 0
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
for span in outside:
|
for span in outside:
|
||||||
for i in range(span.start, span.end):
|
for i in range(span.start, span.end):
|
||||||
self.c[i].ent_iob = 2
|
self.c[i].ent_iob = 2
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
|
|
||||||
# Set tokens outside of all provided spans
|
# Set tokens outside of all provided spans
|
||||||
if default != SetEntsDefault.unmodified:
|
if default != SetEntsDefault.unmodified:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if i not in seen_tokens:
|
if i not in seen_tokens:
|
||||||
self.c[i].ent_type = 0
|
self.c[i].ent_type = 0
|
||||||
|
self.c[i].ent_kb_id = 0
|
||||||
|
self.c[i].ent_id = 0
|
||||||
if default == SetEntsDefault.outside:
|
if default == SetEntsDefault.outside:
|
||||||
self.c[i].ent_iob = 2
|
self.c[i].ent_iob = 2
|
||||||
elif default == SetEntsDefault.missing:
|
elif default == SetEntsDefault.missing:
|
||||||
|
|
|
@ -115,17 +115,23 @@ class Span:
|
||||||
end: int
|
end: int
|
||||||
start_char: int
|
start_char: int
|
||||||
end_char: int
|
end_char: int
|
||||||
label: int
|
@property
|
||||||
kb_id: int
|
def label(self) -> int: ...
|
||||||
ent_id: int
|
@property
|
||||||
ent_id_: str
|
def kb_id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def id(self) -> int: ...
|
def id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def id_(self) -> str: ...
|
def ent_id(self) -> int: ...
|
||||||
@property
|
@property
|
||||||
def orth_(self) -> str: ...
|
def orth_(self) -> str: ...
|
||||||
@property
|
@property
|
||||||
def lemma_(self) -> str: ...
|
def lemma_(self) -> str: ...
|
||||||
label_: str
|
@property
|
||||||
kb_id_: str
|
def label_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def kb_id_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def id_(self) -> str: ...
|
||||||
|
@property
|
||||||
|
def ent_id_(self) -> str: ...
|
||||||
|
|
|
@ -802,28 +802,18 @@ cdef class Span:
|
||||||
|
|
||||||
property id:
|
property id:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef SpanC* span_c = self.span_c()
|
return self.span_c().id
|
||||||
return span_c.id
|
|
||||||
|
|
||||||
def __set__(self, attr_t id):
|
def __set__(self, attr_t id):
|
||||||
cdef SpanC* span_c = self.span_c()
|
self.span_c().id = id
|
||||||
span_c.id = id
|
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""Alias for the span's ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
def __set__(self, attr_t ent_id):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
self.id = ent_id
|
||||||
|
|
||||||
property ent_id_:
|
|
||||||
"""RETURNS (str): The (string) entity ID."""
|
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id_
|
|
||||||
|
|
||||||
def __set__(self, str key):
|
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
|
@ -839,7 +829,7 @@ cdef class Span:
|
||||||
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
||||||
|
|
||||||
property label_:
|
property label_:
|
||||||
"""RETURNS (str): The span's label."""
|
"""The span's label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
@ -847,7 +837,7 @@ cdef class Span:
|
||||||
self.label = self.doc.vocab.strings.add(label_)
|
self.label = self.doc.vocab.strings.add(label_)
|
||||||
|
|
||||||
property kb_id_:
|
property kb_id_:
|
||||||
"""RETURNS (str): The span's KB ID."""
|
"""The span's KB ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.kb_id]
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
|
||||||
|
@ -855,13 +845,22 @@ cdef class Span:
|
||||||
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
||||||
|
|
||||||
property id_:
|
property id_:
|
||||||
"""RETURNS (str): The span's ID."""
|
"""The span's ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.id]
|
return self.doc.vocab.strings[self.id]
|
||||||
|
|
||||||
def __set__(self, str id_):
|
def __set__(self, str id_):
|
||||||
self.id = self.doc.vocab.strings.add(id_)
|
self.id = self.doc.vocab.strings.add(id_)
|
||||||
|
|
||||||
|
property ent_id_:
|
||||||
|
"""Alias for the span's ID."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.id_
|
||||||
|
|
||||||
|
def __set__(self, str ent_id_):
|
||||||
|
self.id_ = ent_id_
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||||
# Don't allow spaces to be the root, if there are
|
# Don't allow spaces to be the root, if there are
|
||||||
|
|
|
@ -561,8 +561,8 @@ overlaps with will be returned.
|
||||||
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
|
| `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
|
||||||
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
|
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
|
||||||
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
|
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
|
||||||
| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ |
|
| `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ |
|
||||||
| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ |
|
| `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ |
|
||||||
| `id` | The hash value of the span's ID. ~~int~~ |
|
| `id` | The hash value of the span's ID. ~~int~~ |
|
||||||
| `id_` | The span's ID. ~~str~~ |
|
| `id_` | The span's ID. ~~str~~ |
|
||||||
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
|
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
|
||||||
|
|
|
@ -425,8 +425,8 @@ The L2 norm of the token's vector representation.
|
||||||
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
||||||
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
||||||
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
||||||
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
|
| `ent_id` | ID of the entity the token is an instance of, if any. ~~int~~ |
|
||||||
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
|
| `ent_id_` | ID of the entity the token is an instance of, if any. ~~str~~ |
|
||||||
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
||||||
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
||||||
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
|
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ |
|
||||||
|
|
|
@ -1367,14 +1367,14 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
||||||
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents])
|
||||||
|
|
||||||
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
||||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
|
print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
|
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
|
||||||
patterns, the `ent_id_` property of the matched entity is set to the `id` given
|
patterns, the `id_` property of the matched entity is set to the `id` given
|
||||||
in the patterns. So in the example above it's easy to identify that "San
|
in the patterns. So in the example above it's easy to identify that "San
|
||||||
Francisco" and "San Fran" are both the same entity.
|
Francisco" and "San Fran" are both the same entity.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user