mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Modify setting missing and blocked entity tokens
In order to make it easier to construct `Doc` objects as training data, modify how missing and blocked entity tokens are set to prioritize setting `O` and missing entity tokens for training purposes over setting blocked entity tokens. * `Doc.ents` setter sets tokens outside entity spans to `O` regardless of the current state of each token * For `Doc.ents`, setting a span with a missing label sets the `ent_iob` to missing instead of blocked * `Doc.block_ents(spans)` marks spans as hard `O` for use with the `EntityRecognizer`
This commit is contained in:
parent
8303d101a5
commit
8b650f3a78
|
@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
assert len(tokens.ents) == 0
|
||||
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
|
||||
assert len(list(tokens.ents)) == 1
|
||||
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
|
||||
assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
|
||||
assert tokens.ents[0].label_ == "PRODUCT"
|
||||
assert tokens.ents[0].start == 2
|
||||
assert tokens.ents[0].end == 4
|
||||
|
@ -426,7 +426,7 @@ def test_has_annotation(en_vocab):
|
|||
doc[0].lemma_ = "a"
|
||||
doc[0].dep_ = "dep"
|
||||
doc[0].head = doc[1]
|
||||
doc.ents = [Span(doc, 0, 1, label="HELLO")]
|
||||
doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")]
|
||||
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
|
@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer):
|
|||
doc.is_nered
|
||||
with pytest.deprecated_call():
|
||||
doc.is_sentenced
|
||||
|
||||
|
||||
def test_block_ents(en_tokenizer):
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.block_ents([doc[1:2], doc[3:5]])
|
||||
assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
|
||||
assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
|
||||
assert doc.ents == tuple()
|
||||
|
||||
# invalid IOB repaired
|
||||
doc.ents = [Span(doc, 3, 5, "ENT")]
|
||||
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
|
||||
doc.block_ents([doc[3:4]])
|
||||
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
|
||||
|
|
|
@ -168,7 +168,7 @@ def test_accept_blocked_token():
|
|||
ner2 = nlp2.create_pipe("ner", config=config)
|
||||
|
||||
# set "New York" to a blocked entity
|
||||
doc2.ents = [(0, 3, 5)]
|
||||
doc2.block_ents([doc2[3:5]])
|
||||
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
|
||||
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
|
||||
|
||||
|
@ -358,5 +358,5 @@ class BlockerComponent1:
|
|||
self.name = name
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.ents = [(0, self.start, self.end)]
|
||||
doc.block_ents([doc[self.start:self.end]])
|
||||
return doc
|
||||
|
|
|
@ -590,17 +590,16 @@ cdef class Doc:
|
|||
entity_type = 0
|
||||
kb_id = 0
|
||||
|
||||
# Set ent_iob to Missing (0) by default unless this token was nered before
|
||||
ent_iob = 0
|
||||
if self.c[i].ent_iob != 0:
|
||||
ent_iob = 2
|
||||
# Set ent_iob to Outside (2) by default
|
||||
ent_iob = 2
|
||||
|
||||
# overwrite if the token was part of a specified entity
|
||||
if i in tokens_in_ents.keys():
|
||||
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
|
||||
if entity_type is None or entity_type <= 0:
|
||||
# Blocking this token from being overwritten by downstream NER
|
||||
ent_iob = 3
|
||||
# Empty label: Missing, unset this token
|
||||
ent_iob = 0
|
||||
entity_type = 0
|
||||
elif ent_start == i:
|
||||
# Marking the start of an entity
|
||||
ent_iob = 3
|
||||
|
@ -612,6 +611,20 @@ cdef class Doc:
|
|||
self.c[i].ent_kb_id = kb_id
|
||||
self.c[i].ent_iob = ent_iob
|
||||
|
||||
def block_ents(self, spans):
|
||||
"""Mark spans as never an entity for the EntityRecognizer.
|
||||
|
||||
spans (List[Span]): The spans to block as never entities.
|
||||
"""
|
||||
for span in spans:
|
||||
for i in range(span.start, span.end):
|
||||
self.c[i].ent_iob = 3
|
||||
self.c[i].ent_type = 0
|
||||
# if the following token is I, set to B
|
||||
if span.end < self.length:
|
||||
if self.c[span.end].ent_iob == 1:
|
||||
self.c[span.end].ent_iob = 3
|
||||
|
||||
@property
|
||||
def noun_chunks(self):
|
||||
"""Iterate over the base noun phrases in the document. Yields base
|
||||
|
|
|
@ -172,7 +172,7 @@ cdef class Example:
|
|||
return output
|
||||
|
||||
def get_aligned_ner(self):
|
||||
if not self.y.is_nered:
|
||||
if not self.y.has_annotation("ENT_IOB"):
|
||||
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
||||
x_ents = self.get_aligned_spans_y2x(self.y.ents)
|
||||
# Default to 'None' for missing values
|
||||
|
@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data):
|
|||
spans_from_biluo_tags(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], Span):
|
||||
# Ugh, this is super messy. Really hard to set O entities
|
||||
doc.ents = ner_data
|
||||
doc.ents = [span for span in ner_data if span.label_]
|
||||
else:
|
||||
raise ValueError(Errors.E973)
|
||||
|
||||
|
|
|
@ -182,22 +182,18 @@ def tags_to_entities(tags):
|
|||
entities = []
|
||||
start = None
|
||||
for i, tag in enumerate(tags):
|
||||
if tag is None:
|
||||
continue
|
||||
if tag.startswith("O"):
|
||||
if tag is None or tag.startswith("-"):
|
||||
# TODO: We shouldn't be getting these malformed inputs. Fix this.
|
||||
if start is not None:
|
||||
start = None
|
||||
else:
|
||||
entities.append(("", i, i))
|
||||
continue
|
||||
elif tag == "-":
|
||||
continue
|
||||
elif tag.startswith("O"):
|
||||
pass
|
||||
elif tag.startswith("I"):
|
||||
if start is None:
|
||||
raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
|
||||
continue
|
||||
if tag.startswith("U"):
|
||||
elif tag.startswith("U"):
|
||||
entities.append((tag[2:], i, i))
|
||||
elif tag.startswith("B"):
|
||||
start = i
|
||||
|
|
Loading…
Reference in New Issue
Block a user