Merge pull request #6089 from adrianeboyd/feature/doc-ents-v3-2

This commit is contained in:
Ines Montani 2020-09-24 14:44:42 +02:00 committed by GitHub
commit 58dde293ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 221 additions and 59 deletions

View File

@ -696,6 +696,12 @@ class Errors:
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
"through token.morph_ instead or add the string to the "
"StringStore with `nlp.vocab.strings.add(string)`.")
E1010 = ("Unable to set entity information for token {i} which is included "
"in more than one span in entities, blocked, missing or outside.")
E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
"options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.")
@add_codes

View File

@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner.begin_training(lambda: [_ner_example(ner)])
ner(doc)
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
doc.ents = [("ANIMAL", 3, 4)]
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
doc.ents = [("WORD", 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]

View File

@ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer):
assert len(tokens.ents) == 0
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
assert len(list(tokens.ents)) == 1
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
assert tokens.ents[0].label_ == "PRODUCT"
assert tokens.ents[0].start == 2
assert tokens.ents[0].end == 4
@ -427,7 +427,7 @@ def test_has_annotation(en_vocab):
doc[0].lemma_ = "a"
doc[0].dep_ = "dep"
doc[0].head = doc[1]
doc.ents = [Span(doc, 0, 1, label="HELLO")]
doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
for attr in attrs:
assert doc.has_annotation(attr)
@ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer):
doc.is_sentenced
def test_doc_set_ents():
def test_doc_set_ents(en_tokenizer):
# set ents
doc = en_tokenizer("a b c d e")
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
# add ents, invalid IOB repaired
doc = en_tokenizer("a b c d e")
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
# missing ents
doc = en_tokenizer("a b c d e")
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
# outside ents
doc = en_tokenizer("a b c d e")
doc.set_ents(
[Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
outside=[doc[4:5]],
default="missing",
)
assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
# blocked ents
doc = en_tokenizer("a b c d e")
doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
assert doc.ents == tuple()
# invalid IOB repaired after blocked
doc.ents = [Span(doc, 3, 5, "ENT")]
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
# all types
doc = en_tokenizer("a b c d e")
doc.set_ents(
[Span(doc, 0, 1, 10)],
blocked=[doc[1:2]],
missing=[doc[2:3]],
outside=[doc[3:4]],
default="unmodified",
)
assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
doc = en_tokenizer("a b c d e")
# single span instead of a list
with pytest.raises(ValueError):
doc.set_ents([], missing=doc[1:2])
# invalid default mode
with pytest.raises(ValueError):
doc.set_ents([], missing=[doc[1:2]], default="none")
# conflicting/overlapping specifications
with pytest.raises(ValueError):
doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
def test_doc_ents_setter():
"""Test that both strings and integers can be used to set entities in
tuple format via doc.ents."""
words = ["a", "b", "c", "d", "e"]

View File

@ -168,7 +168,7 @@ def test_accept_blocked_token():
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
doc2.ents = [(0, 3, 5)]
doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name
def __call__(self, doc):
doc.ents = [(0, self.start, self.end)]
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
return doc

View File

@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter
from enum import Enum
import itertools
import numpy
import srsly
from thinc.api import get_array_module
@ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
return get_token_attr(token, feat_name)
class SetEntsDefault(str, Enum):
blocked = "blocked"
missing = "missing"
outside = "outside"
unmodified = "unmodified"
@classmethod
def values(cls):
return list(cls.__members__.keys())
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
@ -660,50 +673,100 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
tokens_in_ents = {}
cdef attr_t entity_type
cdef attr_t kb_id
cdef int ent_start, ent_end, token_index
cdef attr_t entity_type, kb_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
if isinstance(entity_type_, str):
self.vocab.strings.add(entity_type_)
entity_type = self.vocab.strings.as_int(entity_type_)
for token_index in range(ent_start, ent_end):
if token_index in tokens_in_ents:
raise ValueError(Errors.E103.format(
span1=(tokens_in_ents[token_index][0],
tokens_in_ents[token_index][1],
self.vocab.strings[tokens_in_ents[token_index][2]]),
span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
ent_spans.append(span)
self.set_ents(ent_spans, default=SetEntsDefault.outside)
def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
"""Set entity annotation.
entities (List[Span]): Spans with labels to set as entities.
blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
entity) for spacy's built-in NER component. Other components may
ignore this setting.
missing (Optional[List[Span]]): Spans with missing/unknown entity
information.
outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
default (str): How to set entity annotation for tokens outside of any
provided spans. Options: "blocked", "missing", "outside" and
"unmodified" (preserve current state). Defaults to "outside".
"""
if default not in SetEntsDefault.values():
raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
# Ignore spans with missing labels
entities = [ent for ent in entities if ent.label > 0]
if blocked is None:
blocked = tuple()
if missing is None:
missing = tuple()
if outside is None:
outside = tuple()
# Find all tokens covered by spans and check that none are overlapping
cdef int i
for i in range(self.length):
# default values
entity_type = 0
kb_id = 0
seen_tokens = set()
for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
if not isinstance(span, Span):
raise ValueError(Errors.E1012.format(span=span))
for i in range(span.start, span.end):
if i in seen_tokens:
raise ValueError(Errors.E1010.format(i=i))
seen_tokens.add(i)
# Set ent_iob to Missing (0) by default unless this token was nered before
ent_iob = 0
if self.c[i].ent_iob != 0:
ent_iob = 2
# overwrite if the token was part of a specified entity
if i in tokens_in_ents.keys():
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
if entity_type is None or entity_type <= 0:
# Blocking this token from being overwritten by downstream NER
ent_iob = 3
elif ent_start == i:
# Marking the start of an entity
ent_iob = 3
# Set all specified entity information
for span in entities:
for i in range(span.start, span.end):
if i == span.start:
self.c[i].ent_iob = 3
else:
# Marking the inside of an entity
ent_iob = 1
self.c[i].ent_iob = 1
self.c[i].ent_type = span.label
self.c[i].ent_kb_id = span.kb_id
for span in blocked:
for i in range(span.start, span.end):
self.c[i].ent_iob = 3
self.c[i].ent_type = 0
for span in missing:
for i in range(span.start, span.end):
self.c[i].ent_iob = 0
self.c[i].ent_type = 0
for span in outside:
for i in range(span.start, span.end):
self.c[i].ent_iob = 2
self.c[i].ent_type = 0
self.c[i].ent_type = entity_type
self.c[i].ent_kb_id = kb_id
self.c[i].ent_iob = ent_iob
# Set tokens outside of all provided spans
if default != SetEntsDefault.unmodified:
for i in range(self.length):
if i not in seen_tokens:
self.c[i].ent_type = 0
if default == SetEntsDefault.outside:
self.c[i].ent_iob = 2
elif default == SetEntsDefault.missing:
self.c[i].ent_iob = 0
elif default == SetEntsDefault.blocked:
self.c[i].ent_iob = 3
# Fix any resulting inconsistent annotation
for i in range(self.length - 1):
# I must follow B or I: convert I to B
if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
self.c[i+1].ent_iob == 1:
self.c[i+1].ent_iob = 3
# Change of type with BI or II: convert second I to B
if self.c[i].ent_type != self.c[i+1].ent_type and \
(self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
self.c[i+1].ent_iob == 1:
self.c[i+1].ent_iob = 3
@property
def noun_chunks(self):

View File

@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
def _add_entities_to_doc(doc, ner_data):
print(ner_data)
if ner_data is None:
return
elif ner_data == []:
@ -303,9 +304,14 @@ def _add_entities_to_doc(doc, ner_data):
biluo_tags_to_spans(doc, ner_data)
)
elif isinstance(ner_data[0], Span):
# Ugh, this is super messy. Really hard to set O entities
doc.ents = ner_data
doc.ents = [span for span in ner_data if span.label_]
entities = []
missing = []
for span in ner_data:
if span.label:
entities.append(span)
else:
missing.append(span)
doc.set_ents(entities, missing=missing)
else:
raise ValueError(Errors.E973)

View File

@ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
token. Each tags string will be of the form of either "", "O" or
token. Each tag string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of Span objects.
RETURNS (list): A sequence of Span objects. Each token with a missing IOB
tag is returned as a Span with an empty label.
"""
token_offsets = tags_to_entities(tags)
spans = []
@ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
entities = []
start = None
for i, tag in enumerate(tags):
if tag is None:
continue
if tag.startswith("O"):
if tag is None or tag.startswith("-"):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
start = None
else:
entities.append(("", i, i))
continue
elif tag == "-":
continue
elif tag.startswith("O"):
pass
elif tag.startswith("I"):
if start is None:
raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
continue
if tag.startswith("U"):
elif tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i

View File

@ -219,6 +219,30 @@ alignment mode `"strict".
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.set_ents {#ents tag="method" new="3"}
Set the named entities in the document.
> #### Example
>
> ```python
> from spacy.tokens import Span
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
> doc.set_ents([Span(doc, 0, 2, "PERSON")])
> ents = list(doc.ents)
> assert ents[0].label_ == "PERSON"
> assert ents[0].text == "Mr. Best"
> ```
| Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| entities | Spans with labels to set as entities. ~~List[Span]~~ |
| _keyword-only_ | |
| blocked | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ |
| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ |
| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ |
| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ |
## Doc.similarity {#similarity tag="method" model="vectors"}
Make a semantic similarity estimate. The default estimate is cosine similarity
@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
> ```python
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents)
> assert ents[0].label == 346
> assert ents[0].label_ == "PERSON"
> assert ents[0].text == "Mr. Best"
> ```