mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Merge pull request #6089 from adrianeboyd/feature/doc-ents-v3-2
This commit is contained in:
commit
58dde293ce
|
@ -696,6 +696,12 @@ class Errors:
|
|||
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
|
||||
"through token.morph_ instead or add the string to the "
|
||||
"StringStore with `nlp.vocab.strings.add(string)`.")
|
||||
E1010 = ("Unable to set entity information for token {i} which is included "
|
||||
"in more than one span in entities, blocked, missing or outside.")
|
||||
E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
|
||||
"options: {modes}")
|
||||
E1012 = ("Entity spans and blocked/missing/outside spans should be "
|
||||
"provided to doc.set_ents as lists of `Span` objects.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
|||
ner.begin_training(lambda: [_ner_example(ner)])
|
||||
ner(doc)
|
||||
|
||||
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||
doc.ents = [("ANIMAL", 3, 4)]
|
||||
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
|
||||
|
||||
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||
doc.ents = [("WORD", 0, 2)]
|
||||
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
|
||||
|
||||
|
||||
|
|
|
@ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
assert len(tokens.ents) == 0
|
||||
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
|
||||
assert len(list(tokens.ents)) == 1
|
||||
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
|
||||
assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
|
||||
assert tokens.ents[0].label_ == "PRODUCT"
|
||||
assert tokens.ents[0].start == 2
|
||||
assert tokens.ents[0].end == 4
|
||||
|
@ -427,7 +427,7 @@ def test_has_annotation(en_vocab):
|
|||
doc[0].lemma_ = "a"
|
||||
doc[0].dep_ = "dep"
|
||||
doc[0].head = doc[1]
|
||||
doc.ents = [Span(doc, 0, 1, label="HELLO")]
|
||||
doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
|
||||
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
|
@ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer):
|
|||
doc.is_sentenced
|
||||
|
||||
|
||||
def test_doc_set_ents():
|
||||
def test_doc_set_ents(en_tokenizer):
|
||||
# set ents
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
|
||||
assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
|
||||
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
|
||||
|
||||
# add ents, invalid IOB repaired
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
|
||||
doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
|
||||
assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
|
||||
assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
|
||||
|
||||
# missing ents
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
|
||||
assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
|
||||
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
|
||||
|
||||
# outside ents
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents(
|
||||
[Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
|
||||
outside=[doc[4:5]],
|
||||
default="missing",
|
||||
)
|
||||
assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
|
||||
assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
|
||||
|
||||
# blocked ents
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
|
||||
assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
|
||||
assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
|
||||
assert doc.ents == tuple()
|
||||
|
||||
# invalid IOB repaired after blocked
|
||||
doc.ents = [Span(doc, 3, 5, "ENT")]
|
||||
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
|
||||
doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
|
||||
assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
|
||||
|
||||
# all types
|
||||
doc = en_tokenizer("a b c d e")
|
||||
doc.set_ents(
|
||||
[Span(doc, 0, 1, 10)],
|
||||
blocked=[doc[1:2]],
|
||||
missing=[doc[2:3]],
|
||||
outside=[doc[3:4]],
|
||||
default="unmodified",
|
||||
)
|
||||
assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
|
||||
assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
|
||||
|
||||
doc = en_tokenizer("a b c d e")
|
||||
# single span instead of a list
|
||||
with pytest.raises(ValueError):
|
||||
doc.set_ents([], missing=doc[1:2])
|
||||
# invalid default mode
|
||||
with pytest.raises(ValueError):
|
||||
doc.set_ents([], missing=[doc[1:2]], default="none")
|
||||
# conflicting/overlapping specifications
|
||||
with pytest.raises(ValueError):
|
||||
doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
|
||||
|
||||
|
||||
def test_doc_ents_setter():
|
||||
"""Test that both strings and integers can be used to set entities in
|
||||
tuple format via doc.ents."""
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
|
|
|
@ -168,7 +168,7 @@ def test_accept_blocked_token():
|
|||
ner2 = nlp2.create_pipe("ner", config=config)
|
||||
|
||||
# set "New York" to a blocked entity
|
||||
doc2.ents = [(0, 3, 5)]
|
||||
doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
|
||||
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
|
||||
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
|
||||
|
||||
|
@ -358,5 +358,5 @@ class BlockerComponent1:
|
|||
self.name = name
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.ents = [(0, self.start, self.end)]
|
||||
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
|
||||
return doc
|
||||
|
|
|
@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t
|
|||
|
||||
import copy
|
||||
from collections import Counter
|
||||
from enum import Enum
|
||||
import itertools
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import get_array_module
|
||||
|
@ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
|
|||
return get_token_attr(token, feat_name)
|
||||
|
||||
|
||||
class SetEntsDefault(str, Enum):
    """Modes accepted by the `default` argument of `Doc.set_ents`.

    The class mixes in `str`, so every member compares equal to its plain
    string value (e.g. `SetEntsDefault.outside == "outside"`) and callers
    may pass either the member or the bare string.
    """

    blocked = "blocked"
    missing = "missing"
    outside = "outside"
    unmodified = "unmodified"

    @classmethod
    def values(cls):
        """Return the string values of all modes, in definition order.

        RETURNS (List[str]): The mode values.
        """
        # Read the member values rather than the member names, so the result
        # stays correct even if a name and its value ever diverge.
        return [member.value for member in cls]
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary
|
||||
|
@ -660,50 +673,100 @@ cdef class Doc:
|
|||
# TODO:
|
||||
# 1. Test basic data-driven ORTH gazetteer
|
||||
# 2. Test more nuanced date and currency regex
|
||||
tokens_in_ents = {}
|
||||
cdef attr_t entity_type
|
||||
cdef attr_t kb_id
|
||||
cdef int ent_start, ent_end, token_index
|
||||
cdef attr_t entity_type, kb_id
|
||||
cdef int ent_start, ent_end
|
||||
ent_spans = []
|
||||
for ent_info in ents:
|
||||
entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
|
||||
if isinstance(entity_type_, str):
|
||||
self.vocab.strings.add(entity_type_)
|
||||
entity_type = self.vocab.strings.as_int(entity_type_)
|
||||
for token_index in range(ent_start, ent_end):
|
||||
if token_index in tokens_in_ents:
|
||||
raise ValueError(Errors.E103.format(
|
||||
span1=(tokens_in_ents[token_index][0],
|
||||
tokens_in_ents[token_index][1],
|
||||
self.vocab.strings[tokens_in_ents[token_index][2]]),
|
||||
span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
|
||||
tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
|
||||
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
|
||||
ent_spans.append(span)
|
||||
self.set_ents(ent_spans, default=SetEntsDefault.outside)
|
||||
|
||||
def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
|
||||
"""Set entity annotation.
|
||||
|
||||
entities (List[Span]): Spans with labels to set as entities.
|
||||
blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
|
||||
entity) for spacy's built-in NER component. Other components may
|
||||
ignore this setting.
|
||||
missing (Optional[List[Span]]): Spans with missing/unknown entity
|
||||
information.
|
||||
outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
|
||||
default (str): How to set entity annotation for tokens outside of any
|
||||
provided spans. Options: "blocked", "missing", "outside" and
|
||||
"unmodified" (preserve current state). Defaults to "outside".
|
||||
"""
|
||||
if default not in SetEntsDefault.values():
|
||||
raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
|
||||
|
||||
# Ignore spans with missing labels
|
||||
entities = [ent for ent in entities if ent.label > 0]
|
||||
|
||||
if blocked is None:
|
||||
blocked = tuple()
|
||||
if missing is None:
|
||||
missing = tuple()
|
||||
if outside is None:
|
||||
outside = tuple()
|
||||
|
||||
# Find all tokens covered by spans and check that none are overlapping
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
# default values
|
||||
entity_type = 0
|
||||
kb_id = 0
|
||||
seen_tokens = set()
|
||||
for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
|
||||
if not isinstance(span, Span):
|
||||
raise ValueError(Errors.E1012.format(span=span))
|
||||
for i in range(span.start, span.end):
|
||||
if i in seen_tokens:
|
||||
raise ValueError(Errors.E1010.format(i=i))
|
||||
seen_tokens.add(i)
|
||||
|
||||
# Set ent_iob to Missing (0) by default unless this token was nered before
|
||||
ent_iob = 0
|
||||
if self.c[i].ent_iob != 0:
|
||||
ent_iob = 2
|
||||
|
||||
# overwrite if the token was part of a specified entity
|
||||
if i in tokens_in_ents.keys():
|
||||
ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
|
||||
if entity_type is None or entity_type <= 0:
|
||||
# Blocking this token from being overwritten by downstream NER
|
||||
ent_iob = 3
|
||||
elif ent_start == i:
|
||||
# Marking the start of an entity
|
||||
ent_iob = 3
|
||||
# Set all specified entity information
|
||||
for span in entities:
|
||||
for i in range(span.start, span.end):
|
||||
if i == span.start:
|
||||
self.c[i].ent_iob = 3
|
||||
else:
|
||||
# Marking the inside of an entity
|
||||
ent_iob = 1
|
||||
self.c[i].ent_iob = 1
|
||||
self.c[i].ent_type = span.label
|
||||
self.c[i].ent_kb_id = span.kb_id
|
||||
for span in blocked:
|
||||
for i in range(span.start, span.end):
|
||||
self.c[i].ent_iob = 3
|
||||
self.c[i].ent_type = 0
|
||||
for span in missing:
|
||||
for i in range(span.start, span.end):
|
||||
self.c[i].ent_iob = 0
|
||||
self.c[i].ent_type = 0
|
||||
for span in outside:
|
||||
for i in range(span.start, span.end):
|
||||
self.c[i].ent_iob = 2
|
||||
self.c[i].ent_type = 0
|
||||
|
||||
self.c[i].ent_type = entity_type
|
||||
self.c[i].ent_kb_id = kb_id
|
||||
self.c[i].ent_iob = ent_iob
|
||||
# Set tokens outside of all provided spans
|
||||
if default != SetEntsDefault.unmodified:
|
||||
for i in range(self.length):
|
||||
if i not in seen_tokens:
|
||||
self.c[i].ent_type = 0
|
||||
if default == SetEntsDefault.outside:
|
||||
self.c[i].ent_iob = 2
|
||||
elif default == SetEntsDefault.missing:
|
||||
self.c[i].ent_iob = 0
|
||||
elif default == SetEntsDefault.blocked:
|
||||
self.c[i].ent_iob = 3
|
||||
|
||||
# Fix any resulting inconsistent annotation
|
||||
for i in range(self.length - 1):
|
||||
# I must follow B or I: convert I to B
|
||||
if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
|
||||
self.c[i+1].ent_iob == 1:
|
||||
self.c[i+1].ent_iob = 3
|
||||
# Change of type with BI or II: convert second I to B
|
||||
if self.c[i].ent_type != self.c[i+1].ent_type and \
|
||||
(self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
|
||||
self.c[i+1].ent_iob == 1:
|
||||
self.c[i+1].ent_iob = 3
|
||||
|
||||
@property
|
||||
def noun_chunks(self):
|
||||
|
|
|
@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
|
||||
|
||||
def _add_entities_to_doc(doc, ner_data):
|
||||
print(ner_data)
|
||||
if ner_data is None:
|
||||
return
|
||||
elif ner_data == []:
|
||||
|
@ -303,9 +304,14 @@ def _add_entities_to_doc(doc, ner_data):
|
|||
biluo_tags_to_spans(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], Span):
|
||||
# Ugh, this is super messy. Really hard to set O entities
|
||||
doc.ents = ner_data
|
||||
doc.ents = [span for span in ner_data if span.label_]
|
||||
entities = []
|
||||
missing = []
|
||||
for span in ner_data:
|
||||
if span.label:
|
||||
entities.append(span)
|
||||
else:
|
||||
missing.append(span)
|
||||
doc.set_ents(entities, missing=missing)
|
||||
else:
|
||||
raise ValueError(Errors.E973)
|
||||
|
||||
|
|
|
@ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
|
|||
|
||||
doc (Doc): The document that the BILUO tags refer to.
|
||||
entities (iterable): A sequence of BILUO tags with each tag describing one
|
||||
token. Each tags string will be of the form of either "", "O" or
|
||||
token. Each tag string will be of the form of either "", "O" or
|
||||
"{action}-{label}", where action is one of "B", "I", "L", "U".
|
||||
RETURNS (list): A sequence of Span objects.
|
||||
RETURNS (list): A sequence of Span objects. Each token with a missing IOB
|
||||
tag is returned as a Span with an empty label.
|
||||
"""
|
||||
token_offsets = tags_to_entities(tags)
|
||||
spans = []
|
||||
|
@ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
|
|||
entities = []
|
||||
start = None
|
||||
for i, tag in enumerate(tags):
|
||||
if tag is None:
|
||||
continue
|
||||
if tag.startswith("O"):
|
||||
if tag is None or tag.startswith("-"):
|
||||
# TODO: We shouldn't be getting these malformed inputs. Fix this.
|
||||
if start is not None:
|
||||
start = None
|
||||
else:
|
||||
entities.append(("", i, i))
|
||||
continue
|
||||
elif tag == "-":
|
||||
continue
|
||||
elif tag.startswith("O"):
|
||||
pass
|
||||
elif tag.startswith("I"):
|
||||
if start is None:
|
||||
raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
|
||||
continue
|
||||
if tag.startswith("U"):
|
||||
elif tag.startswith("U"):
|
||||
entities.append((tag[2:], i, i))
|
||||
elif tag.startswith("B"):
|
||||
start = i
|
||||
|
|
|
@ -219,6 +219,30 @@ alignment mode `"strict".
|
|||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.set_ents {#set_ents tag="method" new="3"}
|
||||
|
||||
Set the named entities in the document.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.tokens import Span
|
||||
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
|
||||
> doc.set_ents([Span(doc, 0, 2, "PERSON")])
|
||||
> ents = list(doc.ents)
|
||||
> assert ents[0].label_ == "PERSON"
|
||||
> assert ents[0].text == "Mr. Best"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| entities | Spans with labels to set as entities. ~~List[Span]~~ |
|
||||
| _keyword-only_ | |
|
||||
| blocked | Spans to set as "blocked" (never an entity) for spaCy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ |
|
||||
| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ |
|
||||
| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ |
|
||||
| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
Make a semantic similarity estimate. The default estimate is cosine similarity
|
||||
|
@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
|
|||
> ```python
|
||||
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
|
||||
> ents = list(doc.ents)
|
||||
> assert ents[0].label == 346
|
||||
> assert ents[0].label_ == "PERSON"
|
||||
> assert ents[0].text == "Mr. Best"
|
||||
> ```
|
||||
|
|
Loading…
Reference in New Issue
Block a user