diff --git a/spacy/errors.py b/spacy/errors.py
index 708b7fda8..6fdf8cb57 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -696,6 +696,12 @@ class Errors:
     E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
              "through token.morph_ instead or add the string to the "
              "StringStore with `nlp.vocab.strings.add(string)`.")
+    E1010 = ("Unable to set entity information for token {i} which is included "
+             "in more than one span in entities, blocked, missing or outside.")
+    E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
+             "options: {modes}")
+    E1012 = ("Entity spans and blocked/missing/outside spans should be "
+             "provided to doc.set_ents as lists of `Span` objects.")


 @add_codes
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 40aff8e31..615ab9e5b 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)

-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+    doc.ents = [("ANIMAL", 3, 4)]
     assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+    doc.ents = [("WORD", 0, 2)]
     assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]


diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 163de5ab0..e5e72fe2a 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer):
     assert len(tokens.ents) == 0
     tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
     assert len(list(tokens.ents)) == 1
-    assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
+    assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
     assert tokens.ents[0].label_ == "PRODUCT"
     assert tokens.ents[0].start == 2
     assert tokens.ents[0].end == 4
@@ -427,7 +427,7 @@ def test_has_annotation(en_vocab):
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
-    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")

     for attr in attrs:
         assert doc.has_annotation(attr)
@@ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_sentenced


-def test_doc_set_ents():
+def test_doc_set_ents(en_tokenizer):
+    # set ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # add ents, invalid IOB repaired
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
+    assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
+    assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
+
+    # missing ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # outside ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
+        outside=[doc[4:5]],
+        default="missing",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # blocked ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
+    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
+    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
+    assert doc.ents == tuple()
+
+    # invalid IOB repaired after blocked
+    doc.ents = [Span(doc, 3, 5, "ENT")]
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
+    doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
+
+    # all types
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10)],
+        blocked=[doc[1:2]],
+        missing=[doc[2:3]],
+        outside=[doc[3:4]],
+        default="unmodified",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
+
+    doc = en_tokenizer("a b c d e")
+    # single span instead of a list
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=doc[1:2])
+    # invalid default mode
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], default="none")
+    # conflicting/overlapping specifications
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
+
+
+def test_doc_ents_setter():
     """Test that both strings and integers can be used to set entities in
     tuple format via doc.ents."""
     words = ["a", "b", "c", "d", "e"]
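The integer `ent_iob` values asserted in these tests follow spaCy's token-level coding: `0` means no annotation set (missing), `1` is `I` (inside), `2` is `O` (outside) and `3` is `B` (begin). A minimal sketch of the default `set_ents` behaviour, assuming spaCy v3 and a blank English pipeline:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("a b c d e")

# Mark "a" as a one-token entity; with the default mode ("outside"),
# every token not covered by a provided span is set to O.
doc.set_ents([Span(doc, 0, 1, label="THING")])

assert [t.ent_iob for t in doc] == [3, 2, 2, 2, 2]  # B, O, O, O, O
assert [t.ent_iob_ for t in doc] == ["B", "O", "O", "O", "O"]
```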
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 548cd2697..cd5581769 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -168,7 +168,7 @@ def test_accept_blocked_token():
     ner2 = nlp2.create_pipe("ner", config=config)

     # set "New York" to a blocked entity
-    doc2.ents = [(0, 3, 5)]
+    doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]

@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name

     def __call__(self, doc):
-        doc.ents = [(0, self.start, self.end)]
+        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
         return doc
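To illustrate the blocked-span pattern used in `test_accept_blocked_token` and `BlockerComponent1`, here is a hedged sketch of a custom component that shields a span from the NER component; the component name `"blocker"` and the hard-coded offsets are illustrative only:

```python
import spacy
from spacy.language import Language

@Language.component("blocker")  # hypothetical component name
def blocker(doc):
    # Mark tokens 3-5 as blocked so spaCy's built-in NER never labels them;
    # default="unmodified" leaves every other token's annotation untouched.
    doc.set_ents([], blocked=[doc[3:5]], default="unmodified")
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("blocker")
doc = nlp("I live in New York")

# Blocked tokens surface as B with no entity type; the rest stay missing.
assert [t.ent_iob_ for t in doc] == ["", "", "", "B", "B"]
assert [t.ent_type_ for t in doc] == ["", "", "", "", ""]
```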
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b82bab294..b4027f87e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t

 import copy
 from collections import Counter
+from enum import Enum
+import itertools
 import numpy
 import srsly
 from thinc.api import get_array_module
@@ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
         return get_token_attr(token, feat_name)


+class SetEntsDefault(str, Enum):
+    blocked = "blocked"
+    missing = "missing"
+    outside = "outside"
+    unmodified = "unmodified"
+
+    @classmethod
+    def values(cls):
+        return list(cls.__members__.keys())
+
+
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -660,50 +673,100 @@ cdef class Doc:
         # TODO:
         # 1. Test basic data-driven ORTH gazetteer
         # 2. Test more nuanced date and currency regex
-        tokens_in_ents = {}
-        cdef attr_t entity_type
-        cdef attr_t kb_id
-        cdef int ent_start, ent_end, token_index
+        cdef attr_t entity_type, kb_id
+        cdef int ent_start, ent_end
+        ent_spans = []
         for ent_info in ents:
             entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
             if isinstance(entity_type_, str):
                 self.vocab.strings.add(entity_type_)
-            entity_type = self.vocab.strings.as_int(entity_type_)
-            for token_index in range(ent_start, ent_end):
-                if token_index in tokens_in_ents:
-                    raise ValueError(Errors.E103.format(
-                        span1=(tokens_in_ents[token_index][0],
-                               tokens_in_ents[token_index][1],
-                               self.vocab.strings[tokens_in_ents[token_index][2]]),
-                        span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
-                tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
-        cdef int i
+            span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
+            ent_spans.append(span)
+        self.set_ents(ent_spans, default=SetEntsDefault.outside)
+
+    def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
+        """Set entity annotation.
+
+        entities (List[Span]): Spans with labels to set as entities.
+        blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
+            entity) for spacy's built-in NER component. Other components may
+            ignore this setting.
+        missing (Optional[List[Span]]): Spans with missing/unknown entity
+            information.
+        outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
+        default (str): How to set entity annotation for tokens outside of any
+            provided spans. Options: "blocked", "missing", "outside" and
+            "unmodified" (preserve current state). Defaults to "outside".
+        """
+        if default not in SetEntsDefault.values():
+            raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
+
+        # Ignore spans with missing labels
+        entities = [ent for ent in entities if ent.label > 0]
+
+        if blocked is None:
+            blocked = tuple()
+        if missing is None:
+            missing = tuple()
+        if outside is None:
+            outside = tuple()
+
+        # Find all tokens covered by spans and check that none are overlapping
+        cdef int i
+        seen_tokens = set()
+        for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)
+
+        # Set all specified entity information
+        for span in entities:
+            for i in range(span.start, span.end):
+                if i == span.start:
+                    self.c[i].ent_iob = 3
+                else:
+                    self.c[i].ent_iob = 1
+                self.c[i].ent_type = span.label
+                self.c[i].ent_kb_id = span.kb_id
+        for span in blocked:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 3
+                self.c[i].ent_type = 0
+        for span in missing:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 0
+                self.c[i].ent_type = 0
+        for span in outside:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 2
+                self.c[i].ent_type = 0
+
+        # Set tokens outside of all provided spans
+        if default != SetEntsDefault.unmodified:
+            for i in range(self.length):
-        for i in range(self.length):
-            # default values
-            entity_type = 0
-            kb_id = 0
+                if i not in seen_tokens:
+                    self.c[i].ent_type = 0
+                    if default == SetEntsDefault.outside:
+                        self.c[i].ent_iob = 2
+                    elif default == SetEntsDefault.missing:
+                        self.c[i].ent_iob = 0
+                    elif default == SetEntsDefault.blocked:
+                        self.c[i].ent_iob = 3
-
-            # Set ent_iob to Missing (0) by default unless this token was nered before
-            ent_iob = 0
-            if self.c[i].ent_iob != 0:
-                ent_iob = 2
-
-            # overwrite if the token was part of a specified entity
-            if i in tokens_in_ents.keys():
-                ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
-                if entity_type is None or entity_type <= 0:
-                    # Blocking this token from being overwritten by downstream NER
-                    ent_iob = 3
-                elif ent_start == i:
-                    # Marking the start of an entity
-                    ent_iob = 3
-                else:
-                    # Marking the inside of an entity
-                    ent_iob = 1
-
-            self.c[i].ent_type = entity_type
-            self.c[i].ent_kb_id = kb_id
-            self.c[i].ent_iob = ent_iob
+
+        # Fix any resulting inconsistent annotation
+        for i in range(self.length - 1):
+            # I must follow B or I: convert I to B
+            if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3
+            # Change of type with BI or II: convert second I to B
+            if self.c[i].ent_type != self.c[i+1].ent_type and \
+                    (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3

     @property
     def noun_chunks(self):
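The repair loop at the end of `set_ents` keeps the document from ever holding an invalid IOB sequence, such as an `I` token with no preceding `B` or `I` of the same type. A small illustration of the first repair rule, assuming spaCy v3 and a blank pipeline:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("a b c d e")
doc.set_ents([Span(doc, 0, 3, label="X")])  # B, I, I, O, O

# Re-annotate only token 1 as outside; default="unmodified" keeps the rest.
doc.set_ents([], outside=[doc[1:2]], default="unmodified")

# Token 2 was I but now follows an O, so the repair pass promotes it to B.
assert [t.ent_iob_ for t in doc] == ["B", "O", "B", "O", "O"]
```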
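The new branch splits gold spans by label, so that unlabelled spans are recorded as missing annotation instead of being dropped. A hedged sketch of the equivalent direct call, with text and offsets invented for illustration:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Mr. Best flew to New York")

labelled = Span(doc, 4, 6, label="GPE")  # "New York"
unknown = Span(doc, 0, 2)                # empty label: entity status unknown

# Equivalent to the split performed in _add_entities_to_doc:
doc.set_ents([labelled], missing=[unknown])

assert [t.ent_iob_ for t in doc] == ["", "", "O", "O", "B", "I"]
```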
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 91fc40205..0e8e7eed0 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:

     doc (Doc): The document that the BILUO tags refer to.
     entities (iterable): A sequence of BILUO tags with each tag describing one
-        token. Each tags string will be of the form of either "", "O" or
+        token. Each tag string will be of the form of either "", "O" or
         "{action}-{label}", where action is one of "B", "I", "L", "U".
-    RETURNS (list): A sequence of Span objects.
+    RETURNS (list): A sequence of Span objects. Each token with a missing IOB
+        tag is returned as a Span with an empty label.
     """
     token_offsets = tags_to_entities(tags)
     spans = []
@@ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
     entities = []
     start = None
     for i, tag in enumerate(tags):
-        if tag is None:
-            continue
-        if tag.startswith("O"):
+        if tag is None or tag.startswith("-"):
             # TODO: We shouldn't be getting these malformed inputs. Fix this.
             if start is not None:
                 start = None
             else:
                 entities.append(("", i, i))
-            continue
-        elif tag == "-":
-            continue
+        elif tag.startswith("O"):
+            pass
         elif tag.startswith("I"):
             if start is None:
                 raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
-            continue
-        if tag.startswith("U"):
+        elif tag.startswith("U"):
             entities.append((tag[2:], i, i))
         elif tag.startswith("B"):
             start = i
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7175f6e7f..e10d9d077 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -219,6 +219,30 @@ alignment mode `"strict".
 | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | **RETURNS**      | The newly constructed object or `None`. ~~Optional[Span]~~ |

+## Doc.set_ents {#set_ents tag="method" new="3"}
+
+Set the named entities in the document.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import Span
+> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc.set_ents([Span(doc, 0, 2, "PERSON")])
+> ents = list(doc.ents)
+> assert ents[0].label_ == "PERSON"
+> assert ents[0].text == "Mr. Best"
+> ```
+
+| Name           | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `entities`     | Spans with labels to set as entities. ~~List[Span]~~ |
+| _keyword-only_ | |
+| `blocked`      | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ |
+| `missing`      | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ |
+| `outside`      | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ |
+| `default`      | How to set entity annotation for tokens outside of any provided spans. Options: `"blocked"`, `"missing"`, `"outside"` and `"unmodified"` (preserve current state). Defaults to `"outside"`. ~~str~~ |
+
 ## Doc.similarity {#similarity tag="method" model="vectors"}

 Make a semantic similarity estimate. The default estimate is cosine similarity
@@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
 > ```python
 > doc = nlp("Mr. Best flew to New York on Saturday morning.")
 > ents = list(doc.ents)
-> assert ents[0].label == 346
 > assert ents[0].label_ == "PERSON"
 > assert ents[0].text == "Mr. Best"
 > ```
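As a closing illustration of the full keyword surface documented above, a hedged sketch combining all four span kinds in one call; the labels and offsets are invented:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Mr. Best flew to New York on Saturday morning.")

doc.set_ents(
    [Span(doc, 4, 6, label="GPE")],  # "New York" as an entity
    blocked=[doc[6:7]],              # "on" may never become an entity
    missing=[doc[0:2]],              # "Mr. Best" left unannotated
    default="outside",               # everything else is O
)

# Only real entities are returned; blocked and missing tokens are not.
assert [ent.text for ent in doc.ents] == ["New York"]
```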