Merge pull request #6089 from adrianeboyd/feature/doc-ents-v3-2

2025-10-31 07:57:35 +03:00 · 2020-09-24 14:44:42 +02:00 · 2020-09-24 14:44:42 +02:00 · 58dde293ce
commit 58dde293ce
parent 74e1f192b4 5c13e0cf1b
8 changed files with 221 additions and 59 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -696,6 +696,12 @@ class Errors:
    E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
             "through token.morph_ instead or add the string to the "
             "StringStore with `nlp.vocab.strings.add(string)`.")
+    E1010 = ("Unable to set entity information for token {i} which is included "
+             "in more than one span in entities, blocked, missing or outside.")
+    E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
+             "options: {modes}")
+    E1012 = ("Entity spans and blocked/missing/outside spans should be "
+             "provided to doc.set_ents as lists of `Span` objects.")


@add_codes
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
    ner.begin_training(lambda: [_ner_example(ner)])
    ner(doc)

-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]


--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer):
    assert len(tokens.ents) == 0
    tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
    assert len(list(tokens.ents)) == 1
-    assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
+    assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
    assert tokens.ents[0].label_ == "PRODUCT"
    assert tokens.ents[0].start == 2
    assert tokens.ents[0].end == 4
@ -427,7 +427,7 @@ def test_has_annotation(en_vocab):
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
-    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")

    for attr in attrs:
        assert doc.has_annotation(attr)
@ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer):
        doc.is_sentenced


-def test_doc_set_ents():
+def test_doc_set_ents(en_tokenizer):
+    """Test that Doc.set_ents sets entity, blocked, missing and outside spans
+    with the different default modes, repairs invalid IOB sequences and
+    raises a ValueError for invalid input."""
+    # ent_iob codes used in the assertions below:
+    # 0 = missing, 1 = I (inside), 2 = O (outside), 3 = B (begin or blocked)
+    # set ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # add ents, invalid IOB repaired
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
+    assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
+    assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
+
+    # missing ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # outside ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
+        outside=[doc[4:5]],
+        default="missing",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # blocked ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
+    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
+    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
+    assert doc.ents == tuple()
+
+    # invalid IOB repaired after blocked
+    doc.ents = [Span(doc, 3, 5, "ENT")]
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
+    doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
+
+    # all types
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10)],
+        blocked=[doc[1:2]],
+        missing=[doc[2:3]],
+        outside=[doc[3:4]],
+        default="unmodified",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
+
+    doc = en_tokenizer("a b c d e")
+    # single span instead of a list
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=doc[1:2])
+    # invalid default mode
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], default="none")
+    # conflicting/overlapping specifications
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
+
+
+def test_doc_ents_setter():
    """Test that both strings and integers can be used to set entities in
    tuple format via doc.ents."""
    words = ["a", "b", "c", "d", "e"]
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -168,7 +168,7 @@ def test_accept_blocked_token():
    ner2 = nlp2.create_pipe("ner", config=config)

    # set "New York" to a blocked entity
-    doc2.ents = [(0, 3, 5)]
+    doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]

@ -358,5 +358,5 @@ class BlockerComponent1:
        self.name = name

    def __call__(self, doc):
-        doc.ents = [(0, self.start, self.end)]
+        # Mark the tokens in [start, end) as blocked and keep the entity
+        # annotation of all other tokens as it is (default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
        return doc
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t

 import copy
 from collections import Counter
+from enum import Enum
+import itertools
 import numpy
 import srsly
 from thinc.api import get_array_module
@ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
        return get_token_attr(token, feat_name)


+class SetEntsDefault(str, Enum):
+    """Modes for the `default` argument of `Doc.set_ents`, which controls how
+    tokens outside of all provided spans are annotated. The members are also
+    strings, so they compare equal to their plain string values."""
+    blocked = "blocked"
+    missing = "missing"
+    outside = "outside"
+    unmodified = "unmodified"
+
+    @classmethod
+    def values(cls):
+        """Get the string values of all available modes.
+
+        RETURNS (List[str]): The mode values in definition order.
+        """
+        # Use the member values rather than the member names, so that the
+        # result stays correct even if a name and its value ever differ
+        return [member.value for member in cls]
+
+
 cdef class Doc:
    """A sequence of Token objects. Access sentences and named entities, export
    annotations to numpy arrays, losslessly serialize to compressed binary
@ -660,50 +673,100 @@ cdef class Doc:
            # TODO:
            # 1. Test basic data-driven ORTH gazetteer
            # 2. Test more nuanced date and currency regex
-            tokens_in_ents = {}
-            cdef attr_t entity_type
-            cdef attr_t kb_id
-            cdef int ent_start, ent_end, token_index
+            cdef attr_t entity_type, kb_id
+            cdef int ent_start, ent_end
+            ent_spans = []
            for ent_info in ents:
                entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
                if isinstance(entity_type_, str):
                    self.vocab.strings.add(entity_type_)
-                entity_type = self.vocab.strings.as_int(entity_type_)
-                for token_index in range(ent_start, ent_end):
-                    if token_index in tokens_in_ents:
-                        raise ValueError(Errors.E103.format(
-                            span1=(tokens_in_ents[token_index][0],
-                                   tokens_in_ents[token_index][1],
-                                   self.vocab.strings[tokens_in_ents[token_index][2]]),
-                            span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
-                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
+                span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
+                ent_spans.append(span)
+            self.set_ents(ent_spans, default=SetEntsDefault.outside)
+
+    def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
+        """Set entity annotation.
+
+        entities (List[Span]): Spans with labels to set as entities. Spans
+            without a label are ignored.
+        blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
+            entity) for spaCy's built-in NER component. Other components may
+            ignore this setting.
+        missing (Optional[List[Span]]): Spans with missing/unknown entity
+            information.
+        outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
+        default (str): How to set entity annotation for tokens outside of any
+            provided spans. Options: "blocked", "missing", "outside" and
+            "unmodified" (preserve current state). Defaults to "outside".
+
+        Raises a ValueError if `default` is not a supported mode (E1011), if
+        any provided item is not a `Span` (E1012) or if a token is covered by
+        more than one of the provided spans (E1010).
+        """
+        if default not in SetEntsDefault.values():
+            raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault.values())))
+
+        # Materialize the inputs, since each of them is iterated more than
+        # once below and an iterator would be exhausted after the first pass
+        entities = list(entities)
+        blocked = [] if blocked is None else list(blocked)
+        missing = [] if missing is None else list(missing)
+        outside = [] if outside is None else list(outside)
+
+        # Check the types before reading any span attributes, so that invalid
+        # input raises E1012 rather than an AttributeError on `.label`
+        for span in itertools.chain(entities, blocked, missing, outside):
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+
+        # Ignore spans with missing labels
+        entities = [ent for ent in entities if ent.label > 0]
+
+        # Find all tokens covered by spans and check that none are overlapping
        cdef int i
-            for i in range(self.length):
-                # default values
-                entity_type = 0
-                kb_id = 0
+        seen_tokens = set()
+        for span in itertools.chain(entities, blocked, missing, outside):
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)

-                # Set ent_iob to Missing (0) by default unless this token was nered before
-                ent_iob = 0
-                if self.c[i].ent_iob != 0:
-                    ent_iob = 2
-
-                # overwrite if the token was part of a specified entity
-                if i in tokens_in_ents.keys():
-                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
-                    if entity_type is None or entity_type <= 0:
-                        # Blocking this token from being overwritten by downstream NER
-                        ent_iob = 3
-                    elif ent_start == i:
-                        # Marking the start of an entity
-                        ent_iob = 3
+        # Set all specified entity information
+        # ent_iob codes: 0 = missing, 1 = I, 2 = O, 3 = B (or blocked, if the
+        # token has no entity type)
+        for span in entities:
+            for i in range(span.start, span.end):
+                if i == span.start:
+                    self.c[i].ent_iob = 3
                else:
-                        # Marking the inside of an entity
-                        ent_iob = 1
+                    self.c[i].ent_iob = 1
+                self.c[i].ent_type = span.label
+                self.c[i].ent_kb_id = span.kb_id
+        # Tokens that lose their entity type also lose their KB ID, so that
+        # no stale ID from an earlier entity is left behind
+        for span in blocked:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 3
+                self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+        for span in missing:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 0
+                self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0
+        for span in outside:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 2
+                self.c[i].ent_type = 0
+                self.c[i].ent_kb_id = 0

-                self.c[i].ent_type = entity_type
-                self.c[i].ent_kb_id = kb_id
-                self.c[i].ent_iob = ent_iob
+        # Set tokens outside of all provided spans
+        if default != SetEntsDefault.unmodified:
+            for i in range(self.length):
+                if i not in seen_tokens:
+                    self.c[i].ent_type = 0
+                    self.c[i].ent_kb_id = 0
+                    if default == SetEntsDefault.outside:
+                        self.c[i].ent_iob = 2
+                    elif default == SetEntsDefault.missing:
+                        self.c[i].ent_iob = 0
+                    elif default == SetEntsDefault.blocked:
+                        self.c[i].ent_iob = 3
+
+        # Fix any resulting inconsistent annotation
+        for i in range(self.length - 1):
+            # I must follow B or I: convert I to B
+            if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3
+            # Change of type with BI or II: convert second I to B
+            if self.c[i].ent_type != self.c[i+1].ent_type and \
+                    (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3

    @property
    def noun_chunks(self):
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot):


 def _add_entities_to_doc(doc, ner_data):
    if ner_data is None:
        return
    elif ner_data == []:
@ -303,9 +304,14 @@ def _add_entities_to_doc(doc, ner_data):
            biluo_tags_to_spans(doc, ner_data)
        )
    elif isinstance(ner_data[0], Span):
-        # Ugh, this is super messy. Really hard to set O entities
-        doc.ents = ner_data
-        doc.ents = [span for span in ner_data if span.label_]
+        # Spans without a label carry no entity information: pass them on as
+        # missing annotation rather than as entities
+        entities = []
+        missing = []
+        for span in ner_data:
+            if span.label:
+                entities.append(span)
+            else:
+                missing.append(span)
+        doc.set_ents(entities, missing=missing)
    else:
        raise ValueError(Errors.E973)

--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:

    doc (Doc): The document that the BILUO tags refer to.
    entities (iterable): A sequence of BILUO tags with each tag describing one
-        token. Each tags string will be of the form of either "", "O" or
+        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
-    RETURNS (list): A sequence of Span objects.
+    RETURNS (list): A sequence of Span objects. Each token with a missing IOB
+        tag is returned as a Span with an empty label.
    """
    token_offsets = tags_to_entities(tags)
    spans = []
@ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
    entities = []
    start = None
    for i, tag in enumerate(tags):
-        if tag is None:
-            continue
-        if tag.startswith("O"):
+        if tag is None or tag.startswith("-"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            else:
                entities.append(("", i, i))
-            continue
-        elif tag == "-":
-            continue
+        elif tag.startswith("O"):
+            pass
        elif tag.startswith("I"):
            if start is None:
                raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
-            continue
-        if tag.startswith("U"):
+        elif tag.startswith("U"):
            entities.append((tag[2:], i, i))
        elif tag.startswith("B"):
            start = i
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -219,6 +219,30 @@ alignment mode `"strict".
 | `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |

+## Doc.set_ents {#set_ents tag="method" new="3"}
+
+Set the named entities in the document.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import Span
+> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc.set_ents([Span(doc, 0, 2, "PERSON")])
+> ents = list(doc.ents)
+> assert ents[0].label_ == "PERSON"
+> assert ents[0].text == "Mr. Best"
+> ```
+
+| Name           | Description                                                                                                                                                                               |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| entities       | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                      |
+| _keyword-only_ |                                                                                                                                                                                           |
+| blocked        | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                        |
+| missing        | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                   |
+| outside        | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                            |
+| default        | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ |
+
 ## Doc.similarity {#similarity tag="method" model="vectors"}

 Make a semantic similarity estimate. The default estimate is cosine similarity
@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
 > ```python
 > doc = nlp("Mr. Best flew to New York on Saturday morning.")
 > ents = list(doc.ents)
-> assert ents[0].label == 346
 > assert ents[0].label_ == "PERSON"
 > assert ents[0].text == "Mr. Best"
 > ```