From 877671e09a0a72ca20ccbbcd65d7073f588cd320 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 27 Sep 2022 10:16:51 +0200
Subject: [PATCH] Preserve missing entity annotation in augmenters (#11540)

Preserve both `-` and `O` annotation in augmenters rather than relying
on `Example.to_dict`'s default support for one option outside of labeled
entity spans.

This is intended as a temporary workaround for augmenters for v3.4.x.
The behavior of `Example` and related IOB utils could be improved in the
general case for v3.5.
---
 spacy/tests/training/test_augmenters.py |  7 +++++--
 spacy/training/augment.py               | 14 +++++++++++++-
 spacy/training/iob_utils.py             |  8 ++++++++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py
index e3639c5da..35860a199 100644
--- a/spacy/tests/training/test_augmenters.py
+++ b/spacy/tests/training/test_augmenters.py
@@ -31,7 +31,7 @@ def doc(nlp):
     words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
     tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
     pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
-    ents = ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
+    ents = ["B-PERSON", "I-PERSON", "O", "", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     doc = Doc(nlp.vocab, words=words, tags=tags, pos=pos, ents=ents)
@@ -106,6 +106,7 @@ def test_lowercase_augmenter(nlp, doc):
     assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents
     for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents):
         assert ref_ent.text == orig_ent.text.lower()
+    assert [t.ent_iob for t in doc] == [t.ent_iob for t in eg.reference]
     assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]
 
     # check that augmentation works when lowercasing leads to different
@@ -166,7 +167,7 @@ def test_make_whitespace_variant(nlp):
     lemmas = ["they", "fly", "to", "New", "York", "City", ".", "\n", "then", "they", "drive", "to", "Washington", ",", "D.C."]
     heads = [1, 1, 1, 4, 5, 2, 1, 10, 10, 10, 10, 10, 11, 12, 12]
     deps = ["nsubj", "ROOT", "prep", "compound", "compound", "pobj", "punct", "dep", "advmod", "nsubj", "ROOT", "prep", "pobj", "punct", "appos"]
-    ents = ["O", "O", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"]
+    ents = ["O", "", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"]
     # fmt: on
     doc = Doc(
         nlp.vocab,
@@ -215,6 +216,8 @@ def test_make_whitespace_variant(nlp):
             assert mod_ex2.reference[j].head.i == j - 1
         # entities are well-formed
         assert len(doc.ents) == len(mod_ex.reference.ents)
+        # there is one token with missing entity information
+        assert any(t.ent_iob == 0 for t in mod_ex.reference)
         for ent in mod_ex.reference.ents:
             assert not ent[0].is_space
             assert not ent[-1].is_space
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 55d780ba4..2fe8c24fb 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -6,7 +6,7 @@ from functools import partial
 
 from ..util import registry
 from .example import Example
-from .iob_utils import split_bilu_label
+from .iob_utils import split_bilu_label, _doc_to_biluo_tags_with_partial
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -62,6 +62,9 @@ def combined_augmenter(
     if orth_variants and random.random() < orth_level:
         raw_text = example.text
         orig_dict = example.to_dict()
+        orig_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+            example.reference
+        )
         variant_text, variant_token_annot = make_orth_variants(
             nlp,
             raw_text,
@@ -128,6 +131,9 @@ def lower_casing_augmenter(
 
 def make_lowercase_variant(nlp: "Language", example: Example):
     example_dict = example.to_dict()
+    example_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+        example.reference
+    )
     doc = nlp.make_doc(example.text.lower())
     example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference]
     return example.from_dict(doc, example_dict)
@@ -146,6 +152,9 @@ def orth_variants_augmenter(
     else:
         raw_text = example.text
         orig_dict = example.to_dict()
+        orig_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+            example.reference
+        )
         variant_text, variant_token_annot = make_orth_variants(
             nlp,
             raw_text,
@@ -248,6 +257,9 @@ def make_whitespace_variant(
     RETURNS (Example): Example with one additional space token.
     """
     example_dict = example.to_dict()
+    example_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+        example.reference
+    )
     doc_dict = example_dict.get("doc_annotation", {})
     token_dict = example_dict.get("token_annotation", {})
     # returned unmodified if:
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 61f83a1c3..0d4d246b0 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -60,6 +60,14 @@ def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
     )
 
 
+def _doc_to_biluo_tags_with_partial(doc: Doc) -> List[str]:
+    ents = doc_to_biluo_tags(doc, missing="-")  # non-entity tokens start as "-"
+    for i, token in enumerate(doc):
+        if token.ent_iob == 2:  # explicit "outside" annotation (0 is missing)
+            ents[i] = "O"  # keep "O" distinct from the missing marker "-"
+    return ents
+
+
 def offsets_to_biluo_tags(
     doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
 ) -> List[str]: