From 70da1fd2d6e96256ad863f1e625091c46dac4835 Mon Sep 17 00:00:00 2001
From: adrianeboyd <adrianeboyd@gmail.com>
Date: Tue, 19 May 2020 16:01:18 +0200
Subject: [PATCH] Add warning for misaligned character offset spans (#5007)

* Add warning for misaligned character offset spans

* Resolve conflict

* Filter warnings in example scripts

Filter warnings in example scripts to show warnings once, in particular
warnings about misaligned entities.

Co-authored-by: Ines Montani <ines@ines.io>
---
 examples/training/rehearsal.py             | 6 +++++-
 examples/training/train_ner.py             | 9 +++++++--
 examples/training/train_new_entity_type.py | 9 +++++++--
 spacy/errors.py                            | 6 +++++-
 spacy/gold.pyx                             | 6 ++++++
 spacy/tests/test_gold.py                   | 3 ++-
 6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py
index 9ece91427..24b1cea00 100644
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@@ -1,6 +1,7 @@
 """Prevent catastrophic forgetting with rehearsal updates."""
 import plac
 import random
+import warnings
 import srsly
 import spacy
 from spacy.gold import GoldParse
@@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
     sizes = compounding(1.0, 4.0, 1.001)
-    with nlp.disable_pipes(*other_pipes):
+    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             random.shuffle(raw_docs)
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 01bb6a67b..ff6029567 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -8,12 +8,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 
 Compatible with: spaCy v2.0.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
     # get names of other pipes to disable them during training
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
         # reset and initialize the weights randomly – but only if we're
         # training a new model
         if model is None:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index 72d33ad50..e8ff6802a 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -24,12 +24,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 
 Compatible with: spaCy v2.1.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function
 
 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
     # get names of other pipes to disable them during training
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
         sizes = compounding(1.0, 4.0, 1.001)
         # batch up the examples using spaCy's minibatch
         for itn in range(n_iter):
diff --git a/spacy/errors.py b/spacy/errors.py
index d99c96922..1b268d5ab 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -110,7 +110,11 @@ class Warnings(object):
             "in problems with the vocab further on in the pipeline.")
     W029 = ("Unable to align tokens with entities from character offsets. "
             "Discarding entity annotation for the text: {text}.")
-
+    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
+            "entities \"{entities}\". Use "
+            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            " to check the alignment. Misaligned entities ('-') will be "
+            "ignored during training.")
 
 @add_codes
 class Errors(object):
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 4b8a4f52d..cf67a2ac7 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
                 break
         else:
             biluo[token.i] = missing
+    if "-" in biluo:
+        ent_str = str(entities)
+        warnings.warn(Warnings.W030.format(
+            text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
+            entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
+        ))
     return biluo
 
 
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index fc9e624eb..37b877561 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
     spaces = [True, True, True, True, True, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    with pytest.warns(UserWarning):
+        tags = biluo_tags_from_offsets(doc, entities)
     assert tags == ["O", "O", "O", "-", "-", "-"]