Add warning for misaligned character offset spans (#5007)

* Add warning for misaligned character offset spans * Resolve conflict * Filter warnings in example scripts Filter warnings in example scripts to show warnings once, in particular warnings about misaligned entities. Co-authored-by: Ines Montani <ines@ines.io>
2025-11-18 16:56:07 +03:00 · 2020-05-19 16:01:18 +02:00 · 2020-05-19 16:01:18 +02:00 · 70da1fd2d6
commit 70da1fd2d6
parent 0061992d95
6 changed files with 32 additions and 7 deletions
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@ -1,6 +1,7 @@
 """Prevent catastrophic forgetting with rehearsal updates."""
 import plac
 import random
+import warnings
 import srsly
 import spacy
 from spacy.gold import GoldParse
@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    sizes = compounding(1.0, 4.0, 1.001)
-    with nlp.disable_pipes(*other_pipes):
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@ -8,12 +8,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities

 Compatible with: spaCy v2.0.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -24,12 +24,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities

 Compatible with: spaCy v2.1.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -110,7 +110,11 @@ class Warnings(object):
            "in problems with the vocab further on in the pipeline.")
    W029 = ("Unable to align tokens with entities from character offsets. "
            "Discarding entity annotation for the text: {text}.")
-
+    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
+            "entities \"{entities}\". Use "
+            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            " to check the alignment. Misaligned entities ('-') will be "
+            "ignored during training.")

@add_codes
 class Errors(object):
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
                break
        else:
            biluo[token.i] = missing
+    if "-" in biluo:
+        ent_str = str(entities)
+        warnings.warn(Warnings.W030.format(
+            text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
+            entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
+        ))
    return biluo


--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
    spaces = [True, True, True, True, True, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    with pytest.warns(UserWarning):
+        tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]