From 70da1fd2d6e96256ad863f1e625091c46dac4835 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Tue, 19 May 2020 16:01:18 +0200 Subject: [PATCH] Add warning for misaligned character offset spans (#5007) * Add warning for misaligned character offset spans * Resolve conflict * Filter warnings in example scripts Filter warnings in example scripts to show warnings once, in particular warnings about misaligned entities. Co-authored-by: Ines Montani --- examples/training/rehearsal.py | 6 +++++- examples/training/train_ner.py | 9 +++++++-- examples/training/train_new_entity_type.py | 9 +++++++-- spacy/errors.py | 6 +++++- spacy/gold.pyx | 6 ++++++ spacy/tests/test_gold.py | 3 ++- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 9ece91427..24b1cea00 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -1,6 +1,7 @@ """Prevent catastrophic forgetting with rehearsal updates.""" import plac import random +import warnings import srsly import spacy from spacy.gold import GoldParse @@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc): pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] sizes = compounding(1.0, 4.0, 1.001) - with nlp.disable_pipes(*other_pipes): + with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + # show warnings for misaligned entity spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + for itn in range(n_iter): random.shuffle(TRAIN_DATA) random.shuffle(raw_docs) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 01bb6a67b..ff6029567 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -8,12 +8,13 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.0.0+ -Last tested with: v2.1.0 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function import plac import random +import warnings from pathlib import Path import spacy from spacy.util import minibatch, compounding @@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100): # get names of other pipes to disable them during training pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + # only train NER + with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + # show warnings for misaligned entity spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + # reset and initialize the weights randomly – but only if we're # training a new model if model is None: diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 72d33ad50..e8ff6802a 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,12 +24,13 @@ For more details, see the documentation: * NER: https://spacy.io/usage/linguistic-features#named-entities Compatible with: spaCy v2.1.0+ -Last tested with: v2.1.0 +Last tested with: v2.2.4 """ from __future__ import unicode_literals, print_function import plac import random +import warnings from pathlib import Path import spacy from spacy.util import minibatch, compounding @@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): # get names of other pipes to disable them during training pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] - with nlp.disable_pipes(*other_pipes): # only train NER + # only train NER + with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings(): + # show warnings for misaligned entity spans once + warnings.filterwarnings("once", category=UserWarning, module='spacy') + sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): diff --git a/spacy/errors.py b/spacy/errors.py index d99c96922..1b268d5ab 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -110,7 +110,11 @@ class Warnings(object): "in problems with the vocab further on in the pipeline.") W029 = ("Unable to align tokens with entities from character offsets. " "Discarding entity annotation for the text: {text}.") - + W030 = ("Some entities could not be aligned in the text \"{text}\" with " + "entities \"{entities}\". Use " + "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + " to check the alignment. Misaligned entities ('-') will be " + "ignored during training.") @add_codes class Errors(object): diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 4b8a4f52d..cf67a2ac7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): break else: biluo[token.i] = missing + if "-" in biluo: + ent_str = str(entities) + warnings.warn(Warnings.W030.format( + text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text, + entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str + )) return biluo diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index fc9e624eb..37b877561 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab): spaces = [True, True, True, True, True, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + with pytest.warns(UserWarning): + tags = biluo_tags_from_offsets(doc, entities) assert tags == ["O", "O", "O", "-", "-", "-"]