Add warning for misaligned character offset spans (#5007)

* Add warning for misaligned character offset spans

* Resolve conflict

* Filter warnings in example scripts

Filter warnings in the example scripts so that each warning is shown only
once, in particular the new warnings about misaligned entities; a short
sketch of the filtering pattern follows the change summary below.

Co-authored-by: Ines Montani <ines@ines.io>
Authored by adrianeboyd on 2020-05-19 16:01:18 +02:00, committed by GitHub
parent 0061992d95
commit 70da1fd2d6
6 changed files with 32 additions and 7 deletions
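
The example scripts now wrap their training loops in warnings.catch_warnings() and register a "once" filter, so the alignment warning is reported once rather than on every update. Below is a minimal sketch of that pattern, assuming spaCy v2.x; the training data, labels, and iteration count are invented for illustration and the end offset is deliberately misaligned so the new warning fires.

import warnings

import spacy

# made-up example: the end offset (26) falls inside the token "Valley",
# so the entity span cannot be aligned to token boundaries
TRAIN_DATA = [
    ("I flew to San Francisco Valley.", {"entities": [(10, 26, "LOC")]}),
]

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("LOC")

optimizer = nlp.begin_training()
with warnings.catch_warnings():
    # show warnings for misaligned entity spans once instead of on every update
    warnings.filterwarnings("once", category=UserWarning, module="spacy")
    for itn in range(10):
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer)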

View File

@@ -1,6 +1,7 @@
 """Prevent catastrophic forgetting with rehearsal updates."""
 import plac
 import random
+import warnings
 import srsly
 import spacy
 from spacy.gold import GoldParse
@@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
     sizes = compounding(1.0, 4.0, 1.001)
-    with nlp.disable_pipes(*other_pipes):
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             random.shuffle(raw_docs)

View File

@@ -8,12 +8,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 Compatible with: spaCy v2.0.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
     # get names of other pipes to disable them during training
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
         # reset and initialize the weights randomly but only if we're
         # training a new model
         if model is None:

View File

@@ -24,12 +24,13 @@ For more details, see the documentation:
 * NER: https://spacy.io/usage/linguistic-features#named-entities
 Compatible with: spaCy v2.1.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.4
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
+import warnings
 from pathlib import Path
 import spacy
 from spacy.util import minibatch, compounding
@@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
     # get names of other pipes to disable them during training
     pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train NER
+    # only train NER
+    with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
+        # show warnings for misaligned entity spans once
+        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+
         sizes = compounding(1.0, 4.0, 1.001)
         # batch up the examples using spaCy's minibatch
         for itn in range(n_iter):

View File

@@ -110,7 +110,11 @@ class Warnings(object):
             "in problems with the vocab further on in the pipeline.")
     W029 = ("Unable to align tokens with entities from character offsets. "
             "Discarding entity annotation for the text: {text}.")
+    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
+            "entities \"{entities}\". Use "
+            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            " to check the alignment. Misaligned entities ('-') will be "
+            "ignored during training.")


 @add_codes
 class Errors(object):
View File

@@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
                 break
         else:
             biluo[token.i] = missing
+    if "-" in biluo:
+        ent_str = str(entities)
+        warnings.warn(Warnings.W030.format(
+            text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
+            entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
+        ))
     return biluo

View File

@@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
     spaces = [True, True, True, True, True, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    with pytest.warns(UserWarning):
+        tags = biluo_tags_from_offsets(doc, entities)
     assert tags == ["O", "O", "O", "-", "-", "-"]