Add warning for misaligned character offset spans (#5007)

* Add warning for misaligned character offset spans

* Resolve conflict

* Filter warnings in example scripts

Filter warnings in example scripts to show warnings once, in particular
warnings about misaligned entities.

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
adrianeboyd 2020-05-19 16:01:18 +02:00 committed by GitHub
parent 0061992d95
commit 70da1fd2d6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 32 additions and 7 deletions

View File

@ -1,6 +1,7 @@
"""Prevent catastrophic forgetting with rehearsal updates."""
import plac
import random
import warnings
import srsly
import spacy
from spacy.gold import GoldParse
@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
sizes = compounding(1.0, 4.0, 1.001)
with nlp.disable_pipes(*other_pipes):
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
random.shuffle(raw_docs)

View File

@ -8,12 +8,13 @@ For more details, see the documentation:
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train NER
# only train NER
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
# reset and initialize the weights randomly but only if we're
# training a new model
if model is None:

View File

@ -24,12 +24,13 @@ For more details, see the documentation:
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.1.0+
Last tested with: v2.1.0
Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train NER
# only train NER
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
sizes = compounding(1.0, 4.0, 1.001)
# batch up the examples using spaCy's minibatch
for itn in range(n_iter):

View File

@ -110,7 +110,11 @@ class Warnings(object):
"in problems with the vocab further on in the pipeline.")
W029 = ("Unable to align tokens with entities from character offsets. "
"Discarding entity annotation for the text: {text}.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
@add_codes
class Errors(object):

View File

@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
break
else:
biluo[token.i] = missing
if "-" in biluo:
ent_str = str(entities)
warnings.warn(Warnings.W030.format(
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
))
return biluo

View File

@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
spaces = [True, True, True, True, True, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
with pytest.warns(UserWarning):
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ["O", "O", "O", "-", "-", "-"]