mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Add warning for misaligned character offset spans (#5007)
* Add warning for misaligned character offset spans
* Resolve conflict
* Filter warnings in example scripts

Filter warnings in the example scripts so that each warning is shown only once, in particular the warnings about misaligned entities.

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
parent
0061992d95
commit
70da1fd2d6
|
@ -1,6 +1,7 @@
|
|||
"""Prevent catastrophic forgetting with rehearsal updates."""
|
||||
import plac
|
||||
import random
|
||||
import warnings
|
||||
import srsly
|
||||
import spacy
|
||||
from spacy.gold import GoldParse
|
||||
|
@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
|
|||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
with nlp.disable_pipes(*other_pipes):
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(raw_docs)
|
||||
|
|
|
@ -8,12 +8,13 @@ For more details, see the documentation:
|
|||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||
|
||||
Compatible with: spaCy v2.0.0+
|
||||
Last tested with: v2.1.0
|
||||
Last tested with: v2.2.4
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
|
|||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
|
||||
# reset and initialize the weights randomly – but only if we're
|
||||
# training a new model
|
||||
if model is None:
|
||||
|
|
|
@ -24,12 +24,13 @@ For more details, see the documentation:
|
|||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||
|
||||
Compatible with: spaCy v2.1.0+
|
||||
Last tested with: v2.1.0
|
||||
Last tested with: v2.2.4
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
|
@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
|||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
for itn in range(n_iter):
|
||||
|
|
|
@ -110,7 +110,11 @@ class Warnings(object):
|
|||
"in problems with the vocab further on in the pipeline.")
|
||||
W029 = ("Unable to align tokens with entities from character offsets. "
|
||||
"Discarding entity annotation for the text: {text}.")
|
||||
|
||||
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||
"entities \"{entities}\". Use "
|
||||
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||
" to check the alignment. Misaligned entities ('-') will be "
|
||||
"ignored during training.")
|
||||
|
||||
@add_codes
|
||||
class Errors(object):
|
||||
|
|
|
@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
|||
break
|
||||
else:
|
||||
biluo[token.i] = missing
|
||||
if "-" in biluo:
|
||||
ent_str = str(entities)
|
||||
warnings.warn(Warnings.W030.format(
|
||||
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
|
||||
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
|
||||
))
|
||||
return biluo
|
||||
|
||||
|
||||
|
|
|
@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
|
|||
spaces = [True, True, True, True, True, False]
|
||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
with pytest.warns(UserWarning):
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ["O", "O", "O", "-", "-", "-"]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user