mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Add warning for misaligned character offset spans (#5007)
* Add warning for misaligned character offset spans
* Resolve conflict
* Filter warnings in example scripts

Filter warnings in example scripts to show warnings once, in particular warnings about misaligned entities.

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
parent
0061992d95
commit
70da1fd2d6
|
@ -1,6 +1,7 @@
|
||||||
"""Prevent catastrophic forgetting with rehearsal updates."""
|
"""Prevent catastrophic forgetting with rehearsal updates."""
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
import srsly
|
import srsly
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
@ -66,7 +67,10 @@ def main(model_name, unlabelled_loc):
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
sizes = compounding(1.0, 4.0, 1.001)
|
sizes = compounding(1.0, 4.0, 1.001)
|
||||||
with nlp.disable_pipes(*other_pipes):
|
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
random.shuffle(TRAIN_DATA)
|
random.shuffle(TRAIN_DATA)
|
||||||
random.shuffle(raw_docs)
|
random.shuffle(raw_docs)
|
||||||
|
|
|
@ -8,12 +8,13 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.0.0+
|
Compatible with: spaCy v2.0.0+
|
||||||
Last tested with: v2.1.0
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
@ -57,7 +58,11 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
# only train NER
|
||||||
|
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
# reset and initialize the weights randomly – but only if we're
|
# reset and initialize the weights randomly – but only if we're
|
||||||
# training a new model
|
# training a new model
|
||||||
if model is None:
|
if model is None:
|
||||||
|
|
|
@ -24,12 +24,13 @@ For more details, see the documentation:
|
||||||
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
* NER: https://spacy.io/usage/linguistic-features#named-entities
|
||||||
|
|
||||||
Compatible with: spaCy v2.1.0+
|
Compatible with: spaCy v2.1.0+
|
||||||
Last tested with: v2.1.0
|
Last tested with: v2.2.4
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
@ -97,7 +98,11 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
||||||
# get names of other pipes to disable them during training
|
# get names of other pipes to disable them during training
|
||||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||||
with nlp.disable_pipes(*other_pipes): # only train NER
|
# only train NER
|
||||||
|
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||||
|
# show warnings for misaligned entity spans once
|
||||||
|
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||||
|
|
||||||
sizes = compounding(1.0, 4.0, 1.001)
|
sizes = compounding(1.0, 4.0, 1.001)
|
||||||
# batch up the examples using spaCy's minibatch
|
# batch up the examples using spaCy's minibatch
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
|
|
|
@ -110,7 +110,11 @@ class Warnings(object):
|
||||||
"in problems with the vocab further on in the pipeline.")
|
"in problems with the vocab further on in the pipeline.")
|
||||||
W029 = ("Unable to align tokens with entities from character offsets. "
|
W029 = ("Unable to align tokens with entities from character offsets. "
|
||||||
"Discarding entity annotation for the text: {text}.")
|
"Discarding entity annotation for the text: {text}.")
|
||||||
|
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||||
|
"entities \"{entities}\". Use "
|
||||||
|
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||||
|
" to check the alignment. Misaligned entities ('-') will be "
|
||||||
|
"ignored during training.")
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
class Errors(object):
|
class Errors(object):
|
||||||
|
|
|
@ -957,6 +957,12 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
biluo[token.i] = missing
|
biluo[token.i] = missing
|
||||||
|
if "-" in biluo:
|
||||||
|
ent_str = str(entities)
|
||||||
|
warnings.warn(Warnings.W030.format(
|
||||||
|
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
|
||||||
|
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str
|
||||||
|
))
|
||||||
return biluo
|
return biluo
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -56,7 +56,8 @@ def test_gold_biluo_misalign(en_vocab):
|
||||||
spaces = [True, True, True, True, True, False]
|
spaces = [True, True, True, True, True, False]
|
||||||
doc = Doc(en_vocab, words=words, spaces=spaces)
|
doc = Doc(en_vocab, words=words, spaces=spaces)
|
||||||
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
|
||||||
tags = biluo_tags_from_offsets(doc, entities)
|
with pytest.warns(UserWarning):
|
||||||
|
tags = biluo_tags_from_offsets(doc, entities)
|
||||||
assert tags == ["O", "O", "O", "-", "-", "-"]
|
assert tags == ["O", "O", "O", "-", "-", "-"]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user