Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-04 21:50:35 +03:00)
Ensure the NER remains consistent after resizing (#4330)
* test and fix for second bug of issue 4042
* fix for first bug in 4042
* crashing test for Issue 4313
* forgot one instance of resize
* remove prints
* undo uncomment
* delete test for 4313 (uses third party lib)
* add fix for Issue 4313
* unit test for 4313
This commit is contained in:
parent 3906785b49
commit 22b9e12159
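In short: the NER's model output and its transition system's action inventory could drift out of sync when labels were added late, e.g. after training or via entities set directly on a Doc. The change consolidates the resizing logic into a single Parser._resize() helper, calls it from every code path that can register new actions, and makes EntityRuler.add_patterns() disable the pipeline components after the ruler while patterns are added, so that a not-yet-initialized or not-yet-deserialized NER isn't run on the pattern texts.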
spacy/pipeline/entityruler.py
@@ -180,21 +180,28 @@ class EntityRuler(object):
         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
-        for entry in patterns:
-            label = entry["label"]
-            if "id" in entry:
-                label = self._create_label(label, entry["id"])
-            pattern = entry["pattern"]
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
-            else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
+        try:
+            current_index = self.nlp.pipe_names.index(self.name)
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.disable_pipes(*subsequent_pipes):
+            for entry in patterns:
+                label = entry["label"]
+                if "id" in entry:
+                    label = self._create_label(label, entry["id"])
+                pattern = entry["pattern"]
+                if isinstance(pattern, basestring_):
+                    self.phrase_patterns[label].append(self.nlp(pattern))
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, None, *patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, None, *patterns)

     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
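The fix works because disable_pipes() is a context manager: the pipes named in the call are removed from the pipeline inside the with block and restored on exit. A minimal sketch of the same mechanism in user code (the pipeline composition here is illustrative, not part of the commit):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
nlp.add_pipe(EntityRuler(nlp))
nlp.add_pipe(nlp.create_pipe("ner"))

# "ner" is disabled inside the block and re-enabled afterwards, so the
# untrained component is never run on the text.
with nlp.disable_pipes("ner"):
    doc = nlp("San Francisco")

assert nlp.pipe_names == ["entity_ruler", "ner"]  # restored after the block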
spacy/syntax/nn_parser.pyx
@@ -163,10 +163,16 @@ cdef class Parser:
             added = self.moves.add_action(action, label)
             if added:
                 resized = True
-        if resized and "nr_class" in self.cfg:
+        if resized:
+            self._resize()
+
+    def _resize(self):
+        if "nr_class" in self.cfg:
             self.cfg["nr_class"] = self.moves.n_moves
-        if self.model not in (True, False, None) and resized:
+        if self.model not in (True, False, None):
             self.model.resize_output(self.moves.n_moves)
+        if self._rehearsal_model not in (True, False, None):
+            self._rehearsal_model.resize_output(self.moves.n_moves)

     def add_multitask_objective(self, target):
         # Defined in subclasses, to avoid circular import
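With the helper in place, every path that can grow the action inventory funnels through _resize(). A sketch of the caller's view, reusing the setup from the regression tests below: adding a label after training increases moves.n_moves, and the model output is widened to match rather than left inconsistent.

from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()

# A label added after training means new transition actions; add_label()
# now calls _resize(), keeping cfg["nr_class"] and the model output in sync.
ner.add_label("NEW_LABEL")
doc = nlp("What do you think about Apple ?")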
@@ -237,7 +243,9 @@ cdef class Parser:
         if isinstance(docs, Doc):
             docs = [docs]
         if not any(len(doc) for doc in docs):
-            return self.moves.init_batch(docs)
+            result = self.moves.init_batch(docs)
+            self._resize()
+            return result
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
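The early return needed the extra step because moves.init_batch() may register actions for labels it finds on the docs; returning without _resize() would leave the model output one step behind the transition system.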
@@ -251,7 +259,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         weights = get_c_weights(model)
         for state in batch:
@@ -271,7 +279,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                 dtype='i', order='C')
@@ -445,8 +453,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
-        self._rehearsal_model.resize_output(self.moves.n_moves)
+        self._resize()
         # Prepare the stepwise model, and get the callback for finishing the batch
         tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
         model, finish_update = self.model.begin_update(docs, drop=0.0)
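Note that the two explicit resize calls collapse into one here: _resize() already covers both self.model and self._rehearsal_model, so the separate rehearsal-model line is dropped rather than rewritten.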
spacy/syntax/transition_system.pyx
@@ -63,6 +63,13 @@ cdef class TransitionSystem:
         cdef Doc doc
         beams = []
         cdef int offset = 0
+
+        # Doc objects might contain labels that we need to register actions for. We need to check for that
+        # *before* we create any Beam objects, because the Beam object needs the correct number of
+        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
+        # and it doesn't matter that we create and discard the state objects.
+        self.init_batch(docs)
+
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
             beam.initialize(self.init_beam_state, doc.length, doc.c)
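The ordering matters because each Beam must be created with the final number of actions; calling init_batch() up front triggers any pending label additions (the scenario exercised by test_issue4313 below), and the state objects it creates are simply discarded.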
spacy/tests/regression/test_issue4042.py (new file, 83 lines)
@@ -0,0 +1,83 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.tokens import Span
+from spacy.util import ensure_path
+
+
+def test_issue4042():
+    """Test that serialization of an EntityRuler before NER works fine."""
+    nlp = English()
+
+    # add ner pipe
+    ner = nlp.create_pipe("ner")
+    ner.add_label("SOME_LABEL")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    # Add entity ruler
+    ruler = EntityRuler(nlp)
+    patterns = [
+        {"label": "MY_ORG", "pattern": "Apple"},
+        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+    ]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
+    doc1 = nlp("What do you think about Apple ?")
+    assert doc1.ents[0].label_ == "MY_ORG"
+
+    with make_tempdir() as d:
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2("What do you think about Apple ?")
+        assert doc2.ents[0].label_ == "MY_ORG"
+
+
+def test_issue4042_bug2():
+    """
+    Test that serialization of an NER works fine when new labels were added.
+    This is the second bug of two bugs underlying the issue 4042.
+    """
+    nlp1 = English()
+    vocab = nlp1.vocab
+
+    # add ner pipe
+    ner1 = nlp1.create_pipe("ner")
+    ner1.add_label("SOME_LABEL")
+    nlp1.add_pipe(ner1)
+    nlp1.begin_training()
+
+    # add a new label to the doc
+    doc1 = nlp1("What do you think about Apple ?")
+    assert len(ner1.labels) == 1
+    assert "SOME_LABEL" in ner1.labels
+    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+    doc1.ents = list(doc1.ents) + [apple_ent]
+
+    # reapply the NER - at this point it should resize itself
+    ner1(doc1)
+    assert len(ner1.labels) == 2
+    assert "SOME_LABEL" in ner1.labels
+    assert "MY_ORG" in ner1.labels
+
+    with make_tempdir() as d:
+        # assert IO goes fine
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        ner1.to_disk(output_dir)
+
+        nlp2 = English(vocab)
+        ner2 = EntityRecognizer(vocab)
+        ner2.from_disk(output_dir)
+        assert len(ner2.labels) == 2
spacy/tests/regression/test_issue4313.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+
+from spacy.pipeline import EntityRecognizer
+
+from spacy.lang.en import English
+from spacy.tokens import Span
+
+
+def test_issue4313():
+    """ This should not crash or exit with some strange error code """
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    ner = EntityRecognizer(nlp.vocab)
+    ner.add_label("SOME_LABEL")
+    ner.begin_training([])
+    nlp.add_pipe(ner)
+
+    # add a new label to the doc
+    doc = nlp("What do you think about Apple ?")
+    assert len(ner.labels) == 1
+    assert "SOME_LABEL" in ner.labels
+    apple_ent = Span(doc, 5, 6, label="MY_ORG")
+    doc.ents = list(doc.ents) + [apple_ent]
+
+    # ensure the beam_parse still works with the new label
+    docs = [doc]
+    beams = nlp.entity.beam_parse(
+        docs, beam_width=beam_width, beam_density=beam_density
+    )
+
+    for doc, beam in zip(docs, beams):
+        entity_scores = defaultdict(float)
+        for score, ents in nlp.entity.moves.get_beam_parses(beam):
+            for start, end, label in ents:
+                entity_scores[(start, end, label)] += score