Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-04 21:50:35 +03:00)
Ensure the NER remains consistent after resizing (#4330)
* test and fix for second bug of issue 4042
* fix for first bug in 4042
* crashing test for Issue 4313
* forgot one instance of resize
* remove prints
* undo uncomment
* delete test for 4313 (uses third party lib)
* add fix for Issue 4313
* unit test for 4313
This commit is contained in:
parent 3906785b49
commit 22b9e12159
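In short: the NER's model output and its transition system's action inventory could drift out of sync when labels were added late, e.g. after training or via entities set directly on a Doc. The change consolidates the resizing logic into a single Parser._resize() helper, calls it from every code path that can register new actions, and makes EntityRuler.add_patterns() disable the pipeline components after the ruler while patterns are added, so that a not-yet-initialized or not-yet-deserialized NER isn't run on the pattern texts.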
spacy/pipeline/entityruler.py
@@ -180,21 +180,28 @@ class EntityRuler(object):
         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
-        for entry in patterns:
-            label = entry["label"]
-            if "id" in entry:
-                label = self._create_label(label, entry["id"])
-            pattern = entry["pattern"]
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
-            else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
+        try:
+            current_index = self.nlp.pipe_names.index(self.name)
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.disable_pipes(*subsequent_pipes):
+            for entry in patterns:
+                label = entry["label"]
+                if "id" in entry:
+                    label = self._create_label(label, entry["id"])
+                pattern = entry["pattern"]
+                if isinstance(pattern, basestring_):
+                    self.phrase_patterns[label].append(self.nlp(pattern))
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, None, *patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, None, *patterns)

     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
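The fix works because disable_pipes() is a context manager: the pipes named in the call are removed from the pipeline inside the with block and restored on exit. A minimal sketch of the same mechanism in user code (the pipeline composition here is illustrative, not part of the commit):

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
nlp.add_pipe(EntityRuler(nlp))
nlp.add_pipe(nlp.create_pipe("ner"))

# "ner" is disabled inside the block and re-enabled afterwards, so the
# untrained component is never run on the text.
with nlp.disable_pipes("ner"):
    doc = nlp("San Francisco")

assert nlp.pipe_names == ["entity_ruler", "ner"]  # restored after the block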
spacy/syntax/nn_parser.pyx
@@ -163,10 +163,16 @@ cdef class Parser:
             added = self.moves.add_action(action, label)
             if added:
                 resized = True
-        if resized and "nr_class" in self.cfg:
+        if resized:
+            self._resize()
+
+    def _resize(self):
+        if "nr_class" in self.cfg:
             self.cfg["nr_class"] = self.moves.n_moves
-        if self.model not in (True, False, None) and resized:
+        if self.model not in (True, False, None):
             self.model.resize_output(self.moves.n_moves)
+        if self._rehearsal_model not in (True, False, None):
+            self._rehearsal_model.resize_output(self.moves.n_moves)

     def add_multitask_objective(self, target):
         # Defined in subclasses, to avoid circular import
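With the helper in place, every path that can grow the action inventory funnels through _resize(). A sketch of the caller's view, reusing the setup from the regression tests below: adding a label after training increases moves.n_moves, and the model output is widened to match rather than left inconsistent.

from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()

# A label added after training means new transition actions; add_label()
# now calls _resize(), keeping cfg["nr_class"] and the model output in sync.
ner.add_label("NEW_LABEL")
doc = nlp("What do you think about Apple ?")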
@@ -237,7 +243,9 @@ cdef class Parser:
         if isinstance(docs, Doc):
             docs = [docs]
         if not any(len(doc) for doc in docs):
-            return self.moves.init_batch(docs)
+            result = self.moves.init_batch(docs)
+            self._resize()
+            return result
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
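The early return needed the extra step because moves.init_batch() may register actions for labels it finds on the docs; returning without _resize() would leave the model output one step behind the transition system.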
@@ -251,7 +259,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         weights = get_c_weights(model)
         for state in batch:
@@ -271,7 +279,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                 dtype='i', order='C')
@@ -445,8 +453,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
-        self._rehearsal_model.resize_output(self.moves.n_moves)
+        self._resize()
         # Prepare the stepwise model, and get the callback for finishing the batch
         tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
         model, finish_update = self.model.begin_update(docs, drop=0.0)
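Note that the two explicit resize calls collapse into one here: _resize() already covers both self.model and self._rehearsal_model, so the separate rehearsal-model line is dropped rather than rewritten.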
spacy/syntax/transition_system.pyx
@@ -63,6 +63,13 @@ cdef class TransitionSystem:
         cdef Doc doc
         beams = []
         cdef int offset = 0
+
+        # Doc objects might contain labels that we need to register actions for. We need to check for that
+        # *before* we create any Beam objects, because the Beam object needs the correct number of
+        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
+        # and it doesn't matter that we create and discard the state objects.
+        self.init_batch(docs)
+
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
             beam.initialize(self.init_beam_state, doc.length, doc.c)
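The ordering matters because each Beam must be created with the final number of actions; calling init_batch() up front triggers any pending label additions (the scenario exercised by test_issue4313 below), and the state objects it creates are simply discarded.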
spacy/tests/regression/test_issue4042.py (new file, 83 lines)
@@ -0,0 +1,83 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.tokens import Span
+from spacy.util import ensure_path
+
+
+def test_issue4042():
+    """Test that serialization of an EntityRuler before NER works fine."""
+    nlp = English()
+
+    # add ner pipe
+    ner = nlp.create_pipe("ner")
+    ner.add_label("SOME_LABEL")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    # Add entity ruler
+    ruler = EntityRuler(nlp)
+    patterns = [
+        {"label": "MY_ORG", "pattern": "Apple"},
+        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+    ]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
+    doc1 = nlp("What do you think about Apple ?")
+    assert doc1.ents[0].label_ == "MY_ORG"
+
+    with make_tempdir() as d:
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2("What do you think about Apple ?")
+        assert doc2.ents[0].label_ == "MY_ORG"
+
+
+def test_issue4042_bug2():
+    """
+    Test that serialization of an NER works fine when new labels were added.
+    This is the second bug of two bugs underlying the issue 4042.
+    """
+    nlp1 = English()
+    vocab = nlp1.vocab
+
+    # add ner pipe
+    ner1 = nlp1.create_pipe("ner")
+    ner1.add_label("SOME_LABEL")
+    nlp1.add_pipe(ner1)
+    nlp1.begin_training()
+
+    # add a new label to the doc
+    doc1 = nlp1("What do you think about Apple ?")
+    assert len(ner1.labels) == 1
+    assert "SOME_LABEL" in ner1.labels
+    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+    doc1.ents = list(doc1.ents) + [apple_ent]
+
+    # reapply the NER - at this point it should resize itself
+    ner1(doc1)
+    assert len(ner1.labels) == 2
+    assert "SOME_LABEL" in ner1.labels
+    assert "MY_ORG" in ner1.labels
+
+    with make_tempdir() as d:
+        # assert IO goes fine
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        ner1.to_disk(output_dir)
+
+        nlp2 = English(vocab)
+        ner2 = EntityRecognizer(vocab)
+        ner2.from_disk(output_dir)
+        assert len(ner2.labels) == 2
spacy/tests/regression/test_issue4313.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+
+from spacy.pipeline import EntityRecognizer
+
+from spacy.lang.en import English
+from spacy.tokens import Span
+
+
+def test_issue4313():
+    """ This should not crash or exit with some strange error code """
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    ner = EntityRecognizer(nlp.vocab)
+    ner.add_label("SOME_LABEL")
+    ner.begin_training([])
+    nlp.add_pipe(ner)
+
+    # add a new label to the doc
+    doc = nlp("What do you think about Apple ?")
+    assert len(ner.labels) == 1
+    assert "SOME_LABEL" in ner.labels
+    apple_ent = Span(doc, 5, 6, label="MY_ORG")
+    doc.ents = list(doc.ents) + [apple_ent]
+
+    # ensure the beam_parse still works with the new label
+    docs = [doc]
+    beams = nlp.entity.beam_parse(
+        docs, beam_width=beam_width, beam_density=beam_density
+    )
+
+    for doc, beam in zip(docs, beams):
+        entity_scores = defaultdict(float)
+        for score, ents in nlp.entity.moves.get_beam_parses(beam):
+            for start, end, label in ents:
+                entity_scores[(start, end, label)] += score