"""Regression tests for issue 4042 (serializing NER and EntityRuler components)."""
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Check that a pipeline with an EntityRuler ahead of NER round-trips via disk.

    The pipeline is built, run once, saved to a temporary directory, loaded
    back with ``spacy.load`` and run again; in both runs the first entity
    must carry the ruler's ``MY_ORG`` label.
    """
    text = "What do you think about Apple ?"
    nlp = English()

    # Statistical NER component with a single label, initialised via
    # begin_training().
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Rule-based component, inserted *before* the NER. (Placing it "after"
    # the NER was noted to work fine.)
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "MY_ORG", "pattern": "Apple"},
            {
                "label": "MY_GPE",
                "pattern": [{"lower": "san"}, {"lower": "francisco"}],
            },
        ]
    )
    nlp.add_pipe(ruler, before="ner")

    doc_before = nlp(text)
    assert doc_before.ents[0].label_ == "MY_ORG"

    with make_tempdir() as tmp:
        model_dir = ensure_path(tmp)
        if not model_dir.exists():
            model_dir.mkdir()
        nlp.to_disk(model_dir)

        # Reload while the temporary directory still exists.
        reloaded_nlp = spacy.load(model_dir)
        doc_after = reloaded_nlp(text)
        assert doc_after.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """Check that an NER component serializes correctly after gaining a label.

    This covers the second of the two bugs underlying issue 4042: a label
    introduced through a doc's entities must still be present after the
    component is written to disk and read back into a new EntityRecognizer.
    """
    nlp = English()

    # NER component that initially knows exactly one label.
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels

    # Attach an entity with a label the NER has not seen (token 5, "Apple").
    new_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = [*doc.ents, new_ent]

    # Re-running the NER on the doc should make it resize itself so that it
    # also covers the new label.
    ner(doc)
    assert len(ner.labels) == 2
    assert "SOME_LABEL" in ner.labels
    assert "MY_ORG" in ner.labels

    with make_tempdir() as tmp:
        # The disk round trip must keep both labels.
        ner_dir = ensure_path(tmp)
        if not ner_dir.exists():
            ner_dir.mkdir()
        ner.to_disk(ner_dir)

        restored_ner = EntityRecognizer(
            nlp.vocab,
            default_ner(),
            learn_tokens=False,
            min_action_freq=30,
            beam_width=1,
            beam_update_prob=1.0,
        )
        restored_ner.from_disk(ner_dir)
        assert len(restored_ner.labels) == 2