2019-09-27 21:57:13 +03:00
|
|
|
import spacy
|
|
|
|
from spacy.pipeline import EntityRecognizer, EntityRuler
|
|
|
|
from spacy.lang.en import English
|
|
|
|
from spacy.tokens import Span
|
|
|
|
from spacy.util import ensure_path
|
2020-05-19 17:20:03 +03:00
|
|
|
from spacy.pipeline.defaults import default_ner
|
2019-09-27 21:57:13 +03:00
|
|
|
|
2019-09-29 18:34:56 +03:00
|
|
|
from ..util import make_tempdir
|
|
|
|
|
2019-09-27 21:57:13 +03:00
|
|
|
|
|
|
|
def test_issue4042():
    """Check that a pipeline with an EntityRuler placed before NER round-trips.

    The pipeline is built, run once, written to a temporary directory,
    loaded back with ``spacy.load`` and run again; both runs must label the
    first entity as ``MY_ORG``.
    """
    text = "What do you think about Apple ?"
    pipeline = English()

    # Set up a statistical NER component that knows a single label.
    ner_pipe = pipeline.create_pipe("ner")
    ner_pipe.add_label("SOME_LABEL")
    pipeline.add_pipe(ner_pipe)
    pipeline.begin_training()

    # Set up the rule-based component with one phrase and one token pattern.
    entity_ruler = EntityRuler(pipeline)
    entity_ruler.add_patterns(
        [
            {"label": "MY_ORG", "pattern": "Apple"},
            {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
        ]
    )
    # The ruler goes *before* "ner"; the original note says that placing it
    # "after" works fine.
    pipeline.add_pipe(entity_ruler, before="ner")

    doc_before = pipeline(text)
    assert doc_before.ents[0].label_ == "MY_ORG"

    with make_tempdir() as tmp:
        model_dir = ensure_path(tmp)
        if not model_dir.exists():
            model_dir.mkdir()
        pipeline.to_disk(model_dir)

        # Reload while the temporary directory is still in scope.
        reloaded = spacy.load(model_dir)
        doc_after = reloaded(text)
        assert doc_after.ents[0].label_ == "MY_ORG"
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue4042_bug2():
    """
    Check that an NER component serializes correctly after gaining a label.

    This covers the second of the two bugs underlying issue 4042: a label
    introduced through an entity set on a doc must still be present after
    the component is written to disk and read back.
    """
    source_nlp = English()
    shared_vocab = source_nlp.vocab

    # Set up the NER component with a single known label.
    source_ner = source_nlp.create_pipe("ner")
    source_ner.add_label("SOME_LABEL")
    source_nlp.add_pipe(source_ner)
    source_nlp.begin_training()

    # Run the pipeline once; the component still has only its initial label.
    doc = source_nlp("What do you think about Apple ?")
    assert len(source_ner.labels) == 1
    assert "SOME_LABEL" in source_ner.labels

    # Attach an entity carrying a label the component has not seen yet.
    new_entity = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [new_entity]

    # Re-run the NER on the doc - at this point it should resize itself.
    source_ner(doc)
    assert len(source_ner.labels) == 2
    assert "SOME_LABEL" in source_ner.labels
    assert "MY_ORG" in source_ner.labels

    with make_tempdir() as tmp:
        # Serialization round-trip must keep both labels.
        target_dir = ensure_path(tmp)
        if not target_dir.exists():
            target_dir.mkdir()
        source_ner.to_disk(target_dir)

        restored_ner = EntityRecognizer(
            shared_vocab,
            default_ner(),
            learn_tokens=False,
            min_action_freq=30,
            beam_width=1,
            beam_update_prob=1.0,
        )
        restored_ner.from_disk(target_dir)
        assert len(restored_ner.labels) == 2
|