mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-08 16:26:37 +03:00
c0f4a1e43b
* verbose and tag_map options * adding init_tok2vec option and only changing the tok2vec that is specified * adding omit_extra_lookups and verifying textcat config * wip * pretrain bugfix * add replace and resume options * train_textcat fix * raw text functionality * improve UX when KeyError or when input data can't be parsed * avoid unnecessary access to goldparse in TextCat pipe * save performance information in nlp.meta * add noise_level to config * move nn_parser's defaults to config file * multitask in config - doesn't work yet * scorer offering both F and AUC options, need to be specified in config * add textcat verification code from old train script * small fixes to config files * clean up * set default config for ner/parser to allow create_pipe to work as before * two more test fixes * small fixes * cleanup * fix NER pickling + additional unit test * create_pipe as before
81 lines
2.4 KiB
Python
81 lines
2.4 KiB
Python
import spacy
|
|
from spacy.pipeline import EntityRecognizer, EntityRuler
|
|
from spacy.lang.en import English
|
|
from spacy.tokens import Span
|
|
from spacy.util import ensure_path
|
|
from spacy.pipeline.defaults import default_ner
|
|
|
|
from ..util import make_tempdir
|
|
|
|
|
|
def test_issue4042():
    """Check that a pipeline with an EntityRuler placed before NER survives a disk round-trip."""
    text = "What do you think about Apple ?"
    nlp = English()

    # Create an NER component with a single label, register it and
    # initialize the pipeline.
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Build the entity ruler with one phrase pattern and one token pattern.
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "MY_ORG", "pattern": "Apple"},
            {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
        ]
    )
    # The ruler goes *before* the NER; placing it "after" works fine.
    nlp.add_pipe(ruler, before="ner")

    doc_before = nlp(text)
    assert doc_before.ents[0].label_ == "MY_ORG"

    with make_tempdir() as tmp_dir:
        model_dir = ensure_path(tmp_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        nlp.to_disk(model_dir)

        # Reload while the temporary directory still exists and verify that
        # the ruler's entity is still the first one found.
        reloaded_nlp = spacy.load(model_dir)
        doc_after = reloaded_nlp(text)
        assert doc_after.ents[0].label_ == "MY_ORG"
|
|
|
|
|
|
def test_issue4042_bug2():
    """
    Check that an NER component serializes correctly after new labels were added.

    This covers the second of the two bugs underlying issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # Create an NER component with one initial label, register it and
    # initialize the pipeline.
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels

    # Attach an entity to the doc whose label is not yet among the NER labels.
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = [*doc1.ents, apple_ent]

    # Run the NER on the doc again - at this point it should resize itself
    # so that the new label is included.
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as tmp_dir:
        # Writing the component to disk must succeed.
        component_dir = ensure_path(tmp_dir)
        if not component_dir.exists():
            component_dir.mkdir()
        ner1.to_disk(component_dir)

        # Load into a freshly constructed recognizer that shares the vocab;
        # both labels must be present after loading.
        ner2 = EntityRecognizer(
            vocab,
            default_ner(),
            learn_tokens=False,
            min_action_freq=30,
            beam_width=1,
            beam_update_prob=1.0,
        )
        ner2.from_disk(component_dir)
        assert len(ner2.labels) == 2
|