mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
06f0a8daa0
* fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morpohologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix piped_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build
80 lines
2.3 KiB
Python
80 lines
2.3 KiB
Python
import spacy
|
|
from spacy.pipeline import EntityRecognizer, EntityRuler
|
|
from spacy.lang.en import English
|
|
from spacy.tokens import Span
|
|
from spacy.util import ensure_path
|
|
from spacy.ml.models.defaults import default_ner
|
|
|
|
from ..util import make_tempdir
|
|
|
|
|
|
def test_issue4042():
    """Round-trip a pipeline whose EntityRuler sits before the NER component.

    The pipeline is written to a temporary directory, loaded back with
    ``spacy.load`` and must still label "Apple" as ``MY_ORG``.
    """
    nlp = English()

    # NER component with a single label; begin_training() is called before
    # the ruler is added to the pipeline.
    recognizer = nlp.create_pipe("ner")
    recognizer.add_label("SOME_LABEL")
    nlp.add_pipe(recognizer)
    nlp.begin_training()

    # Entity ruler with one phrase pattern and one token pattern.
    entity_patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler = EntityRuler(nlp)
    ruler.add_patterns(entity_patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"

    text = "What do you think about Apple ?"
    doc_before = nlp(text)
    assert doc_before.ents[0].label_ == "MY_ORG"

    with make_tempdir() as tmp:
        model_dir = ensure_path(tmp)
        if not model_dir.exists():
            model_dir.mkdir()
        nlp.to_disk(model_dir)

        # The reloaded pipeline must produce the same first entity label.
        reloaded_nlp = spacy.load(model_dir)
        doc_after = reloaded_nlp(text)
        assert doc_after.ents[0].label_ == "MY_ORG"
|
|
|
|
|
|
def test_issue4042_bug2():
    """Round-trip an NER component after a new label was added to it.

    This covers the second of the two bugs underlying issue 4042: the
    component is saved with ``to_disk`` and a fresh ``EntityRecognizer``
    reading it back with ``from_disk`` must report both labels.
    """
    nlp1 = English()
    shared_vocab = nlp1.vocab

    # NER component that starts out with exactly one label.
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # Process a text, then attach an entity whose label the NER has not seen.
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_span = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = [*doc1.ents, apple_span]

    # Reapply the NER - at this point it should resize itself.
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as tmp:
        # Assert that IO goes fine.
        component_dir = ensure_path(tmp)
        if not component_dir.exists():
            component_dir.mkdir()
        ner1.to_disk(component_dir)

        # A fresh recognizer built on the default NER model reads the data back.
        ner2 = EntityRecognizer(shared_vocab, default_ner())
        ner2.from_disk(component_dir)
        assert len(ner2.labels) == 2
|