mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-28 01:20:38 +03:00
* ensure Language passes on valid examples for initialization * fix tagger model initialization * check for valid get_examples across components * assume labels were added before begin_training * fix senter initialization * fix morphologizer initialization * use methods to check arguments * test textcat init, requires thinc>=8.0.0a31 * fix tok2vec init * fix entity linker init * use islice * fix simple NER * cleanup debug model * fix assert statements * fix tests * throw error when adding a label if the output layer can't be resized anymore * fix test * add failing test for simple_ner * UX improvements * morphologizer UX * assume begin_training gets a representative set and processes the labels * remove assumptions for output of untrained NER model * restore test for original purpose
107 lines
3.3 KiB
Python
107 lines
3.3 KiB
Python
import pytest
|
|
from spacy.lang.en import English
|
|
from spacy.gold import Example
|
|
from spacy import util
|
|
from ..util import make_tempdir
|
|
|
|
|
|
# Minimal training set shared by the tests below. Each item is
# (text, annotations) where annotations use spaCy's dict format:
# "entities" maps to (start_char, end_char, label) offset tuples.
TRAIN_DATA = [
    ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
|
|
|
|
|
|
def test_no_label():
    """Initializing a simple_ner pipe with no labels must raise ValueError."""
    pipeline = English()
    pipeline.add_pipe("simple_ner")
    with pytest.raises(ValueError):
        pipeline.begin_training()
|
|
|
|
|
|
def test_implicit_label():
    """Initialization succeeds when one label was added explicitly and
    training examples are supplied via get_examples."""
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    ner.add_label("ORG")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]
    nlp.begin_training(get_examples=lambda: train_examples)
|
|
|
|
|
|
@pytest.mark.skip(reason="Should be fixed")
def test_untrained():
    """Calling an untrained simple_ner pipe on a doc should not crash.

    It currently does when the model emits an invalid tag sequence such
    as ['L-PERSON', 'L-ORG'] — hence the skip marker above.
    """
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    for label in ("PERSON", "LOC", "ORG"):
        ner.add_label(label)
    nlp.begin_training()
    nlp("Example sentence")
|
|
|
|
|
|
def test_resize():
    """A label added after training can be picked up by re-initializing,
    i.e. the output layer is resizable."""
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    for label in ("PERSON", "LOC"):
        ner.add_label(label)
    nlp.begin_training()
    assert len(ner.labels) == 2
    # Grow the label set and re-initialize; the new label must be kept.
    ner.add_label("ORG")
    nlp.begin_training()
    assert len(ner.labels) == 3
|
|
|
|
|
|
def test_begin_training_examples():
    """begin_training validates get_examples: it must be a callable that
    returns a non-empty iterable of Example objects."""
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]
    for _, annots in TRAIN_DATA:
        for ent in annots.get("entities"):
            ner.add_label(ent[2])
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.begin_training()
    nlp.begin_training(get_examples=lambda: train_examples)
    bad_inputs = [
        (lambda: None, TypeError),  # callable, but not returning an iterable
        (lambda: train_examples[0], TypeError),  # a single Example, not a batch
        (lambda: [], ValueError),  # empty example set
        (train_examples, ValueError),  # not a callable at all
    ]
    for get_examples, err_type in bad_inputs:
        with pytest.raises(err_type):
            nlp.begin_training(get_examples=get_examples)
|
|
|
|
|
|
def test_overfitting_IO():
    """Overfit simple_ner on two examples, then check the predictions and
    verify they are unchanged after a round-trip through disk."""
    nlp = English()
    nlp.add_pipe("simple_ner")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]
    optimizer = nlp.begin_training(get_examples=lambda: train_examples)

    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.0001

    # test the trained model
    test_text = "I like London."
    predicted = nlp(test_text).ents
    assert len(predicted) == 1
    assert predicted[0].text == "London"
    assert predicted[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded = util.load_model_from_path(tmp_dir)
        reloaded_ents = reloaded(test_text).ents
        assert len(reloaded_ents) == 1
        assert reloaded_ents[0].text == "London"
        assert reloaded_ents[0].label_ == "LOC"
|