spaCy/spacy/tests/pipeline/test_simple_ner.py


import pytest
from spacy.lang.en import English
from spacy.gold import Example
from spacy import util

from ..util import make_tempdir

TRAIN_DATA = [
    ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
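

# Initializing a simple_ner component that has no labels (and no examples
# to infer them from) should fail with a ValueError.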
def test_no_label():
    nlp = English()
    nlp.add_pipe("simple_ner")
    with pytest.raises(ValueError):
        nlp.begin_training()
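

# Labels that aren't added explicitly should be read off the training
# examples passed to begin_training.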
def test_implicit_label():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    ner.add_label("ORG")
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    nlp.begin_training(get_examples=lambda: train_examples)
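

# simple_ner predicts one BILUO tag per token, and nothing constrains an
# untrained model to emit a well-formed tag sequence.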
@pytest.mark.skip(reason="Should be fixed")
def test_untrained():
    # This shouldn't crash, but it does when simple_ner produces an invalid
    # sequence like ['L-PERSON', 'L-ORG']
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    ner.add_label("PERSON")
    ner.add_label("LOC")
    ner.add_label("ORG")
    nlp.begin_training()
    nlp("Example sentence")
def test_resize():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    ner.add_label("PERSON")
    ner.add_label("LOC")
    nlp.begin_training()
    assert len(ner.labels) == 2
    ner.add_label("ORG")
    nlp.begin_training()
    assert len(ner.labels) == 3
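

# begin_training should validate its get_examples argument: it must be a
# callable that returns a non-empty iterable of Example objects.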
def test_begin_training_examples():
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.begin_training()
    nlp.begin_training(get_examples=lambda: train_examples)
    with pytest.raises(TypeError):
        nlp.begin_training(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.begin_training(get_examples=lambda: train_examples[0])
    with pytest.raises(ValueError):
        nlp.begin_training(get_examples=lambda: [])
    with pytest.raises(ValueError):
        nlp.begin_training(get_examples=train_examples)
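

# End-to-end check: the component should overfit the tiny training set, and
# its predictions should survive a serialization round-trip to disk.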
def test_overfitting_IO():
    # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly
    nlp = English()
    ner = nlp.add_pipe("simple_ner")
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.0001

    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"