2020-04-02 15:46:32 +03:00
|
|
|
import pytest
|
2020-10-13 22:07:13 +03:00
|
|
|
from numpy.testing import assert_equal
|
2020-04-02 15:46:32 +03:00
|
|
|
|
|
|
|
from spacy import util
|
2020-09-09 11:31:03 +03:00
|
|
|
from spacy.training import Example
|
2020-04-02 15:46:32 +03:00
|
|
|
from spacy.lang.en import English
|
|
|
|
from spacy.language import Language
|
|
|
|
from spacy.tests.util import make_tempdir
|
2020-07-19 12:10:51 +03:00
|
|
|
from spacy.morphology import Morphology
|
2020-10-13 22:07:13 +03:00
|
|
|
from spacy.attrs import MORPH
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
from spacy.tokens import Doc
|
2020-04-02 15:46:32 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_label_types():
    """Check that a string label is accepted and an integer label raises ValueError."""
    pipeline = Language()
    component = pipeline.add_pipe("morphologizer")
    component.add_label("Feat=A")
    # A non-string label is expected to be rejected.
    with pytest.raises(ValueError):
        component.add_label(9)
|
2020-04-02 15:46:32 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Shared training fixtures: each entry is a (text, annotations) pair where
# "morphs" and "pos" carry one value per whitespace-separated word of the text.
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"],
            "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
        },
    ),
    # test combinations of morph+POS
    (
        "Eat blue ham",
        {
            "morphs": ["Feat=V", "", ""],
            "pos": ["", "ADJ", ""],
        },
    ),
]
|
|
|
|
|
|
|
|
|
2020-09-08 23:44:25 +03:00
|
|
|
def test_no_label():
    """Check that initializing a morphologizer with no labels and no examples raises ValueError."""
    pipeline = Language()
    pipeline.add_pipe("morphologizer")
    # Nothing to infer labels from, so initialization is expected to fail.
    with pytest.raises(ValueError):
        pipeline.initialize()
|
2020-09-08 23:44:25 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_implicit_label():
    """Check that initialization succeeds when only training examples are supplied."""
    pipeline = Language()
    pipeline.add_pipe("morphologizer")
    # No add_label() calls here: the examples are the only input to initialize().
    examples = [
        Example.from_dict(pipeline.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]
    pipeline.initialize(get_examples=lambda: examples)
|
2020-09-08 23:44:25 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_no_resize():
    """Check that adding a label after initialization raises ValueError."""
    pipeline = Language()
    component = pipeline.add_pipe("morphologizer")
    # Labels registered before initialization are accepted.
    for pos in ("NOUN", "VERB"):
        component.add_label("POS" + Morphology.FIELD_SEP + pos)
    pipeline.initialize()
    # this throws an error because the morphologizer can't be resized after initialization
    with pytest.raises(ValueError):
        component.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
|
|
|
|
|
|
|
|
|
2020-09-28 22:35:09 +03:00
|
|
|
def test_initialize_examples():
    """Check which `get_examples` arguments `nlp.initialize` accepts and rejects.

    Calling it with no argument, or with a callable returning the examples, is
    expected to succeed. A callable returning None, or the bare list instead of
    a callable, is expected to raise TypeError.
    """
    pipeline = Language()
    component = pipeline.add_pipe("morphologizer")
    component.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
    examples = [
        Example.from_dict(pipeline.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]
    # you shouldn't really call this more than once, but for testing it should be fine
    pipeline.initialize()
    pipeline.initialize(get_examples=lambda: examples)
    # A callable that yields nothing usable.
    with pytest.raises(TypeError):
        pipeline.initialize(get_examples=lambda: None)
    # The examples themselves rather than a callable producing them.
    with pytest.raises(TypeError):
        pipeline.initialize(get_examples=examples)
|
2020-09-08 23:44:25 +03:00
|
|
|
|
|
|
|
|
2020-04-02 15:46:32 +03:00
|
|
|
def _overfit_morphologizer(nlp, train_examples, n_iter=50):
    """Initialize `nlp` from `train_examples`, run `n_iter` updates and return
    the "morphologizer" loss recorded by the final update."""
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    losses = {}
    for _ in range(n_iter):
        # Reset per iteration so only the last update's loss is reported.
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    return losses["morphologizer"]


def test_overfitting_IO():
    """Overfit the morphologizer on TRAIN_DATA and check its predictions.

    The test walks through several scenarios in sequence, reusing (and
    mutating) the same training examples:

    1. train on morph+POS annotation, check predictions, and check they are
       unchanged after a to_disk / load_model_from_path round trip;
    2. check that `nlp.pipe` and `nlp(...)` produce identical MORPH arrays;
    3. retrain with all POS annotation blanked;
    4. exercise the `overwrite` / `extend` settings on pre-annotated docs;
    5. retrain with morphs unset and POS kept only on the token "ham".
    """
    # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
    nlp = English()
    nlp.add_pipe("morphologizer")
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]
    assert _overfit_morphologizer(nlp, train_examples) < 0.00001

    # test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert [str(t.morph) for t in doc2] == gold_morphs
        assert [t.pos_ for t in doc2] == gold_pos_tags

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_morphs_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    batch_morphs_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    no_batch_morphs = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_morphs_1, batch_morphs_2)
    assert_equal(batch_morphs_1, no_batch_morphs)

    # Test without POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            token.pos_ = ""
    assert _overfit_morphologizer(nlp, train_examples) < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Test overwrite+extend settings
    # (note that "" is unset, "_" is set and empty)
    morphs = ["Feat=V", "Feat=N", "_"]
    doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
    orig_morphs = [str(t.morph) for t in doc]
    orig_pos_tags = [t.pos_ for t in doc]
    morphologizer = nlp.get_pipe("morphologizer")

    # don't overwrite or extend
    morphologizer.cfg["overwrite"] = False
    doc = morphologizer(doc)
    assert [str(t.morph) for t in doc] == orig_morphs
    assert [t.pos_ for t in doc] == orig_pos_tags

    # overwrite and extend
    morphologizer.cfg["overwrite"] = True
    morphologizer.cfg["extend"] = True
    doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
    doc = morphologizer(doc)
    assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]

    # extend without overwriting
    morphologizer.cfg["overwrite"] = False
    morphologizer.cfg["extend"] = True
    doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
    doc = morphologizer(doc)
    assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]

    # overwrite without extending
    morphologizer.cfg["overwrite"] = True
    morphologizer.cfg["extend"] = False
    doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
    doc = morphologizer(doc)
    assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]

    # Test with unset morph and partial POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            # Keep POS only on "ham"; every token's morph is unset.
            if token.text == "ham":
                token.pos_ = "NOUN"
            else:
                token.pos_ = ""
            token.set_morph(None)
    assert _overfit_morphologizer(nlp, train_examples) < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["", "", "", ""]
    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
|