spaCy/spacy/tests/regression/test_issue3001-3500.py

284 lines
9.5 KiB
Python
Raw Normal View History

2019-07-10 13:49:18 +03:00
import pytest
from spacy.lang.en import English
from spacy.lang.de import German
Default settings to configurations (#4995) * fix grad_clip naming * cleaning up pretrained_vectors out of cfg * further refactoring Model init's * move Model building out of pipes * further refactor to require a model config when creating a pipe * small fixes * making cfg in nn_parser more consistent * fixing nr_class for parser * fixing nn_parser's nO * fix printing of loss * architectures in own file per type, consistent naming * convenience methods default_tagger_config and default_tok2vec_config * let create_pipe access default config if available for that component * default_parser_config * move defaults to separate folder * allow reading nlp from package or dir with argument 'name' * architecture spacy.VocabVectors.v1 to read static vectors from file * cleanup * default configs for nel, textcat, morphologizer, tensorizer * fix imports * fixing unit tests * fixes and clean up * fixing defaults, nO, fix unit tests * restore parser IO * fix IO * 'fix' serialization test * add *.cfg to manifest * fix example configs with additional arguments * replace Morpohologizer with Tagger * add IO bit when testing overfitting of tagger (currently failing) * fix IO - don't initialize when reading from disk * expand overfitting tests to also check IO goes OK * remove dropout from HashEmbed to fix Tagger performance * add defaults for sentrec * update thinc * always pass a Model instance to a Pipe * fix piped_added statement * remove obsolete W029 * remove obsolete errors * restore byte checking tests (work again) * clean up test * further test cleanup * convert from config to Model in create_pipe * bring back error when component is not initialized * cleanup * remove calls for nlp2.begin_training * use thinc.api in imports * allow setting charembed's nM and nC * fix for hardcoded nM/nC + unit test * formatting fixes * trigger build
2020-02-27 20:42:27 +03:00
from spacy.ml.models.defaults import default_ner
2019-07-10 13:49:18 +03:00
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle
2019-07-10 13:49:18 +03:00
from spacy import displacy
from spacy.util import decaying
import numpy
from spacy.vectors import Vectors
2019-07-10 13:49:18 +03:00
from ..util import get_doc
def test_issue3002():
    """Regression test for #3002: the German tokenizer must not hang on a
    long dot-separated run of digits and should split the text into 5 tokens.
    """
    long_number_text = (
        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    )
    german = German()
    tokens = german(long_number_text)
    assert len(tokens) == 5
def test_issue3009(en_vocab):
    """Regression test for #3009: Matcher quantifiers ("*" and "?") placed
    between fixed tokens must still allow the pattern to match.

    Three variants of the "have to do <IN>" pattern are registered one by
    one; after each registration the matcher must find something in the doc.
    """

    def have_to_do(*optional_tokens):
        # Builds fresh dicts on every call so no pattern shares token specs.
        prefix = [{"LEMMA": "have"}]
        suffix = [{"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}]
        return prefix + list(optional_tokens) + suffix

    def ascii_non_punct(op):
        # Filler token spec carrying the quantifier under test.
        return {"IS_ASCII": True, "IS_PUNCT": False, "OP": op}

    patterns = [
        have_to_do(),
        have_to_do(ascii_non_punct("*")),
        have_to_do(ascii_non_punct("?")),
    ]
    doc = get_doc(
        en_vocab,
        words=["also", "has", "to", "do", "with"],
        tags=["RB", "VBZ", "TO", "VB", "IN"],
    )
    matcher = Matcher(en_vocab)
    for rule_index, pattern in enumerate(patterns):
        matcher.add(str(rule_index), [pattern])
        # Rules accumulate in the matcher, so this checks rules 0..rule_index.
        assert matcher(doc)
def test_issue3012(en_vocab):
    """Regression test for #3012: calling from_array with entity-only
    attributes must not wipe the existing tag information, and the token
    annotations must survive a to_bytes / from_bytes round trip.
    """

    def annotations(token):
        # The four per-token values this test cares about.
        return (token.text, token.pos_, token.tag_, token.ent_type_)

    doc = get_doc(
        en_vocab,
        words=["This", "is", "10", "%", "."],
        tags=["DT", "VBZ", "CD", "NN", "."],
        pos=["DET", "VERB", "NUM", "NOUN", "PUNCT"],
        ents=[(2, 4, "PERCENT")],
    )
    assert doc.is_tagged
    expected = ("10", "NUM", "CD", "PERCENT")
    assert annotations(doc[2]) == expected

    # Export only the entity columns and load them straight back in.
    entity_attrs = [ENT_IOB, ENT_TYPE]
    entity_values = doc.to_array(entity_attrs)
    doc.from_array(entity_attrs, entity_values)
    assert annotations(doc[2]) == expected

    # Serializing then deserializing
    restored = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert annotations(restored[2]) == expected
def test_issue3199():
    """Regression test for #3199: Span.noun_chunks must yield nothing (rather
    than fail) when no noun chunks iterator is available.

    A Doc over a brand-new Vocab is used to keep the test future-proof, and
    is_parsed is set so that the noun chunks code actually runs.
    """
    words = ["This", "is", "a", "sentence"]
    doc = Doc(Vocab(), words=words)
    doc.is_parsed = True
    span = doc[0:3]
    chunks = list(span.noun_chunks)
    assert chunks == []
def test_issue3209():
    """Regression test for #3209 (seen in spaCy nightly): NER labels added
    with ner.add_label() were mapped to the wrong classes after the model
    was loaded again.
    """
    expected_moves = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]

    # Source pipeline: one custom label, then initialize.
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("ANIMAL")
    nlp.begin_training()
    assert ner.move_names == expected_moves

    # Target pipeline: resize the output layer to the source's number of
    # moves before loading the serialized pipeline into it.
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    target_ner = nlp2.get_pipe("ner")
    target_ner.model.resize_output(ner.moves.n_moves)
    serialized = nlp.to_bytes()
    nlp2.from_bytes(serialized)
    assert nlp2.get_pipe("ner").move_names == expected_moves
def test_issue3248_1():
    """Regression test for #3248: len(PhraseMatcher) is the number of rules,
    not the total number of patterns across all rules.
    """
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = [("TEST1", ["a", "b", "c"]), ("TEST2", ["d"])]
    for rule_name, phrases in rules:
        matcher.add(rule_name, [nlp(phrase) for phrase in phrases])
    # Two rules, four patterns in total.
    assert len(matcher) == 2
def test_issue3248_2():
    """Regression test for #3248: a PhraseMatcher survives a pickle round
    trip with the same number of rules.
    """
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    rules = [("TEST1", ["a", "b", "c"]), ("TEST2", ["d"])]
    for rule_name, phrases in rules:
        matcher.add(rule_name, [nlp(phrase) for phrase in phrases])
    restored = pickle.loads(pickle.dumps(matcher))
    assert len(restored) == len(matcher)
def test_issue3277(es_tokenizer):
    """Test that hyphens are split correctly as prefixes.

    The sentence opens with an em dash (U+2014) and wraps the reporting
    clause in en dashes (U+2013). Each dash must become its own token, which
    gives 14 tokens with the en dashes at indices 5 and 9.
    """
    # The en dashes are written as escapes so they cannot be silently lost:
    # without them the text has only 12 tokens and the index checks below
    # cannot hold.
    text = "—Yo me llamo... \u2013murmuró el niño\u2013 Emilio Sánchez Pérez."
    doc = es_tokenizer(text)
    assert len(doc) == 14
    assert doc[0].text == "\u2014"
    assert doc[5].text == "\u2013"
    assert doc[9].text == "\u2013"
def test_issue3288(en_vocab):
    """Regression test for #3288: rendering with displaCy must work when
    retokenization merges punctuation onto the preceding token and the
    doc's tensor has to be resized.
    """
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    doc = get_doc(
        en_vocab,
        words=words,
        heads=[1, 0, -1, 1, 0, 1, -2, -3],
        deps=["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"],
    )
    # One 96-wide row per token.
    tensor_shape = (len(words), 96)
    doc.tensor = numpy.zeros(tensor_shape, dtype="float32")
    # Passes as long as rendering does not raise.
    displacy.render(doc)
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model.

    The textcat component is added without calling begin_training, the
    pipeline is serialized, and the bytes are loaded into a second,
    independently built pipeline. The test passes if nothing raises.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    # Create the component from the pipeline that will host it, so the
    # receiving pipeline is built on its own rather than from the first one.
    new_nlp.add_pipe(new_nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
def test_issue3328(en_vocab):
    """Regression test for #3328: two single-token patterns using the "IN"
    set operator on LOWER, registered under one rule, produce one match per
    matching token, in document order.
    """
    words = ["Hello", ",", "how", "are", "you", "doing", "?"]
    doc = Doc(en_vocab, words=words)
    matcher = Matcher(en_vocab)
    matcher.add(
        "TEST",
        [
            [{"LOWER": {"IN": ["hello", "how"]}}],
            [{"LOWER": {"IN": ["you", "doing"]}}],
        ],
    )
    matches = matcher(doc)
    assert len(matches) == 4
    spans = [doc[start:end] for _match_id, start, end in matches]
    assert [span.text for span in spans] == ["Hello", "how", "you", "doing"]
def test_issue3331(en_vocab):
    """Regression test for #3331: the same phrase registered under two
    different rules must be reported once per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    for rule_name in ("A", "B"):
        # Identical pattern, different rule.
        matcher.add(rule_name, [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    # Resolve the match ids back to rule names via the string store.
    rule_names = sorted(en_vocab.strings[match[0]] for match in matches)
    assert rule_names == ["A", "B"]
def test_issue3345():
    """Regression test for #3345: a preset entity ("New York") that crosses
    a sentence boundary must still leave B-GPE as a valid transition.
    """
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    # Put a sentence start in the middle of the entity.
    doc[4].is_sent_start = True
    gpe_pattern = {"label": "GPE", "pattern": "New York"}
    ruler = EntityRuler(nlp, patterns=[gpe_pattern])
    ner = EntityRecognizer(doc.vocab, default_ner())
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New": three "O" moves, one each for
    # "I", "live" and "in".
    state = ner.moves.init_batch([doc])[0]
    for _ in range(3):
        ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
def test_issue3410():
    """Regression test for #3410: passing n_threads to the various pipe()
    methods must trigger a deprecation warning.
    """
    nlp = English()
    texts = ["Hello world", "This is a test"]
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    # Both matcher types are fed the tokenizer's docs.
    for component in (matcher, phrasematcher):
        with pytest.deprecated_call():
            list(component.pipe(docs, n_threads=4))
def test_issue3412():
    """Regression test for #3412: Vectors.most_similar picks row 2 for a
    query equal to row 2, when the table and the query batch both also
    contain an all-zero vector.
    """
    table = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    queries = numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    vectors = Vectors(data=table)
    _keys, best_rows, _scores = vectors.most_similar(queries)
    assert best_rows[0] == 2
2019-07-10 13:49:18 +03:00
def test_issue3447():
    """Regression test for #3447: decaying(10.0, 1.0, 0.5) starts at 10.0
    and each following value is 0.5 lower than the previous one.
    """
    sizes = decaying(10.0, 1.0, 0.5)
    expected_sizes = [10.0, 10.0 - 0.5, 10.0 - 0.5 - 0.5]
    for expected in expected_sizes:
        assert next(sizes) == expected
@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    """Regression test for #3449: a single upper-case letter followed by a
    full stop ("I.") should be split so that token 5 is "I".

    Marked xfail: see the reason on the decorator.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    # NOTE(review): the first two texts are identical as written; they
    # presumably differed in whitespace after "I." at some point. Confirm
    # against the issue before relying on them as distinct cases.
    texts = [
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    ]
    docs = [nlp(text) for text in texts]
    for doc in docs:
        assert doc[5].text == "I"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
    """Regression test for #3456: piping a batch that contains an empty
    text through a tagger pipeline must not crash.
    """
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    nlp.begin_training()
    texts = ["hi", ""]
    # Consume the generator so every text is actually processed.
    list(nlp.pipe(texts))
2019-07-10 13:49:18 +03:00
def test_issue3468():
    """Regression test for #3468: sentence boundaries are stored so that
    Doc.is_sentenced can be restored after serialization.
    """

    def assert_single_sentence(doc):
        # The same three checks apply before and after the round trip.
        assert doc[0].is_sent_start
        assert doc.is_sentenced
        assert len(list(doc.sents)) == 1

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    original = nlp("Hello world")
    assert_single_sentence(original)
    restored = Doc(nlp.vocab).from_bytes(original.to_bytes())
    assert_single_sentence(restored)