spaCy/spacy/tests/regression/test_issue1501-2000.py

247 lines
7.8 KiB
Python
Raw Normal View History

💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-07-25 00:38:44 +03:00
# coding: utf8
from __future__ import unicode_literals
import pytest
import gc
import numpy
import copy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher
from ..util import make_tempdir
def test_issue1506():
    """Stream a large number of texts through nlp.pipe() while forcing GC.

    Every token's lemma is read, and the garbage collector is triggered at
    three fixed positions in the stream.
    """
    repeats = 10001
    sentences = (
        "It's sentence produced by that bug.",
        "I erase some hbdsaj lemmas.",
        "I erase lemmas.",
        "It's sentence produced by that bug.",
        "It's sentence produced by that bug.",
    )
    gc_points = (10000, 20000, 30000)

    def string_generator():
        # Emit each sentence `repeats` times, one batch after the other.
        for sentence in sentences:
            for _ in range(repeats):
                yield sentence

    nlp = English()
    for index, doc in enumerate(nlp.pipe(string_generator())):
        # Cleanup has to run more than once to actually free the data:
        # the first run only marks strings as "not hit".
        if index in gc_points:
            gc.collect()
        for token in doc:
            str(token.lemma_)
def test_issue1518():
    """Test vectors.resize() works."""
    table = Vectors(shape=(10, 10))
    table.add('hello', row=2)
    # Shrink both dimensions after a key has been assigned to a row.
    smaller_shape = (5, 9)
    table.resize(smaller_shape)
def test_issue1537():
    """Test that Span.as_doc() doesn't segfault."""
    words = 'The sky is blue . The man is pink . The dog is purple .'.split()
    doc = Doc(Vocab(), words=words)
    doc[0].sent_start = True
    # A token opens a new sentence exactly when the token before it is '.'.
    for token in doc[1:]:
        token.sent_start = token.nbor(-1).text == '.'
    sentences = list(doc.sents)
    converted = [sentences[0].as_doc(), sentences[1].as_doc()]
    for sub_doc in converted:
        assert isinstance(sub_doc, Doc)
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
#def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
# sents = [s.as_doc() for s in doc.sents]
# print(list(sents[0].noun_chunks))
# print(list(sents[1].noun_chunks))
def test_issue1539():
    """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
    keys = [5, 3, 98, 100]
    store = Vectors(shape=(10, 10), keys=keys)
    # Grow the table while keys are already registered.
    store.resize((100, 100))
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = [
        '\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n',
    ]
    doc = Doc(Vocab(), words=words)
    label = doc.vocab.strings['PRODUCT']
    doc.ents = [Span(doc, 6, 8, label=label)]
    # Merge a span that overlaps the start of the entity set above.
    doc[5:7].merge()
    entity_texts = [ent.text for ent in doc.ents]
    # Only checks that at least one entity is still present after the merge.
    assert entity_texts
def test_issue1612(en_tokenizer):
    """Check that a span's orth_ is equal to its text."""
    doc = en_tokenizer('The black cat purrs.')
    sub_span = doc[1:3]
    orth, text = sub_span.orth_, sub_span.text
    assert orth == text
def test_issue1654():
    """Check that add_pipe() places components correctly via `after`/`before`."""

    def passthrough(doc):
        # Dummy component: hands the doc back unchanged.
        return doc

    expected_order = ['1', '2', '3']

    # Build the pipeline front to back using `after`.
    forward = Language(Vocab())
    assert not forward.pipeline
    forward.add_pipe(passthrough, name='1')
    forward.add_pipe(passthrough, name='2', after='1')
    forward.add_pipe(passthrough, name='3', after='2')
    assert forward.pipe_names == expected_order

    # Build the same pipeline back to front using `before`.
    backward = Language(Vocab())
    assert not backward.pipeline
    backward.add_pipe(passthrough, name='3')
    backward.add_pipe(passthrough, name='2', before='3')
    backward.add_pipe(passthrough, name='1', before='2')
    assert backward.pipe_names == expected_order
@pytest.mark.parametrize('text', [
    'test@example.com',
    'john.doe@example.co.uk',
])
def test_issue1698(en_tokenizer, text):
    """Check that an email address is one token and is not flagged like_url."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    only_token = tokens[0]
    assert not only_token.like_url
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    keys = ['I', 'am', 'Matt']
    vectors = Vectors(data=numpy.ones((3, 300), dtype='f'), keys=keys)
    tagger = Tagger(Vocab())
    tagger.add_label('PRP')
    tagger.begin_training()
    assert tagger.cfg.get('pretrained_dims', 0) == 0
    # Attach vectors only after training has begun, then round-trip to disk.
    tagger.vocab.vectors = vectors
    with make_tempdir() as tmp_dir:
        tagger.to_disk(tmp_dir)
        reloaded = Tagger(Vocab()).from_disk(tmp_dir)
        assert reloaded.cfg.get('pretrained_dims', 0) == 0
def test_issue1757():
    """Test comparison against None doesn't cause segfault."""
    doc = Doc(Vocab(), words=['a', 'b', 'c'])
    # The explicit comparisons with None are the point of this test: they go
    # through the objects' comparison operators, so they must not be
    # rewritten as `is None`.
    for obj in (doc[0], doc[:2]):
        assert not obj < None
        assert not obj == None  # noqa: E711
        assert obj >= None
    lexeme = doc.vocab['a']
    assert not lexeme == None  # noqa: E711
    assert not lexeme < None
def test_issue1758(en_tokenizer):
    """Test that "would've" is handled by the English tokenizer exceptions."""
    doc = en_tokenizer("would've")
    assert len(doc) == 2
    first, second = doc[0], doc[1]
    assert first.tag_ == "MD"
    assert second.lemma_ == "have"
def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    words = 'Just what I was looking for .'.split()
    # One [HEAD, DEP] row per word. The last two head values equal
    # 2**64 - 1 and 2**64 - 2; presumably these are -1 and -2 wrapped into
    # uint64 (TODO confirm against Doc.from_array).
    rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    heads_deps = numpy.asarray(rows, dtype='uint64')
    doc = Doc(Vocab(), words=words)
    doc.vocab.strings.add('ROOT')
    doc = doc.from_array([HEAD, DEP], heads_deps)
    sentences = list(doc.sents)
    assert len(sentences) == 1
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    word = 'hello'
    vocab = Vocab()
    assert word not in vocab
    vector = numpy.ones((50,), dtype='f')
    vocab.set_vector(word, vector)
    assert word in vocab
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""

    def roundtrip(source):
        # Serialize `source` to bytes and load it into a fresh Doc that
        # shares the same vocab.
        return Doc(source.vocab).from_bytes(source.to_bytes())

    words = "This is a first sentence . And another one".split()
    original = Doc(Vocab(), words=words)
    original[6].sent_start = True

    # First pass: the boundary survives and the flags are still unset.
    restored = roundtrip(original)
    assert restored[6].sent_start
    assert not restored.is_parsed
    assert not restored.is_tagged

    # Second pass: flags set on the source must be carried over too.
    original.is_parsed = True
    original.is_tagged = True
    restored = roundtrip(original)
    assert restored.is_parsed
    assert restored.is_tagged
def test_issue1868():
    """Test Vocab.__contains__ works with int keys."""
    vocab = Vocab()
    lexeme = vocab['hello']
    # The entry created by the lookup above is found via orth and orth_.
    assert lexeme.orth in vocab
    assert lexeme.orth_ in vocab
    unseen = 'some string'
    assert unseen not in vocab
    # Adding the string to the string store alone must not make its ID a
    # member of the vocab.
    unseen_id = vocab.strings.add(unseen)
    assert unseen_id not in vocab
def test_issue1883():
    """Check that a deep copy of a Matcher still finds its pattern."""
    pattern = [{'orth': 'hello'}]
    matcher = Matcher(Vocab())
    matcher.add('pat1', None, pattern)
    doc = Doc(matcher.vocab, words=['hello'])
    assert len(matcher(doc)) == 1

    # The copy is run on a doc built from the copy's own vocab.
    cloned = copy.deepcopy(matcher)
    cloned_doc = Doc(cloned.vocab, words=['hello'])
    assert len(cloned(cloned_doc)) == 1
@pytest.mark.parametrize('word', ['the'])
def test_issue1889(word):
    """Check that is_stop() agrees for a word and its upper-cased form."""
    as_given = is_stop(word, STOP_WORDS)
    upper_cased = is_stop(word.upper(), STOP_WORDS)
    assert as_given == upper_cased
def test_issue1915():
    """Check that begin_training() raises ValueError for a 'hidden_depth' config."""
    nlp = Language()
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    nlp.get_pipe('ner').add_label('answer')
    bad_cfg = {'hidden_depth': 2}  # should error out
    with pytest.raises(ValueError):
        nlp.begin_training(**bad_cfg)
def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    pattern = [{'orth': 'a'}, {'orth': 'a'}]
    matcher = Matcher(Vocab())
    matcher.add('MWE', None, pattern)
    doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
    matches = matcher(doc)
    # Two overlapping matches are expected here.
    assert len(matches) == 2
    first, second = matches
    # Drop the leading element of each match and compare the remaining pair.
    assert first[1:] == (0, 2)
    assert second[1:] == (1, 3)
@pytest.mark.parametrize('label', ['U-JOB-NAME'])
def test_issue1967(label):
    """Run get_actions() on gold parses that contain the given label.

    There is no assertion: the test passes if the call does not raise.
    """
    ner = EntityRecognizer(Vocab())
    # Build the nested gold-parse structure step by step.
    annotation = ([0], ['word'], ['tag'], [0], ['dep'], [label])
    inner = (annotation, None)
    gold_parses = [(None, [inner])]
    ner.moves.get_actions(gold_parses=gold_parses)