mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	## Description

	Related issues: #2379 (should be fixed by separating model tests)

	* **total execution time down from > 300 seconds to under 60 seconds** 🎉
	* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
	* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
	* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
	* tidied up and rewrote existing tests wherever possible

	### Todo

	- [ ] move tests to `/tests` and adjust CI commands accordingly
	- [x] move model test suite from internal repo to `spacy-models`
	- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
	- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
	- [ ] update documentation on how to run tests

	### Types of change

	enhancement, tests

	## Checklist

	<!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] -->
	- [x] I have submitted the spaCy Contributor Agreement.
	- [x] I ran the tests, and all new and existing tests passed.
	- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
		
			
				
	
	
		
			247 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			247 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import pytest
 | |
| import gc
 | |
| import numpy
 | |
| import copy
 | |
| from spacy.lang.en import English
 | |
| from spacy.lang.en.stop_words import STOP_WORDS
 | |
| from spacy.lang.lex_attrs import is_stop
 | |
| from spacy.vectors import Vectors
 | |
| from spacy.vocab import Vocab
 | |
| from spacy.language import Language
 | |
| from spacy.tokens import Doc, Span
 | |
| from spacy.pipeline import Tagger, EntityRecognizer
 | |
| from spacy.attrs import HEAD, DEP
 | |
| from spacy.matcher import Matcher
 | |
| 
 | |
| from ..util import make_tempdir
 | |
| 
 | |
| 
 | |
def test_issue1506():
    """Stream a large number of texts through ``nlp.pipe`` and read each lemma.

    Garbage collection is forced at three fixed positions in the stream; the
    test passes when every token's ``lemma_`` can still be read afterwards.
    """
    repeats = 10001
    sentences = (
        "It's sentence produced by that bug.",
        "I erase some hbdsaj lemmas.",
        "I erase lemmas.",
        "It's sentence produced by that bug.",
        "It's sentence produced by that bug.",
    )

    def string_generator():
        # Emit each sentence `repeats` times, one batch after another.
        for sentence in sentences:
            for _ in range(repeats):
                yield sentence

    nlp = English()
    gc_checkpoints = (10000, 20000, 30000)
    for index, doc in enumerate(nlp.pipe(string_generator())):
        # Cleanup has to run more than once to actually free data: the first
        # run only marks strings as "not hit".
        if index in gc_checkpoints:
            gc.collect()
        for token in doc:
            str(token.lemma_)
 | |
| 
 | |
| 
 | |
def test_issue1518():
    """Check that ``Vectors.resize()`` runs on a table that already holds a key."""
    table = Vectors(shape=(10, 10))
    table.add("hello", row=2)
    new_shape = (5, 9)
    table.resize(new_shape)
 | |
| 
 | |
| 
 | |
def test_issue1537():
    """Check that ``Span.as_doc()`` yields a ``Doc`` instead of segfaulting."""
    text = "The sky is blue . The man is pink . The dog is purple ."
    doc = Doc(Vocab(), words=text.split())
    doc[0].sent_start = True
    # A token opens a new sentence exactly when the previous token is a period.
    for token in doc[1:]:
        token.sent_start = token.nbor(-1).text == "."
    sentences = list(doc.sents)
    converted = [sentences[i].as_doc() for i in (0, 1)]
    for sentence_doc in converted:
        assert isinstance(sentence_doc, Doc)
 | |
| 
 | |
| 
 | |
| # TODO: Currently segfaulting, due to l_edge and r_edge misalignment
 | |
| #def test_issue1537_model():
 | |
| #    nlp = load_spacy('en')
 | |
| #    doc = nlp('The sky is blue. The man is pink. The dog is purple.')
 | |
| #    sents = [s.as_doc() for s in doc.sents]
 | |
| #    print(list(sents[0].noun_chunks))
 | |
| #    print(list(sents[1].noun_chunks))
 | |
| 
 | |
| 
 | |
def test_issue1539():
    """Make sure ``Vectors.resize()`` does not modify a dict while iterating it."""
    keys = [5, 3, 98, 100]
    vectors = Vectors(shape=(10, 10), keys=keys)
    vectors.resize((100, 100))
 | |
| 
 | |
| 
 | |
def test_issue1547():
    """Check that the doc still reports entities after merging tokens.

    The merged slice (tokens 5-6) overlaps the entity span (tokens 6-7).
    """
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    product_label = doc.vocab.strings["PRODUCT"]
    doc.ents = [Span(doc, 6, 8, label=product_label)]
    doc[5:7].merge()
    entity_texts = [ent.text for ent in doc.ents]
    # Only non-emptiness is asserted; reading every ent.text must not fail.
    assert entity_texts
 | |
| 
 | |
| 
 | |
def test_issue1612(en_tokenizer):
    """Check that a span's ``orth_`` is identical to its ``text``."""
    tokens = en_tokenizer("The black cat purrs.")
    middle = tokens[1:3]
    assert middle.orth_ == middle.text
 | |
| 
 | |
| 
 | |
def test_issue1654():
    """Check that ``after=`` and ``before=`` put pipes in the expected order."""
    expected = ["1", "2", "3"]

    # Front to back: each new pipe is inserted after the one added before it.
    nlp = Language(Vocab())
    assert not nlp.pipeline
    nlp.add_pipe(lambda doc: doc, name="1")
    for anchor, name in [("1", "2"), ("2", "3")]:
        nlp.add_pipe(lambda doc: doc, name=name, after=anchor)
    assert nlp.pipe_names == expected

    # Back to front: each new pipe is inserted before the one added before it.
    nlp2 = Language(Vocab())
    assert not nlp2.pipeline
    nlp2.add_pipe(lambda doc: doc, name="3")
    for anchor, name in [("3", "2"), ("2", "1")]:
        nlp2.add_pipe(lambda doc: doc, name=name, before=anchor)
    assert nlp2.pipe_names == expected
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
def test_issue1698(en_tokenizer, text):
    """Check that an e-mail address is one token and is not flagged URL-like."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    only_token = tokens[0]
    assert not only_token.like_url
 | |
| 
 | |
| 
 | |
def test_issue1727():
    """Check a tagger trained without vectors round-trips through disk.

    Vectors are attached to the vocab only after ``begin_training()``; the
    ``pretrained_dims`` setting must read as 0 both before saving and after
    loading into a fresh ``Tagger``.
    """
    vector_data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=vector_data, keys=["I", "am", "Matt"])
    tagger = Tagger(Vocab())
    tagger.add_label("PRP")
    tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        reloaded = Tagger(Vocab()).from_disk(path)
        assert reloaded.cfg.get("pretrained_dims", 0) == 0
 | |
| 
 | |
| 
 | |
def test_issue1757():
    """Check that comparing doc items against ``None`` does not segfault.

    Covers a single token, a two-token slice and a vocab entry.
    """
    doc = Doc(Vocab(), words=["a", "b", "c"])
    # NOTE: the `== None` comparisons are deliberate. The comparison operators
    # themselves are under test, so they must not be rewritten as `is None`.
    token = doc[0]
    assert not token < None
    assert not token == None
    assert token >= None
    span = doc[:2]
    assert not span < None
    assert not span == None
    assert span >= None
    entry = doc.vocab["a"]
    assert not entry == None
    assert not entry < None
 | |
| 
 | |
| 
 | |
def test_issue1758(en_tokenizer):
    """Check the English tokenizer exceptions split and annotate "would've"."""
    doc = en_tokenizer("would've")
    assert len(doc) == 2
    first, second = doc[0], doc[1]
    assert first.tag_ == "MD"
    assert second.lemma_ == "have"
 | |
| 
 | |
| 
 | |
def test_issue1799():
    """Check that a doc rebuilt from a HEAD/DEP array is a single sentence,
    even when the sentence is non-projective."""
    # One [HEAD, DEP] row per token, stored as uint64.
    # NOTE(review): the two largest head values equal 2**64 - 1 and 2**64 - 2,
    # presumably negative offsets wrapped to unsigned - confirm.
    rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    heads_deps = numpy.asarray(rows, dtype="uint64")
    words = "Just what I was looking for .".split()
    doc = Doc(Vocab(), words=words)
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)
    sentences = list(doc.sents)
    assert len(sentences) == 1
 | |
| 
 | |
| 
 | |
def test_issue1807():
    """Check that ``vocab.set_vector`` also makes the word a vocab member."""
    vocab = Vocab()
    word = "hello"
    assert word not in vocab
    vector = numpy.ones((50,), dtype="f")
    vocab.set_vector(word, vector)
    assert word in vocab
 | |
| 
 | |
| 
 | |
def test_issue1834():
    """Check that sentence starts and the parsed/tagged flags survive a
    ``to_bytes()`` / ``from_bytes()`` round trip."""
    text = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=text.split())
    doc[6].sent_start = True

    def roundtrip():
        # Serialize `doc` and load the bytes into a fresh Doc on the same vocab.
        return Doc(doc.vocab).from_bytes(doc.to_bytes())

    # Flags not set yet: only the sentence start should come back as true.
    restored = roundtrip()
    assert restored[6].sent_start
    assert not restored.is_parsed
    assert not restored.is_tagged

    # Once the flags are set, they must be restored as well.
    doc.is_parsed = True
    doc.is_tagged = True
    restored = roundtrip()
    assert restored.is_parsed
    assert restored.is_tagged
 | |
| 
 | |
| 
 | |
def test_issue1868():
    """Check that ``Vocab.__contains__`` accepts integer keys as well as strings."""
    vocab = Vocab()
    lexeme = vocab["hello"]
    # Both the integer ID and the string form of a fetched entry are members.
    for key in (lexeme.orth, lexeme.orth_):
        assert key in vocab
    unseen = "some string"
    assert unseen not in vocab
    # Adding the string to the string store must not make it a vocab member.
    unseen_id = vocab.strings.add(unseen)
    assert unseen_id not in vocab
 | |
| 
 | |
| 
 | |
def test_issue1883():
    """Check that a deep-copied ``Matcher`` still finds the original's match."""
    original = Matcher(Vocab())
    original.add("pat1", None, [{"orth": "hello"}])
    original_doc = Doc(original.vocab, words=["hello"])
    assert len(original(original_doc)) == 1

    # The copy is run against a doc built on the copy's own vocab.
    clone = copy.deepcopy(original)
    clone_doc = Doc(clone.vocab, words=["hello"])
    assert len(clone(clone_doc)) == 1
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("word", ["the"])
def test_issue1889(word):
    """Check that ``is_stop`` agrees for a word and its upper-cased form."""
    as_given = is_stop(word, STOP_WORDS)
    upper_cased = is_stop(word.upper(), STOP_WORDS)
    assert as_given == upper_cased
 | |
| 
 | |
| 
 | |
def test_issue1915():
    """Check that ``begin_training`` raises ``ValueError`` when passed
    ``hidden_depth=2`` for a pipeline containing an NER component."""
    bad_cfg = {"hidden_depth": 2}  # should error out
    nlp = Language()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.get_pipe("ner").add_label("answer")
    with pytest.raises(ValueError):
        nlp.begin_training(**bad_cfg)
 | |
| 
 | |
| 
 | |
def test_issue1945():
    """Guard against the Matcher regression introduced in v2.0.6."""
    pattern = [{"orth": "a"}, {"orth": "a"}]
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, pattern)
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    matches = matcher(doc)
    # Two overlapping matches are expected: tokens 0-2 and tokens 1-3.
    assert len(matches) == 2
    first, second = matches
    assert first[1:] == (0, 2)
    assert second[1:] == (1, 3)
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
    """Check that ``ner.moves.get_actions`` runs on a gold parse whose only
    entity annotation is ``label``; nothing is asserted beyond not raising."""
    ner = EntityRecognizer(Vocab())
    annotation = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    sentence = (annotation, None)
    gold_parses = [(None, [sentence])]
    ner.moves.get_actions(gold_parses=gold_parses)
 |