mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Merge pull request #359 from wbwseeker/reorganize_tests
Fix German noun chunker
This commit is contained in:
		
						commit
						1822bb4ff1
					
				|  | @ -32,7 +32,10 @@ def german_noun_chunks(doc): | |||
|     np_deps = set(doc.vocab.strings[label] for label in labels) | ||||
|     close_app = doc.vocab.strings['nk'] | ||||
| 
 | ||||
|     for word in doc: | ||||
|     rbracket = 0 | ||||
|     for i, word in enumerate(doc): | ||||
|         if i < rbracket: | ||||
|             continue | ||||
|         if word.pos == NOUN and word.dep in np_deps: | ||||
|             rbracket = word.i+1 | ||||
|             # try to extend the span to the right | ||||
|  |  | |||
|  | @ -225,6 +225,11 @@ cdef class Parser: | |||
|     def step_through(self, Doc doc): | ||||
|         return StepwiseState(self, doc) | ||||
| 
 | ||||
|     def from_transition_sequence(self, Doc doc, sequence): | ||||
|         with self.step_through(doc) as stepwise: | ||||
|             for transition in sequence: | ||||
|                 stepwise.transition(transition) | ||||
| 
 | ||||
|     def add_label(self, label): | ||||
|         for action in self.moves.action_types: | ||||
|             self.moves.add_action(action, label) | ||||
|  |  | |||
|  | @ -1,17 +1,15 @@ | |||
| from spacy.en import English | ||||
| 
 | ||||
| import pytest | ||||
| import os | ||||
| 
 | ||||
| import spacy | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def EN(): | ||||
|     if os.environ.get('SPACY_DATA'): | ||||
|         data_dir = os.environ.get('SPACY_DATA') | ||||
|     else: | ||||
|         data_dir = None | ||||
|     print("Load EN from %s" % data_dir) | ||||
|     return English(data_dir=data_dir) | ||||
|     return spacy.load("en") | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def DE(): | ||||
|     return spacy.load("de") | ||||
| 
 | ||||
| 
 | ||||
| def pytest_addoption(parser): | ||||
|  |  | |||
							
								
								
									
										0
									
								
								spacy/tests/integration/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/integration/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										62
									
								
								spacy/tests/integration/test_model_sanity.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								spacy/tests/integration/test_model_sanity.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,62 @@ | |||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| import pytest | ||||
| import numpy | ||||
| 
 | ||||
@pytest.mark.models
class TestModelSanity:
    """
    Smoke tests that make sure the models work as expected.

    The tests only check that annotation values are properly set, i.e. that
    the output is formally okay. They are not meant to evaluate the content
    of the output. Every test runs once per language through the
    parametrized ``example`` fixture.
    """

    @pytest.fixture(scope='class', params=['en', 'de'])
    def example(self, request, EN, DE):
        """Return the example document processed by the requested language."""
        if request.param == 'en':
            return EN(u'There was a stranger standing at the big street talking to herself.')
        elif request.param == 'de':
            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')

    def test_tokenization(self, example):
        # tokenization should split the document into tokens
        assert len(example) > 1

    def test_tagging(self, example):
        # if tagging was done properly, pos tags shouldn't be empty
        assert example.is_tagged
        assert all(t.pos != 0 for t in example)
        assert all(t.tag != 0 for t in example)

    def test_parsing(self, example):
        # if parsing was done properly
        # - dependency labels shouldn't be empty
        # - at least one token must be attached to a head other than itself
        #   (only root tokens are their own head)
        assert example.is_parsed
        assert all(t.dep != 0 for t in example)
        # compare head *positions*; the dependency label id says nothing
        # about where a token is attached
        assert any(t.head.i != t.i for t in example)

    def test_ner(self, example):
        # if ner was done properly, ent_iob shouldn't be empty
        assert all(t.ent_iob != 0 for t in example)

    def test_vectors(self, example):
        # if vectors are available, they should differ on different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        vector0 = example[0].vector
        vector1 = example[1].vector
        vector2 = example[2].vector
        assert not numpy.array_equal(vector0, vector1)
        assert not numpy.array_equal(vector0, vector2)
        assert not numpy.array_equal(vector1, vector2)

    def test_probs(self, example):
        # if frequencies/probabilities are okay, they should differ for different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        prob0 = example[0].prob
        prob1 = example[1].prob
        prob2 = example[2].prob
        assert not prob0 == prob1
        assert not prob0 == prob2
        assert not prob1 == prob2
|  | @ -2,30 +2,30 @@ from __future__ import unicode_literals | |||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.models | ||||
| def test_nsubj(EN): | ||||
|     sent = EN(u'A base phrase should be recognized.') | ||||
|     base_nps = list(sent.noun_chunks) | ||||
|     assert len(base_nps) == 1 | ||||
|     assert base_nps[0].string == 'A base phrase ' | ||||
| # @pytest.mark.models | ||||
| # def test_nsubj(EN): | ||||
| #     sent = EN(u'A base phrase should be recognized.') | ||||
| #     base_nps = list(sent.noun_chunks) | ||||
| #     assert len(base_nps) == 1 | ||||
| #     assert base_nps[0].string == 'A base phrase ' | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.models | ||||
| def test_coord(EN): | ||||
|     sent = EN(u'A base phrase and a good phrase are often the same.') | ||||
|     base_nps = list(sent.noun_chunks) | ||||
|     assert len(base_nps) == 2 | ||||
|     assert base_nps[0].string == 'A base phrase ' | ||||
|     assert base_nps[1].string == 'a good phrase ' | ||||
| # @pytest.mark.models | ||||
| # def test_coord(EN): | ||||
| #     sent = EN(u'A base phrase and a good phrase are often the same.') | ||||
| #     base_nps = list(sent.noun_chunks) | ||||
| #     assert len(base_nps) == 2 | ||||
| #     assert base_nps[0].string == 'A base phrase ' | ||||
| #     assert base_nps[1].string == 'a good phrase ' | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.models | ||||
| def test_pp(EN): | ||||
|     sent = EN(u'A phrase with another phrase occurs') | ||||
|     base_nps = list(sent.noun_chunks) | ||||
|     assert len(base_nps) == 2 | ||||
|     assert base_nps[0].string == 'A phrase ' | ||||
|     assert base_nps[1].string == 'another phrase '  | ||||
| # @pytest.mark.models | ||||
| # def test_pp(EN): | ||||
| #     sent = EN(u'A phrase with another phrase occurs') | ||||
| #     base_nps = list(sent.noun_chunks) | ||||
| #     assert len(base_nps) == 2 | ||||
| #     assert base_nps[0].string == 'A phrase ' | ||||
| #     assert base_nps[1].string == 'another phrase '  | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.models | ||||
|  |  | |||
							
								
								
									
										0
									
								
								spacy/tests/unit/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/unit/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										138
									
								
								spacy/tests/unit/test_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										138
									
								
								spacy/tests/unit/test_parser.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,138 @@ | |||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| import numpy | ||||
| 
 | ||||
| from spacy.attrs import HEAD, DEP | ||||
| 
 | ||||
| 
 | ||||
def _parsed_doc(nlp, words, tags, arcs):
    """Build a document with hand-written tags and dependency parse.

    nlp   -- language object providing ``tokenizer``, ``tagger`` and ``vocab``
    words -- the sentence, pre-tokenized, tokens separated by single spaces
    tags  -- fine-grained tags, one per token, separated by single spaces
    arcs  -- one ``(head_offset, dep_label)`` pair per token; the offset is
             relative to the token's own position and 0 marks the root
    """
    doc = nlp.tokenizer.tokens_from_list(words.split(' '))
    nlp.tagger.tag_from_strings(doc, tags.split(' '))
    strings = nlp.vocab.strings
    # from_array expects integer ids, so map every label through the vocab
    rows = [[offset, strings[label]] for offset, label in arcs]
    doc.from_array([HEAD, DEP], numpy.asarray(rows, dtype='int32'))
    return doc


@pytest.mark.models
class TestNounChunks:
    """Noun chunk extraction on documents with a fixed, hand-written parse."""

    @pytest.fixture(scope="class")
    def ex1_en(self, EN):
        return _parsed_doc(
            EN,
            'A base phrase should be recognized .',
            'DT NN NN MD VB VBN .',
            [
                (2, 'det'),
                (1, 'compound'),
                (3, 'nsubjpass'),
                (2, 'aux'),
                (1, 'auxpass'),
                (0, 'root'),
                (-1, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex2_en(self, EN):
        return _parsed_doc(
            EN,
            'A base phrase and a good phrase are often the same .',
            'DT NN NN CC DT JJ NN VBP RB DT JJ .',
            [
                (2, 'det'),
                (1, 'compound'),
                (5, 'nsubj'),
                (-1, 'cc'),
                # the determiner attaches to the noun "phrase", not to "good"
                (2, 'det'),
                (1, 'amod'),
                (-4, 'conj'),
                (0, 'root'),
                (-1, 'advmod'),
                (1, 'det'),
                (-3, 'attr'),
                (-4, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex3_en(self, EN):
        return _parsed_doc(
            EN,
            'A phrase with another phrase occurs .',
            'DT NN IN DT NN VBZ .',
            [
                (1, 'det'),
                (4, 'nsubj'),
                (-1, 'prep'),
                (1, 'det'),
                (-2, 'pobj'),
                (0, 'root'),
                (-1, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex1_de(self, DE):
        return _parsed_doc(
            DE,
            'Eine Tasse steht auf dem Tisch .',
            'ART NN VVFIN APPR ART NN $.',
            [
                (1, 'nk'),
                (1, 'sb'),
                (0, 'root'),
                (-1, 'mo'),
                (1, 'nk'),
                (-2, 'nk'),
                # sentence-final punctuation attaches to the verb "steht"
                (-4, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex2_de(self, DE):
        return _parsed_doc(
            DE,
            'Die Sängerin singt mit einer Tasse Kaffee Arien .',
            'ART NN VVFIN APPR ART NN NN NN $.',
            [
                (1, 'nk'),
                (1, 'sb'),
                (0, 'root'),
                (-1, 'mo'),
                (1, 'nk'),
                (-2, 'nk'),
                (-1, 'nk'),
                (-5, 'oa'),
                (-6, 'punct'),
            ])

    def test_en_standard_chunk(self, ex1_en):
        chunks = list(ex1_en.noun_chunks)
        assert len(chunks) == 1
        assert chunks[0].string == 'A base phrase '

    def test_en_coordinated_chunks(self, ex2_en):
        chunks = list(ex2_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A base phrase '
        assert chunks[1].string == 'a good phrase '

    def test_en_pp_chunks(self, ex3_en):
        chunks = list(ex3_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A phrase '
        assert chunks[1].string == 'another phrase '

    def test_de_standard_chunk(self, ex1_de):
        chunks = list(ex1_de.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'Eine Tasse '
        assert chunks[1].string == 'dem Tisch '

    def test_de_extended_chunk(self, ex2_de):
        # "Kaffee" is a close apposition of "Tasse" and belongs to its chunk
        chunks = list(ex2_de.noun_chunks)
        assert len(chunks) == 3
        assert chunks[0].string == 'Die Sängerin '
        assert chunks[1].string == 'einer Tasse Kaffee '
        assert chunks[2].string == 'Arien '
		Loading…
	
		Reference in New Issue
	
	Block a user