mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Update tests
This commit is contained in:
		
							parent
							
								
									fd574a89c4
								
							
						
					
					
						commit
						cf33b48fe0
					
				|  | @ -207,11 +207,13 @@ def create_gold_scores( | |||
| 
 | ||||
| def spans2ints(doc): | ||||
|     """Convert doc.spans to nested list of ints for comparison. | ||||
|     The ints are token indices. | ||||
|     The ints are character indices, and the spans groups are sorted by key first. | ||||
| 
 | ||||
|     This is useful for checking consistency of predictions. | ||||
|     """ | ||||
|     out = [] | ||||
|     for key, cluster in doc.spans.items(): | ||||
|         out.append([(ss.start, ss.end) for ss in cluster]) | ||||
|     keys = sorted([key for key in doc.spans]) | ||||
|     for key in keys: | ||||
|         cluster = doc.spans[key] | ||||
|         out.append([(ss.start_char, ss.end_char) for ss in cluster]) | ||||
|     return out | ||||
|  |  | |||
|  | @ -114,13 +114,15 @@ def test_overfitting_IO(nlp): | |||
|     test_text = TRAIN_DATA[0][0] | ||||
|     doc = nlp(test_text) | ||||
| 
 | ||||
|     for i in range(1500): | ||||
|     for i in range(15): | ||||
|         losses = {} | ||||
|         nlp.update(train_examples, sgd=optimizer, losses=losses) | ||||
|         doc = nlp(test_text) | ||||
| 
 | ||||
|     # test the trained model, using the pred since it has heads | ||||
|     doc = nlp(train_examples[0].predicted) | ||||
|     # XXX This actually tests that it can overfit | ||||
|     assert spans2ints(doc) == spans2ints(train_examples[0].reference) | ||||
| 
 | ||||
|     # Also test the results are still the same after IO | ||||
|     with make_tempdir() as tmp_dir: | ||||
|  | @ -134,6 +136,7 @@ def test_overfitting_IO(nlp): | |||
|         "I noticed many friends around me", | ||||
|         "They received it. They received the SMS.", | ||||
|     ] | ||||
|     # XXX Note these have no predictions because they have no input spans | ||||
|     docs1 = list(nlp.pipe(texts)) | ||||
|     docs2 = list(nlp.pipe(texts)) | ||||
|     docs3 = [nlp(text) for text in texts] | ||||
|  | @ -175,7 +178,7 @@ def test_tokenization_mismatch(nlp): | |||
|     test_text = TRAIN_DATA[0][0] | ||||
|     doc = nlp(test_text) | ||||
| 
 | ||||
|     for i in range(100): | ||||
|     for i in range(15): | ||||
|         losses = {} | ||||
|         nlp.update(train_examples, sgd=optimizer, losses=losses) | ||||
|         doc = nlp(test_text) | ||||
|  | @ -183,12 +186,8 @@ def test_tokenization_mismatch(nlp): | |||
|     # test the trained model; need to use doc with head spans on it already | ||||
|     test_doc = train_examples[0].predicted | ||||
|     doc = nlp(test_doc) | ||||
| 
 | ||||
|     # XXX DEBUG | ||||
|     print("SPANS", len(doc.spans)) | ||||
|     for key, val in doc.spans.items(): | ||||
|         print(key, val) | ||||
|         print("...") | ||||
|     # XXX This actually tests that it can overfit | ||||
|     assert spans2ints(doc) == spans2ints(train_examples[0].reference) | ||||
| 
 | ||||
|     # Also test the results are still the same after IO | ||||
|     with make_tempdir() as tmp_dir: | ||||
|  | @ -209,5 +208,4 @@ def test_tokenization_mismatch(nlp): | |||
|     docs3 = [nlp(text) for text in texts] | ||||
|     assert spans2ints(docs1[0]) == spans2ints(docs2[0]) | ||||
|     assert spans2ints(docs1[0]) == spans2ints(docs3[0]) | ||||
|     assert False | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user