mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	* Remove unicode declarations * Remove Python 3.5 and 2.7 from CI * Don't require pathlib * Replace compat helpers * Remove OrderedDict * Use f-strings * Set Cython compiler language level * Fix typo * Re-add OrderedDict for Table * Update setup.cfg * Revert CONTRIBUTING.md * Revert lookups.md * Revert top-level.md * Small adjustments and docs [ci skip]
		
			
				
	
	
		
			118 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			118 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| from ..util import get_doc
 | |
| 
 | |
| 
 | |
@pytest.fixture
def doc(en_tokenizer):
    """Return a single-sentence parsed Doc for the as_doc regression tests.

    The sentence is tokenized with ``en_tokenizer``; the token texts are then
    handed to ``get_doc`` together with hand-written heads and dependency
    labels.
    """
    sentence = "He jests at scars, that never felt a wound."
    # One (head, dep) pair per token, in token order. The head values look
    # like offsets relative to the token itself (ROOT has 0) -- confirm
    # against get_doc.
    annotations = [
        (1, "nsubj"),
        (6, "ccomp"),
        (-1, "prep"),
        (-1, "pobj"),
        (3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    tokenized = en_tokenizer(sentence)
    return get_doc(
        tokenized.vocab,
        words=[token.text for token in tokenized],
        heads=[head for head, _ in annotations],
        deps=[dep for _, dep in annotations],
    )
 | |
| 
 | |
| 
 | |
def test_issue3962(doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    A token whose head would fall outside the span gets its head set to
    itself instead; the assertions below pin the resulting heads and
    dependency labels for two spans cut out of the fixture sentence.
    """
    # "jests at scars ,"
    clause_doc = doc[1:5].as_doc()
    assert clause_doc.to_json()

    clause_expected = [
        ("jests", "dep"),  # head set to itself, being the new artificial root
        ("jests", "prep"),
        ("at", "pobj"),
        ("jests", "dep"),  # head set to the new artificial root
    ]
    for index, (head_text, dep_label) in enumerate(clause_expected):
        assert clause_doc[index].head.text == head_text
        assert clause_doc[index].dep_ == dep_label

    # We should still have 1 sentence
    assert len(list(clause_doc.sents)) == 1

    # "never felt a"
    verb_doc = doc[6:9].as_doc()
    assert verb_doc.to_json()

    verb_expected = [
        ("felt", "neg"),
        ("felt", "ROOT"),
        ("felt", "dep"),  # head set to ancestor
    ]
    for index, (head_text, dep_label) in enumerate(verb_expected):
        assert verb_doc[index].head.text == head_text
        assert verb_doc[index].dep_ == dep_label

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(verb_doc.sents)) == 1
 | |
| 
 | |
| 
 | |
@pytest.fixture
def two_sent_doc(en_tokenizer):
    """Return a two-sentence parsed Doc for the as_doc regression tests.

    The text is tokenized with ``en_tokenizer``; the token texts are then
    handed to ``get_doc`` together with hand-written heads and dependency
    labels. Each of the two sentences has its own "ROOT" token.
    """
    passage = "He jests at scars. They never felt a wound."
    # One (head, dep) pair per token, in token order. The head values look
    # like offsets relative to the token itself (ROOT has 0) -- confirm
    # against get_doc.
    annotations = [
        (1, "nsubj"),
        (0, "ROOT"),
        (-1, "prep"),
        (-1, "pobj"),
        (-3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    tokenized = en_tokenizer(passage)
    return get_doc(
        tokenized.vocab,
        words=[token.text for token in tokenized],
        heads=[head for head, _ in annotations],
        deps=[dep for _, dep in annotations],
    )
 | |
| 
 | |
| 
 | |
def test_issue3962_long(two_sent_doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    Same check as the single-sentence test, but on a span that crosses a
    sentence boundary: a token whose head would fall outside the span gets
    its head set to itself, and the sentence count must be preserved.
    """
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    # The original repeated the doc2[4] check here, leaving the last token
    # ("never") unverified; check doc2[5] as the accompanying comment intends.
    assert (
        doc2[5].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
 |