# coding: utf8
from __future__ import unicode_literals

import pytest

from ..util import get_doc

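# Note on the fixtures below: get_doc is the shared test helper imported from
# spacy/tests/util.py (via ..util); it builds a Doc directly from words, heads
# and dependency labels, so no statistical parser is needed. The head
# annotations are offsets relative to each token's own index, with 0 marking a
# sentence root.
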
@pytest.fixture
def doc(en_tokenizer):
    text = "He jests at scars, that never felt a wound."
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ccomp",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)

def test_issue3962(doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json

    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1

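# Not part of the original regression test: a minimal sketch of the general
# invariant the token-by-token assertions above spell out, namely that after
# Span.as_doc() every token's head lies inside the new Doc, so serialisation
# such as to_json() cannot read out of bounds. The helper name is hypothetical.
def _heads_within_bounds(doc_like):
    return all(0 <= token.head.i < len(doc_like) for token in doc_like)

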
@pytest.fixture
def two_sent_doc(en_tokenizer):
    text = "He jests at scars. They never felt a wound."
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ROOT",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)

def test_issue3962_long(two_sent_doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    assert (
        doc2[5].head.text == "They"
    )  # head set to the new artificial root of its sentence (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
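

# A usage sketch (not part of the original test) of the call chain this
# regression test exercises; it assumes a parsed English pipeline such as
# "en_core_web_sm" is installed:
#
#     import spacy
#     nlp = spacy.load("en_core_web_sm")
#     doc = nlp("He jests at scars, that never felt a wound.")
#     doc[1:5].as_doc().to_json()  # must not access heads outside the new Doc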