mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 13:47:13 +03:00
ba02957c80
* failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences
113 lines
3.4 KiB
Python
113 lines
3.4 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
from ..util import get_doc
|
|
|
|
|
|
@pytest.fixture
def doc(en_tokenizer):
    """Return a single-sentence Doc with a hand-written dependency parse.

    The sentence is tokenized with ``en_tokenizer`` and the resulting words
    are passed to ``get_doc`` together with the head/dep annotations below.
    """
    sentence = "He jests at scars, that never felt a wound."
    # One (head, dep) pair per token, in token order. The head values look
    # like offsets relative to the token's own position (the ROOT token
    # has 0) -- NOTE(review): confirm against get_doc.
    annotations = [
        (1, "nsubj"),
        (6, "ccomp"),
        (-1, "prep"),
        (-1, "pobj"),
        (3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    head_offsets = [head for head, _ in annotations]
    dep_labels = [label for _, label in annotations]
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)
|
|
|
|
|
|
def test_issue3962(doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    A token whose head would lie outside of the span gets a head inside the
    span instead (possibly itself) and the generic "dep" label.
    """
    # Span text: "jests at scars ,"
    first_doc = doc[1:5].as_doc()
    assert first_doc.to_json()

    # Expected (head text, dep label) for every token of the new Doc.
    expected_first = [
        ("jests", "dep"),  # head set to itself, being the new artificial root
        ("jests", "prep"),
        ("at", "pobj"),
        ("jests", "dep"),  # head set to the new artificial root
    ]
    # Index explicitly (rather than zip over the Doc) so that a Doc that is
    # too short still raises instead of silently skipping expectations.
    for index, (head_text, dep_label) in enumerate(expected_first):
        assert first_doc[index].head.text == head_text
        assert first_doc[index].dep_ == dep_label

    # We should still have 1 sentence
    assert len(list(first_doc.sents)) == 1

    # Span text: "never felt a"
    second_doc = doc[6:9].as_doc()
    assert second_doc.to_json()

    expected_second = [
        ("felt", "neg"),
        ("felt", "ROOT"),
        ("felt", "dep"),  # head set to ancestor
    ]
    for index, (head_text, dep_label) in enumerate(expected_second):
        assert second_doc[index].head.text == head_text
        assert second_doc[index].dep_ == dep_label

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(second_doc.sents)) == 1
|
|
|
|
|
|
@pytest.fixture
def two_sent_doc(en_tokenizer):
    """Return a two-sentence Doc with a hand-written dependency parse.

    The text is tokenized with ``en_tokenizer`` and the resulting words are
    passed to ``get_doc`` together with the head/dep annotations below. Each
    sentence has its own "ROOT" token.
    """
    sentences = "He jests at scars. They never felt a wound."
    # One (head, dep) pair per token, in token order. The head values look
    # like offsets relative to the token's own position (ROOT tokens
    # have 0) -- NOTE(review): confirm against get_doc.
    annotations = [
        (1, "nsubj"),
        (0, "ROOT"),
        (-1, "prep"),
        (-1, "pobj"),
        (-3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    head_offsets = [head for head, _ in annotations]
    dep_labels = [label for _, label in annotations]
    tokenized = en_tokenizer(sentences)
    words = [token.text for token in tokenized]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)
|
|
|
|
|
|
def test_issue3962_long(two_sent_doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    The span crosses a sentence boundary. Every token whose head would lie
    outside of the span must end up with a head inside the span (possibly
    itself), and the number of sentences must be preserved.
    """
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    # Sentence 1: "jests" is the original ROOT and lies inside the span, so
    # the annotations of the first four tokens are kept as they were.
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"

    # Sentence 2: the original head of "They" ("felt") is outside the span,
    # so the head is set to itself, being the new artificial root.
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # The original head of "never" ("felt") is outside the span as well, so
    # the head is set to the new artificial root of sentence 2. These two
    # asserts used to re-check doc2[4], leaving the last token untested.
    assert doc2[5].head.text == "They"
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
|