spaCy/spacy/tests/regression/test_issue3962.py

# coding: utf8
from __future__ import unicode_literals

import pytest

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
    """Return a one-sentence Doc carrying a hand-written dependency parse.

    The text is tokenized with ``en_tokenizer`` and the resulting words are
    passed to ``get_doc`` together with one head value and one dependency
    label per token.
    """
    sentence = "He jests at scars, that never felt a wound."
    # One (head, label) pair per token, in token order. The head values are
    # offsets relative to the token's own position; the "ROOT" token uses 0.
    annotations = [
        (1, "nsubj"),
        (6, "ccomp"),
        (-1, "prep"),
        (-1, "pobj"),
        (3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    head_offsets = [offset for offset, _ in annotations]
    dep_labels = [label for _, label in annotations]
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)


def test_issue3962(doc):
    """Check that ``Span.as_doc`` keeps every head inside the new Doc.

    Tokens whose head would fall outside the span must be re-attached within
    it (to themselves or to another token of the span), so that serialising
    the new Doc does not access out-of-bounds tokens and the number of
    sentences is preserved.
    """
    # Span "jests at scars ,"
    first_doc = doc[1:5].as_doc()
    assert first_doc.to_json()

    # Expected (head text, dependency label) for each token of the new Doc.
    expected_first = [
        ("jests", "dep"),  # head set to itself, being the new artificial root
        ("jests", "prep"),
        ("at", "pobj"),
        ("jests", "dep"),  # head set to the new artificial root
    ]
    for index, (head_text, dep_label) in enumerate(expected_first):
        assert first_doc[index].head.text == head_text
        assert first_doc[index].dep_ == dep_label

    # The span must still form exactly one sentence.
    first_sentences = list(first_doc.sents)
    assert len(first_sentences) == 1

    # Span "never felt a"
    second_doc = doc[6:9].as_doc()
    assert second_doc.to_json()

    expected_second = [
        ("felt", "neg"),
        ("felt", "ROOT"),
        ("felt", "dep"),  # head set to ancestor
    ]
    for index, (head_text, dep_label) in enumerate(expected_second):
        assert second_doc[index].head.text == head_text
        assert second_doc[index].dep_ == dep_label

    # Still one sentence: "a" can be attached to "felt" instead of "wound".
    second_sentences = list(second_doc.sents)
    assert len(second_sentences) == 1


@pytest.fixture
def two_sent_doc(en_tokenizer):
    """Return a two-sentence Doc carrying a hand-written dependency parse.

    Each sentence has its own "ROOT" token. The text is tokenized with
    ``en_tokenizer`` and the words are handed to ``get_doc`` along with the
    per-token head values and dependency labels.
    """
    raw_text = "He jests at scars. They never felt a wound."

    # Annotations are written per sentence and concatenated, so each list
    # holds one entry per token of the whole text.
    first_heads = [1, 0, -1, -1, -3]
    first_deps = ["nsubj", "ROOT", "prep", "pobj", "punct"]
    second_heads = [2, 1, 0, 1, -2, -3]
    second_deps = ["nsubj", "neg", "ROOT", "det", "dobj", "punct"]

    all_heads = first_heads + second_heads
    all_deps = first_deps + second_deps

    tokenized = en_tokenizer(raw_text)
    token_texts = [tok.text for tok in tokenized]
    return get_doc(tokenized.vocab, words=token_texts, heads=all_heads, deps=all_deps)


def test_issue3962_long(two_sent_doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens
    when the span crosses a sentence boundary.

    This is achieved by setting the head to itself if it would lie out of the
    span otherwise. The span covers all of the first sentence except "He" and
    only the first two tokens of the second one, so the second sentence needs
    an artificial root inside the span.
    """
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    # Sentence 1: its real root "jests" is inside the span, so the original
    # attachments and labels are expected to be kept.
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"

    # Sentence 2: the real root "felt" lies outside the span.
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    # The original repeated the doc2[4] checks here; the last token of the
    # span ("never") is the one that must hang off the artificial root.
    assert (
        doc2[5].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences 2019-07-23 19:28:55 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

			`import pytest`

			`from ..util import get_doc`


			`@pytest.fixture`
			`def doc(en_tokenizer):`
			`text = "He jests at scars, that never felt a wound."`
			`heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]`
			`deps = [`
			`"nsubj",`
			`"ccomp",`
			`"prep",`
			`"pobj",`
			`"punct",`
			`"nsubj",`
			`"neg",`
			`"ROOT",`
			`"det",`
			`"dobj",`
			`"punct",`
			`]`
			`tokens = en_tokenizer(text)`
			`return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)`


			`def test_issue3962(doc):`
			`""" Ensure that as_doc does not result in out-of-bound access of tokens.`
			`This is achieved by setting the head to itself if it would lie out of the span otherwise."""`
			`span2 = doc[1:5] # "jests at scars ,"`
			`doc2 = span2.as_doc()`
			`doc2_json = doc2.to_json()`
			`assert doc2_json`

Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert (`
			`doc2[0].head.text == "jests"`
			`) # head set to itself, being the new artificial root`
Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences 2019-07-23 19:28:55 +03:00			`assert doc2[0].dep_ == "dep"`
			`assert doc2[1].head.text == "jests"`
			`assert doc2[1].dep_ == "prep"`
			`assert doc2[2].head.text == "at"`
			`assert doc2[2].dep_ == "pobj"`
			`assert doc2[3].head.text == "jests" # head set to the new artificial root`
			`assert doc2[3].dep_ == "dep"`

			`# We should still have 1 sentence`
			`assert len(list(doc2.sents)) == 1`

			`span3 = doc[6:9] # "never felt a"`
			`doc3 = span3.as_doc()`
			`doc3_json = doc3.to_json()`
			`assert doc3_json`

			`assert doc3[0].head.text == "felt"`
			`assert doc3[0].dep_ == "neg"`
			`assert doc3[1].head.text == "felt"`
			`assert doc3[1].dep_ == "ROOT"`
			`assert doc3[2].head.text == "felt" # head set to ancestor`
			`assert doc3[2].dep_ == "dep"`

			`# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"`
			`assert len(list(doc3.sents)) == 1`


			`@pytest.fixture`
			`def two_sent_doc(en_tokenizer):`
			`text = "He jests at scars. They never felt a wound."`
			`heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]`
			`deps = [`
			`"nsubj",`
			`"ROOT",`
			`"prep",`
			`"pobj",`
			`"punct",`
			`"nsubj",`
			`"neg",`
			`"ROOT",`
			`"det",`
			`"dobj",`
			`"punct",`
			`]`
			`tokens = en_tokenizer(text)`
			`return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)`


			`def test_issue3962_long(two_sent_doc):`
			`""" Ensure that as_doc does not result in out-of-bound access of tokens.`
			`This is achieved by setting the head to itself if it would lie out of the span otherwise."""`
			`span2 = two_sent_doc[1:7] # "jests at scars. They never"`
			`doc2 = span2.as_doc()`
			`doc2_json = doc2.to_json()`
			`assert doc2_json`

Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert (`
			`doc2[0].head.text == "jests"`
			`) # head set to itself, being the new artificial root (in sentence 1)`
Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences 2019-07-23 19:28:55 +03:00			`assert doc2[0].dep_ == "ROOT"`
			`assert doc2[1].head.text == "jests"`
			`assert doc2[1].dep_ == "prep"`
			`assert doc2[2].head.text == "at"`
			`assert doc2[2].dep_ == "pobj"`
			`assert doc2[3].head.text == "jests"`
			`assert doc2[3].dep_ == "punct"`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert (`
			`doc2[4].head.text == "They"`
			`) # head set to itself, being the new artificial root (in sentence 2)`
Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences 2019-07-23 19:28:55 +03:00			`assert doc2[4].dep_ == "dep"`
Tidy up and auto-format 2019-08-20 18:36:34 +03:00			`assert (`
			`doc2[4].head.text == "They"`
			`) # head set to the new artificial head (in sentence 2)`
Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences 2019-07-23 19:28:55 +03:00			`assert doc2[4].dep_ == "dep"`

			`# We should still have 2 sentences`
			`sents = list(doc2.sents)`
			`assert len(sents) == 2`
			`assert sents[0].text == "jests at scars ."`
			`assert sents[1].text == "They never"`