# spaCy/tests/test_parse_navigate.py

from __future__ import unicode_literals
from os import path
import codecs

from spacy.en import English

import pytest


@pytest.fixture
def sun_text():
    """Return the full contents of ``sun.txt``, decoded as UTF-8.

    The file is looked up in the directory that contains this test module.
    """
    here = path.dirname(__file__)
    with codecs.open(path.join(here, 'sun.txt'), 'r', 'utf8') as handle:
        return handle.read()


@pytest.fixture(scope='module')
def nlp():
    """Provide one ``English`` instance shared by every test in this module.

    With module scope ``English()`` is constructed once instead of once per
    test (presumably an expensive model load -- confirm). Sharing is safe
    here because the tests in this module only call the instance to parse
    text and then read attributes of the resulting tokens.
    """
    return English()


def test_consistency(nlp, sun_text):
    """Check that every child reached from a head points back to that head.

    For each token in the parsed ``sun_text``, every token yielded by its
    ``lefts`` and ``rights`` must report that token as its ``head``.
    """
    tokens = nlp(sun_text)
    for head in tokens:
        for child in head.lefts:
            # Compare with ``==`` rather than ``is``: the property under test
            # is which token is the head, not whether the library hands back
            # the very same Python object on every access. This also matches
            # how ``test_edges`` compares tokens.
            assert child.head == head
        for child in head.rights:
            assert child.head == head


def test_child_consistency(nlp, sun_text):
    """Cross-check ``lefts``/``rights`` and their counts against ``head``.

    Steps:
      1. Collect, per token index, the indices yielded by ``lefts`` and by
         ``rights``.
      2. Check ``n_rights`` and ``n_lefts`` equal the sizes of those sets.
      3. Treat every token as a child and strike it from its head's set on
         the side given by comparing the two indices; it must be present on
         that side and absent from the other.
      4. Every set must end up empty, i.e. each listed child really named
         that token as its head.
    """
    doc = nlp(sun_text)

    left_kids = {}
    right_kids = {}
    for token in doc:
        idx = token.i
        # Each token index must be seen exactly once while iterating.
        assert idx not in left_kids
        left_kids[idx] = {kid.i for kid in token.lefts}
        assert idx not in right_kids
        right_kids[idx] = {kid.i for kid in token.rights}

    for token in doc:
        assert token.n_rights == len(right_kids[token.i])
        assert token.n_lefts == len(left_kids[token.i])

    for token in doc:
        own = token.i
        parent = token.head.i
        if own < parent:
            assert own in left_kids[parent]
            assert own not in right_kids[parent]
            left_kids[parent].remove(own)
        elif own > parent:
            assert own in right_kids[parent]
            assert own not in left_kids[parent]
            right_kids[parent].remove(own)
        # own == parent: the token is its own head, so there is nothing to strike.

    for leftover in left_kids.values():
        assert not leftover
    for leftover in right_kids.values():
        assert not leftover


def test_edges(nlp):
    """Check ``left_edge``/``right_edge`` against the ends of ``subtree``.

    For every token of one fixed sentence, ``left_edge`` must equal the first
    token yielded by ``subtree`` and ``right_edge`` must equal the last one.
    The assertion messages carry the tab-separated surface forms involved.
    """
    sentence = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
    for word in nlp(sentence):
        span = list(word.subtree)
        first = span[0]
        last = span[-1]

        left_info = '\t'.join([word.orth_, word.left_edge.orth_, first.orth_])
        assert word.left_edge == first, left_info

        right_info = '\t'.join([
            word.orth_,
            word.right_edge.orth_,
            last.orth_,
            word.right_edge.head.orth_,
        ])
        assert word.right_edge == last, right_info
* Add test for parse tree navigation 2015-01-30 10:02:58 +03:00
			`from __future__ import unicode_literals`
			`from os import path`
			`import codecs`

			`from spacy.en import English`

			`import pytest`


			`@pytest.fixture`
			`def sun_text():`
			`with codecs.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', 'utf8') as file_:`
			`text = file_.read()`
			`return text`


			`@pytest.fixture`
			`def nlp():`
			`return English()`


			`def test_consistency(nlp, sun_text):`
			`tokens = nlp(sun_text)`
			`for head in tokens:`
			`for child in head.lefts:`
			`assert child.head is head`
			`for child in head.rights:`
			`assert child.head is head`
* Extend parse tree navigation tests 2015-02-08 02:28:45 +03:00

			`def test_child_consistency(nlp, sun_text):`
			`tokens = nlp(sun_text)`

			`lefts = {}`
			`rights = {}`
			`for head in tokens:`
			`assert head.i not in lefts`
			`lefts[head.i] = set()`
			`for left in head.lefts:`
			`lefts[head.i].add(left.i)`
			`assert head.i not in rights`
			`rights[head.i] = set()`
			`for right in head.rights:`
			`rights[head.i].add(right.i)`
			`for head in tokens:`
			`assert head.n_rights == len(rights[head.i])`
			`assert head.n_lefts == len(lefts[head.i])`
			`for child in tokens:`
			`if child.i < child.head.i:`
			`assert child.i in lefts[child.head.i]`
			`assert child.i not in rights[child.head.i]`
			`lefts[child.head.i].remove(child.i)`
			`elif child.i > child.head.i:`
			`assert child.i in rights[child.head.i]`
			`assert child.i not in lefts[child.head.i]`
			`rights[child.head.i].remove(child.i)`
			`for head_index, children in lefts.items():`
			`assert not children`
			`for head_index, children in rights.items():`
			`assert not children`
* Add test for right_edge and left_edge 2015-04-29 23:08:27 +03:00

			`def test_edges(nlp):`
			`sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."`
			`tokens = nlp(sun_text)`
			`for token in tokens:`
			`subtree = list(token.subtree)`
			`debug = '\t'.join((token.orth_, token.left_edge.orth_, subtree[0].orth_))`
			`assert token.left_edge == subtree[0], debug`
			`debug = '\t'.join((token.orth_, token.right_edge.orth_, subtree[-1].orth_, token.right_edge.head.orth_))`
			`assert token.right_edge == subtree[-1], debug`