Modernise Doc parse tree navigation tests and don't depend on models

This commit is contained in:
Ines Montani 2017-01-11 21:14:15 +01:00
parent 7262421bb2
commit 55d151aa61

View File

@ -1,35 +1,74 @@
# coding: utf-8
from __future__ import unicode_literals
from os import path
import io
from ..util import get_doc
import pytest
@pytest.fixture
def sun_text():
    """Return the full contents of ``sun.txt`` as text.

    The file is looked up one directory above the directory containing this
    module and is decoded as UTF-8.
    """
    sun_path = path.join(path.dirname(__file__), '..', 'sun.txt')
    with io.open(sun_path, 'r', encoding='utf8') as handle:
        return handle.read()
def text():
    """Return the fixed English passage used as input by the tests here.

    The tests tokenize this string and hand the token texts to ``get_doc``
    together with the ``heads`` list, so the passage is kept verbatim.
    """
    passage = u"""
It was a bright cold day in April, and the clocks were striking thirteen.
Winston Smith, his chin nuzzled into his breast in an effort to escape the
vile wind, slipped quickly through the glass doors of Victory Mansions,
though not quickly enough to prevent a swirl of gritty dust from entering
along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a
coloured poster, too large for indoor display, had been tacked to the wall.
It depicted simply an enormous face, more than a metre wide: the face of a
man of about forty-five, with a heavy black moustache and ruggedly handsome
features. Winston made for the stairs. It was no use trying the lift. Even at
the best of times it was seldom working, and at present the electric current
was cut off during daylight hours. It was part of the economy drive in
preparation for Hate Week. The flat was seven flights up, and Winston, who
was thirty-nine and had a varicose ulcer above his right ankle, went slowly,
resting several times on the way. On each landing, opposite the lift-shaft,
the poster with the enormous face gazed from the wall. It was one of those
pictures which are so contrived that the eyes follow you about when you move.
BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
"""
    return passage
@pytest.mark.models
def test_consistency(EN, sun_text):
tokens = EN(sun_text)
for head in tokens:
@pytest.fixture
def heads():
    """Return the list of integer head values passed to ``get_doc``.

    The tests supply this list as the ``heads`` argument alongside the token
    texts of the ``text`` passage.
    NOTE(review): the values look like per-token offsets to the syntactic
    head (0 for a root) -- confirm against ``get_doc`` in the test utils.
    """
    head_values = [
        1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
        -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
        -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
        1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
        0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
        9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
        2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
        3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
        -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
        -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
        -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
        1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
        1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
        -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
        0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
        1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
        -1, -8, -9, -1]
    return head_values
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
    """Check that every left and right child points back at its head.

    A doc is built from the tokenized ``text`` and the ``heads`` list; for
    each token in it, every item in ``lefts`` and ``rights`` must report that
    same token object as its ``head``.
    """
    tokenized = en_tokenizer(text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=heads)
    for parent in doc:
        for left_child in parent.lefts:
            assert left_child.head is parent
        for right_child in parent.rights:
            assert right_child.head is parent
@pytest.mark.models
def test_child_consistency(EN, sun_text):
tokens = EN(sun_text)
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
lefts = {}
rights = {}
for head in tokens:
for head in doc:
assert head.i not in lefts
lefts[head.i] = set()
for left in head.lefts:
@ -38,10 +77,10 @@ def test_child_consistency(EN, sun_text):
rights[head.i] = set()
for right in head.rights:
rights[head.i].add(right.i)
for head in tokens:
for head in doc:
assert head.n_rights == len(rights[head.i])
assert head.n_lefts == len(lefts[head.i])
for child in tokens:
for child in doc:
if child.i < child.head.i:
assert child.i in lefts[child.head.i]
assert child.i not in rights[child.head.i]
@ -56,12 +95,12 @@ def test_child_consistency(EN, sun_text):
assert not children
@pytest.mark.models
def test_edges(EN, sun_text):
tokens = EN(sun_text)
for token in tokens:
def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
    """Check that each token's edges coincide with the ends of its subtree.

    A doc is built from the tokenized ``text`` and the ``heads`` list. For
    every token, ``left_edge`` must equal the first item yielded by
    ``subtree`` and ``right_edge`` must equal the last one.
    """
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    for token in doc:
        subtree = list(token.subtree)
        # Tab-separated token texts, used as the assertion failure message.
        debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text))
        assert token.left_edge == subtree[0], debug
        debug = '\t'.join((token.text, token.right_edge.text,
                           subtree[-1].text, token.right_edge.head.text))
        assert token.right_edge == subtree[-1], debug