Modernise Doc parse tree navigation tests and don't depend on models

This commit is contained in:
Ines Montani 2017-01-11 21:14:15 +01:00
parent 7262421bb2
commit 55d151aa61

View File

@ -1,35 +1,74 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from os import path
import io from ..util import get_doc
import pytest import pytest
@pytest.fixture @pytest.fixture
def sun_text(): def text():
with io.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', return u"""
encoding='utf8') as file_: It was a bright cold day in April, and the clocks were striking thirteen.
text = file_.read() Winston Smith, his chin nuzzled into his breast in an effort to escape the
return text vile wind, slipped quickly through the glass doors of Victory Mansions,
though not quickly enough to prevent a swirl of gritty dust from entering
along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a
coloured poster, too large for indoor display, had been tacked to the wall.
It depicted simply an enormous face, more than a metre wide: the face of a
man of about forty-five, with a heavy black moustache and ruggedly handsome
features. Winston made for the stairs. It was no use trying the lift. Even at
the best of times it was seldom working, and at present the electric current
was cut off during daylight hours. It was part of the economy drive in
preparation for Hate Week. The flat was seven flights up, and Winston, who
was thirty-nine and had a varicose ulcer above his right ankle, went slowly,
resting several times on the way. On each landing, opposite the lift-shaft,
the poster with the enormous face gazed from the wall. It was one of those
pictures which are so contrived that the eyes follow you about when you move.
BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
"""
@pytest.mark.models @pytest.fixture
def test_consistency(EN, sun_text): def heads():
tokens = EN(sun_text) return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
for head in tokens: -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-1, -8, -9, -1]
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
for head in doc:
for child in head.lefts: for child in head.lefts:
assert child.head is head assert child.head is head
for child in head.rights: for child in head.rights:
assert child.head is head assert child.head is head
@pytest.mark.models def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
def test_child_consistency(EN, sun_text): tokens = en_tokenizer(text)
tokens = EN(sun_text) doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
lefts = {} lefts = {}
rights = {} rights = {}
for head in tokens: for head in doc:
assert head.i not in lefts assert head.i not in lefts
lefts[head.i] = set() lefts[head.i] = set()
for left in head.lefts: for left in head.lefts:
@ -38,10 +77,10 @@ def test_child_consistency(EN, sun_text):
rights[head.i] = set() rights[head.i] = set()
for right in head.rights: for right in head.rights:
rights[head.i].add(right.i) rights[head.i].add(right.i)
for head in tokens: for head in doc:
assert head.n_rights == len(rights[head.i]) assert head.n_rights == len(rights[head.i])
assert head.n_lefts == len(lefts[head.i]) assert head.n_lefts == len(lefts[head.i])
for child in tokens: for child in doc:
if child.i < child.head.i: if child.i < child.head.i:
assert child.i in lefts[child.head.i] assert child.i in lefts[child.head.i]
assert child.i not in rights[child.head.i] assert child.i not in rights[child.head.i]
@ -56,12 +95,12 @@ def test_child_consistency(EN, sun_text):
assert not children assert not children
@pytest.mark.models def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
def test_edges(EN, sun_text): tokens = en_tokenizer(text)
tokens = EN(sun_text) doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
for token in tokens: for token in doc:
subtree = list(token.subtree) subtree = list(token.subtree)
debug = '\t'.join((token.orth_, token.left_edge.orth_, subtree[0].orth_)) debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text))
assert token.left_edge == subtree[0], debug assert token.left_edge == subtree[0], debug
debug = '\t'.join((token.orth_, token.right_edge.orth_, subtree[-1].orth_, token.right_edge.head.orth_)) debug = '\t'.join((token.text, token.right_edge.text, subtree[-1].text, token.right_edge.head.text))
assert token.right_edge == subtree[-1], debug assert token.right_edge == subtree[-1], debug