spaCy/tests/munge/test_read_ptb.py
2015-06-07 16:49:46 +02:00

47 lines
1.3 KiB
Python

from spacy.munge import read_ptb
import pytest
from os import path
# Locations of the sample parse files that sit next to this test module.
ptb_loc, file3_loc = (
    path.join(path.dirname(__file__), file_name)
    for file_name in ('wsj_0001.parse', 'wsj_0003.parse')
)
@pytest.fixture
def ptb_text():
    """Return the entire contents of the file at ``ptb_loc`` as a string."""
    # Use a context manager so the handle is closed deterministically rather
    # than whenever the garbage collector gets to it. The former
    # ``path.join(ptb_loc)`` had a single argument and was a no-op.
    with open(ptb_loc) as file_:
        return file_.read()
@pytest.fixture
def sentence_strings(ptb_text):
    """Provide the result of ``read_ptb.split`` applied to the ``ptb_text`` fixture."""
    sentences = read_ptb.split(ptb_text)
    return sentences
def test_split(sentence_strings):
    """Expect exactly two sentences, each wrapped in the same outer brackets."""
    assert len(sentence_strings) == 2
    # Both sentences must open and close with the same bracket pattern.
    for sentence in sentence_strings:
        assert sentence.startswith('(TOP (S (NP-SBJ')
        assert sentence.endswith('(. .)))')
def test_tree_read(sentence_strings):
    """Check the brackets that ``read_ptb.parse`` yields for the first sentence.

    Each bracket is unpacked as a ``(label, start, end)`` triple.
    """
    words, brackets = read_ptb.parse(sentence_strings[0])
    assert len(brackets) == 11
    # NOTE(review): the original built the token list of
    #   "Pierre Vinken , 61 years old , will join the board as a nonexecutive
    #    director Nov. 29 ."
    # but never compared it with `words`. Presumably a check such as
    # `words == expected_tokens` was intended -- confirm what `parse` returns
    # for `words` before adding it.
    # Taken together, the brackets must span the sentence from the first word
    # to the last.
    assert min(start for _, start, _ in brackets) == 0
    assert max(end for _, _, end in brackets) == len(words)
    assert brackets[-1] == ('S', 0, len(words))
    assert ('NP-SBJ', 0, 7) in brackets
def test_traces():
    """Check the word count parsed from the first sentence of ``file3_loc``."""
    # Split the text with read_ptb.split directly: `sentence_strings` is a
    # pytest fixture, and calling a fixture function by hand is rejected by
    # current pytest releases.
    with open(file3_loc) as file_:
        sent_strings = read_ptb.split(file_.read())
    # Only the words matter here; the brackets are ignored.
    words, _ = read_ptb.parse(sent_strings[0])
    assert len(words) == 36