"""Tests for ``spacy.munge.read_ptb`` (tests/test_read_ptb.py).

The tests read the ``wsj_0001.parse`` and ``wsj_0003.parse`` data files that
are expected to sit in the same directory as this module.
"""
from spacy.munge import read_ptb

import pytest

from os import path

ptb_loc = path.join(path.dirname(__file__), 'wsj_0001.parse')
file3_loc = path.join(path.dirname(__file__), 'wsj_0003.parse')


def _read_text(loc):
    """Return the full contents of the file at *loc*.

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(loc) as file_:
        return file_.read()


@pytest.fixture
def ptb_text():
    """Raw text of the ``wsj_0001.parse`` data file."""
    return _read_text(ptb_loc)


@pytest.fixture
def sentence_strings(ptb_text):
    """Result of ``read_ptb.split`` applied to the ``wsj_0001.parse`` text."""
    return read_ptb.split(ptb_text)


def test_split(sentence_strings):
    """``split`` yields two sentences, each a complete bracketed tree."""
    assert len(sentence_strings) == 2
    assert sentence_strings[0].startswith('(TOP (S (NP-SBJ')
    assert sentence_strings[0].endswith('(. .)))')
    assert sentence_strings[1].startswith('(TOP (S (NP-SBJ')
    assert sentence_strings[1].endswith('(. .)))')


def test_tree_read(sentence_strings):
    """``parse`` returns the words and ``(label, start, end)`` brackets."""
    # Sentence under test:
    #   "Pierre Vinken , 61 years old , will join the board as a
    #    nonexecutive director Nov. 29 ."
    # NOTE(review): the original test built this token list but never
    # compared it with `words`; consider asserting on it once the exact
    # tokenisation returned by `read_ptb.parse` is confirmed.
    words, brackets = read_ptb.parse(sentence_strings[0])
    assert len(brackets) == 11
    starts = [start for _label, start, _end in brackets]
    ends = [end for _label, _start, end in brackets]
    # The brackets must cover the sentence from the first to the last word.
    assert min(starts) == 0
    assert max(ends) == len(words)
    # The final bracket is the sentence-level constituent.
    assert brackets[-1] == ('S', 0, len(words))
    assert ('NP-SBJ', 0, 7) in brackets


def test_traces():
    """The first sentence of ``wsj_0003.parse`` parses to 36 words.

    NOTE(review): judging by the test name, this presumably checks that
    trace elements are not counted as words -- confirm against
    ``read_ptb.parse``.
    """
    # Call ``read_ptb.split`` directly: ``sentence_strings`` is a pytest
    # fixture, and fixtures must not be invoked as plain functions.
    sent_strings = read_ptb.split(_read_text(file3_loc))
    words, _brackets = read_ptb.parse(sent_strings[0])
    assert len(words) == 36