Fix test formatting and consistency

This commit is contained in:
Ines Montani 2017-01-14 13:41:19 +01:00
parent 3424e3a7e5
commit a89e269a5a
4 changed files with 16 additions and 16 deletions

View File

@@ -193,7 +193,7 @@ def test_doc_api_runtime_error(en_tokenizer):
def test_doc_api_right_edge(en_tokenizer): def test_doc_api_right_edge(en_tokenizer):
# Test for bug occurring from Unshift action, causing incorrect right edge """Test for bug occurring from Unshift action, causing incorrect right edge"""
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
@@ -202,7 +202,8 @@ def test_doc_api_right_edge(en_tokenizer):
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
assert doc[6].text == 'for' assert doc[6].text == 'for'
subtree = [w.text for w in doc[6].subtree] subtree = [w.text for w in doc[6].subtree]
assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ','] assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
assert doc[6].right_edge.text == ',' assert doc[6].right_edge.text == ','

View File

@@ -10,9 +10,6 @@ from ...util import compile_prefix_regex
from ...language_data import TOKENIZER_PREFIXES from ...language_data import TOKENIZER_PREFIXES
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
PUNCT_OPEN = ['(', '[', '{', '*'] PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*'] PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
@@ -99,7 +96,8 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"]) @pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text): def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
punct_close, text):
tokens = en_tokenizer(punct_open + text + punct_close) tokens = en_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3 assert len(tokens) == 3
assert tokens[0].text == punct_open assert tokens[0].text == punct_open
@@ -108,20 +106,22 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")]) @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"]) @pytest.mark.parametrize('text', ["Hello"])
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text): def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add) punct_open2, punct_close2, text):
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[0].text == punct_open_add assert tokens[0].text == punct_open2
assert tokens[1].text == punct_open assert tokens[1].text == punct_open
assert tokens[2].text == text assert tokens[2].text == text
assert tokens[3].text == punct_close assert tokens[3].text == punct_close
assert tokens[4].text == punct_close_add assert tokens[4].text == punct_close2
@pytest.mark.parametrize('text,punct', [("(can't", "(")]) @pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct): def test_tokenizer_splits_pre_punct_regex(text, punct):
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = en_search_prefixes(text) match = en_search_prefixes(text)
assert match.group() == punct assert match.group() == punct

View File

@@ -29,8 +29,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
("""'Me too!', Mr. P. Delaware cried. """, 11), ("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6), ("They ran about 10km.", 6),
# ("But then the 6,000-year ice age came...", 10) pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length): def test_tokenizer_handles_cnts(en_tokenizer, text, length):
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
assert len(tokens) == length assert len(tokens) == length