diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index e66c1da58..58813ec58 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -31,7 +31,7 @@ def test_doc_api_getitem(en_tokenizer):
         tokens[len(tokens)]
 
     def to_str(span):
-        return '/'.join(token.text for token in span)
+        return '/'.join(token.text for token in span)
 
     span = tokens[1:1]
     assert not to_str(span)
@@ -193,7 +193,7 @@ def test_doc_api_runtime_error(en_tokenizer):
 
 
 def test_doc_api_right_edge(en_tokenizer):
-    # Test for bug occurring from Unshift action, causing incorrect right edge
+    """Test for bug occurring from Unshift action, causing incorrect right edge"""
     text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
 
@@ -202,7 +202,8 @@ def test_doc_api_right_edge(en_tokenizer):
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     assert doc[6].text == 'for'
     subtree = [w.text for w in doc[6].subtree]
-    assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
+    assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
+                       'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
     assert doc[6].right_edge.text == ','
 
 
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 41ef14b39..959ff017b 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -85,8 +85,8 @@ def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
     assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
     assert sum(tokens[0].vector) != sum(tokens[1].vector)
     assert numpy.isclose(
-        tokens[0].vector_norm,
-        numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
+        tokens[0].vector_norm,
+        numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
 
 
 def test_doc_token_api_ancestors(en_tokenizer):
diff --git a/spacy/tests/en/test_punct.py b/spacy/tests/en/test_punct.py
index e58db1e2b..06c3350c4 100644
--- a/spacy/tests/en/test_punct.py
+++ b/spacy/tests/en/test_punct.py
@@ -10,9 +10,6 @@ from ...util import compile_prefix_regex
 from ...language_data import TOKENIZER_PREFIXES
 
 
-
-en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
-
 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
@@ -99,7 +96,8 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
 
 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
+def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
+                                           punct_close, text):
     tokens = en_tokenizer(punct_open + text + punct_close)
     assert len(tokens) == 3
     assert tokens[0].text == punct_open
@@ -108,20 +106,22 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close
 
 
 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
-@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
+@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
 @pytest.mark.parametrize('text', ["Hello"])
-def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
-    tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
+def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
+                                  punct_open2, punct_close2, text):
+    tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
     assert len(tokens) == 5
-    assert tokens[0].text == punct_open_add
+    assert tokens[0].text == punct_open2
     assert tokens[1].text == punct_open
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
-    assert tokens[4].text == punct_close_add
+    assert tokens[4].text == punct_close2
 
 
 @pytest.mark.parametrize('text,punct', [("(can't", "(")])
 def test_tokenizer_splits_pre_punct_regex(text, punct):
+    en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
     match = en_search_prefixes(text)
     assert match.group() == punct
 
diff --git a/spacy/tests/en/test_text.py b/spacy/tests/en/test_text.py
index c7178fbf9..a99cfa29e 100644
--- a/spacy/tests/en/test_text.py
+++ b/spacy/tests/en/test_text.py
@@ -29,8 +29,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
     ("""'Me too!', Mr. P. Delaware cried. """, 11),
     ("They ran about 10km.", 6),
-    # ("But then the 6,000-year ice age came...", 10)
-    ])
+    pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
 def test_tokenizer_handles_cnts(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
     assert len(tokens) == length