mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix test formatting and consistency
This commit is contained in:
parent
3424e3a7e5
commit
a89e269a5a
|
@ -31,7 +31,7 @@ def test_doc_api_getitem(en_tokenizer):
|
|||
tokens[len(tokens)]
|
||||
|
||||
def to_str(span):
|
||||
return '/'.join(token.text for token in span)
|
||||
return '/'.join(token.text for token in span)
|
||||
|
||||
span = tokens[1:1]
|
||||
assert not to_str(span)
|
||||
|
@ -193,7 +193,7 @@ def test_doc_api_runtime_error(en_tokenizer):
|
|||
|
||||
|
||||
def test_doc_api_right_edge(en_tokenizer):
|
||||
# Test for bug occurring from Unshift action, causing incorrect right edge
|
||||
"""Test for bug occurring from Unshift action, causing incorrect right edge"""
|
||||
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
||||
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
||||
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
||||
|
@ -202,7 +202,8 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
assert doc[6].text == 'for'
|
||||
subtree = [w.text for w in doc[6].subtree]
|
||||
assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
|
||||
assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
|
||||
'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
|
||||
assert doc[6].right_edge.text == ','
|
||||
|
||||
|
||||
|
|
|
@ -85,8 +85,8 @@ def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
|
|||
assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
|
||||
assert sum(tokens[0].vector) != sum(tokens[1].vector)
|
||||
assert numpy.isclose(
|
||||
tokens[0].vector_norm,
|
||||
numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
|
||||
tokens[0].vector_norm,
|
||||
numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
|
||||
|
||||
|
||||
def test_doc_token_api_ancestors(en_tokenizer):
|
||||
|
|
|
@ -10,9 +10,6 @@ from ...util import compile_prefix_regex
|
|||
from ...language_data import TOKENIZER_PREFIXES
|
||||
|
||||
|
||||
|
||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
|
||||
PUNCT_OPEN = ['(', '[', '{', '*']
|
||||
PUNCT_CLOSE = [')', ']', '}', '*']
|
||||
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
||||
|
@ -99,7 +96,8 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
|||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
|
||||
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||
punct_close, text):
|
||||
tokens = en_tokenizer(punct_open + text + punct_close)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == punct_open
|
||||
|
@ -108,20 +106,22 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close
|
|||
|
||||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
|
||||
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
|
||||
tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
|
||||
def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||
punct_open2, punct_close2, text):
|
||||
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[0].text == punct_open_add
|
||||
assert tokens[0].text == punct_open2
|
||||
assert tokens[1].text == punct_open
|
||||
assert tokens[2].text == text
|
||||
assert tokens[3].text == punct_close
|
||||
assert tokens[4].text == punct_close_add
|
||||
assert tokens[4].text == punct_close2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
||||
def test_tokenizer_splits_pre_punct_regex(text, punct):
|
||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
match = en_search_prefixes(text)
|
||||
assert match.group() == punct
|
||||
|
||||
|
|
|
@ -29,8 +29,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
|||
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
|
||||
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
||||
("They ran about 10km.", 6),
|
||||
# ("But then the 6,000-year ice age came...", 10)
|
||||
])
|
||||
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
|
||||
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
|
Loading…
Reference in New Issue
Block a user