Mirror of https://github.com/explosion/spaCy.git
Fix test formatting and consistency
parent 3424e3a7e5
commit a89e269a5a
@@ -31,7 +31,7 @@ def test_doc_api_getitem(en_tokenizer):
         tokens[len(tokens)]

     def to_str(span):
         return '/'.join(token.text for token in span)

     span = tokens[1:1]
     assert not to_str(span)

@@ -193,7 +193,7 @@ def test_doc_api_runtime_error(en_tokenizer):


 def test_doc_api_right_edge(en_tokenizer):
-    # Test for bug occurring from Unshift action, causing incorrect right edge
+    """Test for bug occurring from Unshift action, causing incorrect right edge"""
     text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
     heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
              -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]

@@ -202,7 +202,8 @@ def test_doc_api_right_edge(en_tokenizer):
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     assert doc[6].text == 'for'
     subtree = [w.text for w in doc[6].subtree]
-    assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
+    assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
+                       'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
     assert doc[6].right_edge.text == ','

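Side note on the hunk above: the `heads` list gives each token's syntactic head as an offset relative to the token's own position (0 means the token is its own head, i.e. the root), and the suite's `get_doc` helper turns those offsets into a `Doc`. A minimal sketch of that conversion with a hypothetical helper name, not code from this commit:

    # Sketch: turn relative head offsets (as in the `heads` list above) into
    # absolute token indices. heads[i] is the offset from token i to its head,
    # so 0 marks the root.
    def relative_to_absolute_heads(heads):
        return [i + offset for i, offset in enumerate(heads)]

    # Example: heads = [2, 1, 0] -> [2, 2, 2], i.e. every token attaches to token 2.
    assert relative_to_absolute_heads([2, 1, 0]) == [2, 2, 2]
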
@@ -85,8 +85,8 @@ def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
     assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
     assert sum(tokens[0].vector) != sum(tokens[1].vector)
     assert numpy.isclose(
         tokens[0].vector_norm,
         numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))


 def test_doc_token_api_ancestors(en_tokenizer):

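The `numpy.isclose` assertion above checks that `vector_norm` is the L2 norm of the token's vector. An equivalent standalone check with a made-up vector (a sketch, not part of the commit):

    import numpy

    vec = numpy.array([3.0, 4.0])
    norm = numpy.sqrt(numpy.dot(vec, vec))               # the expression used in the test
    assert numpy.isclose(norm, 5.0)                      # 3-4-5 triangle
    assert numpy.isclose(norm, numpy.linalg.norm(vec))   # numpy's built-in L2 norm agrees
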
@@ -10,9 +10,6 @@ from ...util import compile_prefix_regex
 from ...language_data import TOKENIZER_PREFIXES


-en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
-
-
 PUNCT_OPEN = ['(', '[', '{', '*']
 PUNCT_CLOSE = [')', ']', '}', '*']
 PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]

@@ -99,7 +96,8 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):

 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
+def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
+                                           punct_close, text):
     tokens = en_tokenizer(punct_open + text + punct_close)
     assert len(tokens) == 3
     assert tokens[0].text == punct_open

@@ -108,20 +106,22 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close


 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
-@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
+@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
 @pytest.mark.parametrize('text', ["Hello"])
-def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
-    tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
+def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
+                                  punct_open2, punct_close2, text):
+    tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
     assert len(tokens) == 5
-    assert tokens[0].text == punct_open_add
+    assert tokens[0].text == punct_open2
     assert tokens[1].text == punct_open
     assert tokens[2].text == text
     assert tokens[3].text == punct_close
-    assert tokens[4].text == punct_close_add
+    assert tokens[4].text == punct_close2


 @pytest.mark.parametrize('text,punct', [("(can't", "(")])
 def test_tokenizer_splits_pre_punct_regex(text, punct):
+    en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
     match = en_search_prefixes(text)
     assert match.group() == punct

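The `en_search_prefixes` line added inside `test_tokenizer_splits_pre_punct_regex` above is the same expression the earlier hunk removed from module level. A standalone sketch of what it does, keeping the test module's relative imports (roughly `spacy.util` and `spacy.language_data` at the time) and the parametrized case from this hunk; `leading_prefix` is a hypothetical wrapper, not code from the commit:

    # Relative imports as in the test module's header (within the spacy package):
    from ...util import compile_prefix_regex
    from ...language_data import TOKENIZER_PREFIXES

    def leading_prefix(text):
        # compile_prefix_regex joins all prefix pieces into one regex;
        # .search returns the prefix match at the start of the string, if any.
        en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
        match = en_search_prefixes(text)
        return match.group() if match else None

    # Parametrized case above: the opening bracket is recognised as a prefix.
    assert leading_prefix("(can't") == "("
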
@@ -29,8 +29,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
     ("""'Me too!', Mr. P. Delaware cried. """, 11),
     ("They ran about 10km.", 6),
-    # ("But then the 6,000-year ice age came...", 10)
-    ])
+    pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
 def test_tokenizer_handles_cnts(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
     assert len(tokens) == length
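The last hunk moves the commented-out sentence into the parametrize list wrapped in `pytest.mark.xfail(...)`, a per-case expected failure in the style pytest accepted at the time. Current pytest versions spell the same intent with `pytest.param`; a sketch of the equivalent, assuming the suite's `en_tokenizer` fixture:

    import pytest

    @pytest.mark.parametrize('text,length', [
        ("They ran about 10km.", 6),
        # Per-case xfail in the form newer pytest requires:
        pytest.param("But then the 6,000-year ice age came...", 10,
                     marks=pytest.mark.xfail),
    ])
    def test_tokenizer_handles_cnts(en_tokenizer, text, length):
        tokens = en_tokenizer(text)
        assert len(tokens) == length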