mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Fix test formatting and consistency
This commit is contained in:
parent
3424e3a7e5
commit
a89e269a5a
|
@ -193,7 +193,7 @@ def test_doc_api_runtime_error(en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_right_edge(en_tokenizer):
|
def test_doc_api_right_edge(en_tokenizer):
|
||||||
# Test for bug occurring from Unshift action, causing incorrect right edge
|
"""Test for bug occurring from Unshift action, causing incorrect right edge"""
|
||||||
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
||||||
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
||||||
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
||||||
|
@ -202,7 +202,8 @@ def test_doc_api_right_edge(en_tokenizer):
|
||||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||||
assert doc[6].text == 'for'
|
assert doc[6].text == 'for'
|
||||||
subtree = [w.text for w in doc[6].subtree]
|
subtree = [w.text for w in doc[6].subtree]
|
||||||
assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
|
assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
|
||||||
|
'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
|
||||||
assert doc[6].right_edge.text == ','
|
assert doc[6].right_edge.text == ','
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,6 @@ from ...util import compile_prefix_regex
|
||||||
from ...language_data import TOKENIZER_PREFIXES
|
from ...language_data import TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
|
||||||
|
|
||||||
PUNCT_OPEN = ['(', '[', '{', '*']
|
PUNCT_OPEN = ['(', '[', '{', '*']
|
||||||
PUNCT_CLOSE = [')', ']', '}', '*']
|
PUNCT_CLOSE = [')', ']', '}', '*']
|
||||||
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
||||||
|
@ -99,7 +96,8 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize('text', ["Hello"])
|
||||||
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text):
|
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||||
|
punct_close, text):
|
||||||
tokens = en_tokenizer(punct_open + text + punct_close)
|
tokens = en_tokenizer(punct_open + text + punct_close)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert tokens[0].text == punct_open
|
assert tokens[0].text == punct_open
|
||||||
|
@ -108,20 +106,22 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||||
@pytest.mark.parametrize('punct_open_add,punct_close_add', [("`", "'")])
|
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize('text', ["Hello"])
|
||||||
def test_two_different(en_tokenizer, punct_open, punct_close, punct_open_add, punct_close_add, text):
|
def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||||
tokens = en_tokenizer(punct_open_add + punct_open + text + punct_close + punct_close_add)
|
punct_open2, punct_close2, text):
|
||||||
|
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[0].text == punct_open_add
|
assert tokens[0].text == punct_open2
|
||||||
assert tokens[1].text == punct_open
|
assert tokens[1].text == punct_open
|
||||||
assert tokens[2].text == text
|
assert tokens[2].text == text
|
||||||
assert tokens[3].text == punct_close
|
assert tokens[3].text == punct_close
|
||||||
assert tokens[4].text == punct_close_add
|
assert tokens[4].text == punct_close2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
||||||
def test_tokenizer_splits_pre_punct_regex(text, punct):
|
def test_tokenizer_splits_pre_punct_regex(text, punct):
|
||||||
|
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||||
match = en_search_prefixes(text)
|
match = en_search_prefixes(text)
|
||||||
assert match.group() == punct
|
assert match.group() == punct
|
||||||
|
|
||||||
|
|
|
@ -29,8 +29,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
||||||
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
|
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
|
||||||
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
||||||
("They ran about 10km.", 6),
|
("They ran about 10km.", 6),
|
||||||
# ("But then the 6,000-year ice age came...", 10)
|
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
|
||||||
])
|
|
||||||
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
|
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
Loading…
Reference in New Issue
Block a user