spaCy/spacy/tests/lang/en/test_customized_tokenizer.py
Ines Montani b6e991440c 💫 Tidy up and auto-format tests (#2967)
* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

Test was previously overwritten by following test due to naming conflict, so failure wasn't reported

* Unfail passing test

* Only use fixture via arguments

Fixes pytest 4.0 compatibility
2018-11-27 01:09:36 +01:00

78 lines
1.9 KiB
Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
from spacy.util import compile_infix_regex
@pytest.fixture
def custom_en_tokenizer(en_vocab):
    """Build an English tokenizer that uses a custom list of infix patterns.

    The prefix and suffix patterns and the tokenizer exceptions are taken
    from ``English.Defaults``; only the infix patterns are replaced by the
    list defined below. ``en_vocab`` is a fixture supplied from outside this
    module and is passed to the ``Tokenizer`` as its vocabulary.

    Returns:
        A ``Tokenizer`` instance with ``token_match`` disabled.
    """
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    # Raw strings keep the regex backslashes literal; sequences such as "\."
    # or "\[" in a normal string literal are invalid escapes and trigger
    # warnings on modern Python versions. The pattern text is unchanged.
    custom_infixes = [
        r"\.\.\.+",
        r"(?<=[0-9])-(?=[0-9])",
        # '(?<=[0-9]+),(?=[0-9]+)',
        r"[0-9]+(,[0-9]+)+",
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
    """Check that the custom tokenizer splits "10-county" around the hyphen.

    Two sentences are tokenized: one without and one with a dangling hyphen
    after "8". Both must yield the exact token texts listed below.
    """
    # Both sentences produce the same tokens after "county".
    shared_tail = [
        "definitions",
        "are",
        "not",
        "used",
        "for",
        "the",
        "greater",
        "Southern",
        "California",
        "Megaregion",
        ".",
    ]

    plain = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
    plain_tokens = [token.text for token in custom_en_tokenizer(plain)]
    assert plain_tokens == ["The", "8", "and", "10", "-", "county"] + shared_tail

    # Here "8-" ends in a hyphen; this trailing "-" is the case that may
    # cause an AssertionError.
    dangling = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
    dangling_tokens = [token.text for token in custom_en_tokenizer(dangling)]
    assert dangling_tokens == ["The", "8", "-", "and", "10", "-", "county"] + shared_tail