spaCy/spacy/tests/lang/en/test_customized_tokenizer.py
Sofie 46dfe773e1 Replacing regex library with re to increase tokenization speed (#3218)
* replace unicode categories with raw list of code points

* simplifying ranges

* fixing variable length quotes

* removing redundant regular expression

* small cleanup of regexp notations

* quotes and alpha as ranges instead of alterations

* removed most regexp dependencies and features

* exponential backtracking - unit tests

* rewrote expression with pathological backtracking

* disabling double hyphen tests for now

* test additional variants of repeating punctuation

* remove regex and redundant backslashes from load_reddit script

* small typo fixes

* disable double punctuation test for russian

* clean up old comments

* format block code

* final cleanup

* naming consistency

* french strings as unicode for python 2 support

* french regular expression case insensitive
2019-02-01 18:05:22 +11:00

77 lines
1.9 KiB
Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
from spacy.util import compile_infix_regex
@pytest.fixture
def custom_en_tokenizer(en_vocab):
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
custom_infixes = [
"\.\.\.+",
"(?<=[0-9])-(?=[0-9])",
"[0-9]+(,[0-9]+)+",
"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
return Tokenizer(
en_vocab,
English.Defaults.tokenizer_exceptions,
prefix_re.search,
suffix_re.search,
infix_re.finditer,
token_match=None,
)
def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
context = [word.text for word in custom_en_tokenizer(sentence)]
assert context == [
"The",
"8",
"and",
"10",
"-",
"county",
"definitions",
"are",
"not",
"used",
"for",
"the",
"greater",
"Southern",
"California",
"Megaregion",
".",
]
# the trailing '-' may cause Assertion Error
sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
context = [word.text for word in custom_en_tokenizer(sentence)]
assert context == [
"The",
"8",
"-",
"and",
"10",
"-",
"county",
"definitions",
"are",
"not",
"used",
"for",
"the",
"greater",
"Southern",
"California",
"Megaregion",
".",
]