Add test for #1488 (passes on v2.0.0a18?)

This commit is contained in:
ines 2017-11-03 14:44:36 +01:00
parent 711278b667
commit f0986df94b

View File

@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ...lang.en import English
from ...tokenizer import Tokenizer
def test_issue1488():
    """Regression test for issue #1488.

    Builds a Tokenizer with custom prefix, suffix, infix and token_match
    rules, installs it on a fresh English pipeline, and tokenizes a short
    sentence whose last character matches the infix pattern. Every token
    produced must have non-empty text.
    """
    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    # '.' is treated as an infix here, and the test sentence ends with one.
    infix_re = re.compile(r'''[-~\.]''')
    simple_url_re = re.compile(r'''^https?://''')

    def my_tokenizer(nlp):
        """Return a Tokenizer on nlp's vocab using the patterns above."""
        return Tokenizer(nlp.vocab, {},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=simple_url_re.match)

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    # Assert instead of printing so the test can actually fail.
    # NOTE(review): presumably #1488 concerned empty tokens produced around a
    # trailing infix match -- confirm against the issue report.
    for token in doc:
        assert token.text