From 1748549aebce44b3e3dce42815fd281b4e6894bd Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:16:19 +0100 Subject: [PATCH 1/5] Added exception pattern mechanism to the tokenizer. --- spacy/language.py | 6 ++++- spacy/language_data/__init__.py | 1 + spacy/language_data/special_cases.py | 5 ++++ spacy/tests/tokenizer/test_urls.py | 19 ++++++++++++++ spacy/tokenizer.pxd | 2 ++ spacy/tokenizer.pyx | 39 +++++++++++++++++++++------- spacy/util.py | 5 ++++ 7 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/tests/tokenizer/test_urls.py diff --git a/spacy/language.py b/spacy/language.py index c6f1376a4..16bffcd7b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,6 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions + if cls.exception_patterns: + rule_match = util.compile_rule_regex(cls.exception_patterns).match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -80,7 +82,7 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, + return Tokenizer(vocab, rules=rules, rule_match=rule_match, prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer) @@ -142,6 +144,8 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline + exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + prefixes = tuple(language_data.TOKENIZER_PREFIXES) suffixes = tuple(language_data.TOKENIZER_SUFFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index f6aa4317c..aa379d86d 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,3 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * +from .special_cases import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py new file mode 100644 index 000000000..e7b2be5a5 --- /dev/null +++ b/spacy/language_data/special_cases.py @@ -0,0 +1,5 @@ +from __future__ import unicode_literals + +EXCEPTION_PATTERNS = r''' +((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) 
+'''.strip().split() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py new file mode 100644 index 000000000..5d0654d50 --- /dev/null +++ b/spacy/tests/tokenizer/test_urls.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize("text", [ + u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", + u"www.google.com?q=google", + u"google.com", + u"www.red-stars.com", + pytest.mark.xfail(u"red-stars.com"), + u"http://foo.com/blah_(wikipedia)#cite-1", + u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", + u"mailto:foo.bar@baz.com", + u"mailto:foo-bar@baz-co.com" +]) +def test_simple_url(en_tokenizer, text): + tokens = en_tokenizer(text) + assert tokens[0].orth_ == text diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index e53b7dbd1..24c76f7ee 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,6 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab + cdef public object rule_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -24,6 +25,7 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 + cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 66c93528b..ec5b5ea87 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,7 +28,7 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, infix_finditer=None): '''Load a Tokenizer, reading unsupplied components from the path. @@ -39,6 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + Special case matcher. Signature of re.compile(string).match prefix_search: Signature of re.compile(string).search suffix_search: @@ -65,10 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -76,6 +77,9 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. 
@@ -89,6 +93,7 @@ cdef class Tokenizer: self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() + self.rule_match = rule_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -100,8 +105,9 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self._prefix_re, - self._suffix_re, + self.rule_match, + self._prefix_re, + self._suffix_re, self._infix_re) return (self.__class__, args, None, None) @@ -202,9 +208,12 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) - self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + if self._match_rule(span): + tokens.push_back(self.vocab.get(tokens.mem, span), False) + else: + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + self._attach_tokens(tokens, span, &prefixes, &suffixes) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, @@ -314,6 +323,18 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) + cdef int _match_rule(self, unicode string): + """Check whether the given string matches any of the patterns. + + string (unicode): The string to segment. + + Returns (int or None): The length of the prefix if present, otherwise None. + """ + if self.rule_match is None: + return 0 + match = self.rule_match(string) + return (match.end() - match.start()) if match is not None else 0 + def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index afed4142e..316e431ad 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,6 +108,11 @@ def compile_infix_regex(entries): return re.compile(expression) +def compile_rule_regex(entries): + expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' + return re.compile(expression) + + def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From d9c59c47517843f97ea3e36b0db66879e2af1b5d Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 21 Dec 2016 23:30:49 +0100 Subject: [PATCH 2/5] Maintaining backward compatibility. --- spacy/tokenizer.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index ec5b5ea87..4aabdb3db 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -28,8 +28,8 @@ from .tokens.doc cimport Doc cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod - def load(cls, path, Vocab vocab, rules=None, rule_match = None, prefix_search=None, suffix_search=None, - infix_finditer=None): + def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, + infix_finditer=None, rule_match = None): '''Load a Tokenizer, reading unsupplied components from the path. 
Arguments: @@ -69,7 +69,7 @@ cdef class Tokenizer: infix_finditer = util.compile_infix_regex(entries).finditer return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) - def __init__(self, Vocab vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -77,9 +77,6 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. prefix_search: A function matching the signature of re.compile(string).search to match prefixes. @@ -89,6 +86,9 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. + rule_match: + A function matching the signature of re.compile(string).match + to match special cases for the tokenizer. ''' self.mem = Pool() self._cache = PreshMap() From 3a9be4d485f8f80201ddf8056e16bf39c18ceff3 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Fri, 23 Dec 2016 23:49:34 +0100 Subject: [PATCH 3/5] Updated token exception handling mechanism to allow the usage of arbitrary functions as token exception matchers. --- spacy/language.py | 10 +++--- spacy/language_data/__init__.py | 2 +- spacy/language_data/special_cases.py | 5 --- spacy/language_data/tokenizer_exceptions.py | 11 +++++++ spacy/tokenizer.pxd | 3 +- spacy/tokenizer.pyx | 35 +++++++-------------- spacy/util.py | 5 --- 7 files changed, 29 insertions(+), 42 deletions(-) delete mode 100644 spacy/language_data/special_cases.py create mode 100644 spacy/language_data/tokenizer_exceptions.py diff --git a/spacy/language.py b/spacy/language.py index 16bffcd7b..bebdeab20 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,8 +67,8 @@ class BaseDefaults(object): @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions - if cls.exception_patterns: - rule_match = util.compile_rule_regex(cls.exception_patterns).match + if cls.token_match: + token_match = cls.token_match if cls.prefixes: prefix_search = util.compile_prefix_regex(cls.prefixes).search else: @@ -82,9 +82,9 @@ class BaseDefaults(object): else: infix_finditer = None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - return Tokenizer(vocab, rules=rules, rule_match=rule_match, + return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer) + infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None): @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.entity) return pipeline - exception_patterns = tuple(language_data.EXCEPTION_PATTERNS) + token_match = language_data.TOKEN_MATCH prefixes = tuple(language_data.TOKENIZER_PREFIXES) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index aa379d86d..028924796 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -3,4 +3,4 @@ from .punctuation import * from .tag_map import * from .entity_rules import * from .util import * -from .special_cases import * +from .tokenizer_exceptions import * diff --git a/spacy/language_data/special_cases.py b/spacy/language_data/special_cases.py deleted file mode 100644 index e7b2be5a5..000000000 --- 
a/spacy/language_data/special_cases.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import unicode_literals - -EXCEPTION_PATTERNS = r''' -((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?) -'''.strip().split() diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py new file mode 100644 index 000000000..6551440f2 --- /dev/null +++ b/spacy/language_data/tokenizer_exceptions.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import re + +_URL_PATTERN = r''' +^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$ +'''.strip() + +TOKEN_MATCH = re.compile(_URL_PATTERN).match + +__all__ = ['TOKEN_MATCH'] diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 24c76f7ee..1a3e86b49 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,7 +16,7 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef public object rule_match + cdef public object token_match cdef public object prefix_search cdef public object suffix_search cdef public object infix_finditer @@ -25,7 +25,6 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) cdef int _try_cache(self, hash_t key, Doc tokens) except -1 - cdef int _match_rule(self, unicode string) cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4aabdb3db..63ac84482 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, rule_match = None): + infix_finditer=None, token_match = None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: @@ -39,8 +39,8 @@ cdef class Tokenizer: A storage container for lexical types. rules (dict): Exceptions and special-cases for the tokenizer. - rule_match: - Special case matcher. Signature of re.compile(string).match + token_match: + A boolean function matching strings that becomes tokens. prefix_search: Signature of re.compile(string).search suffix_search: @@ -67,9 +67,9 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'infix.txt').open() as file_: entries = file_.read().split('\n') infix_finditer = util.compile_infix_regex(entries).finditer - return cls(vocab, rules, rule_match, prefix_search, suffix_search, infix_finditer) + return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, rule_match=None): + def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): '''Create a Tokenizer, to create Doc objects given unicode text. Arguments: @@ -86,14 +86,13 @@ cdef class Tokenizer: infix_finditer: A function matching the signature of re.compile(string).finditer to find infixes. - rule_match: - A function matching the signature of re.compile(string).match - to match special cases for the tokenizer. + token_match: + A boolean function matching strings that becomes tokens. 
''' self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() - self.rule_match = rule_match + self.token_match = token_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -105,10 +104,10 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self.rule_match, self._prefix_re, self._suffix_re, - self._infix_re) + self._infix_re, + self.token_match) return (self.__class__, args, None, None) @@ -208,7 +207,7 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - if self._match_rule(span): + if self.token_match and self.token_match(span): tokens.push_back(self.vocab.get(tokens.mem, span), False) else: span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) @@ -323,18 +322,6 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - cdef int _match_rule(self, unicode string): - """Check whether the given string matches any of the patterns. - - string (unicode): The string to segment. - - Returns (int or None): The length of the prefix if present, otherwise None. - """ - if self.rule_match is None: - return 0 - match = self.rule_match(string) - return (match.end() - match.start()) if match is not None else 0 - def find_infix(self, unicode string): """Find internal split points of the string, such as hyphens. diff --git a/spacy/util.py b/spacy/util.py index 316e431ad..afed4142e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -108,11 +108,6 @@ def compile_infix_regex(entries): return re.compile(expression) -def compile_rule_regex(entries): - expression = '|'.join([piece for piece in entries if piece.strip()]) + '$' - return re.compile(expression) - - def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." From 623d94e14f5d7fc2162529353dbcbde0e4c38564 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:30:28 +1100 Subject: [PATCH 4/5] Whitespace --- spacy/tokenizer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 63ac84482..4fe12fc3f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -29,7 +29,7 @@ cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries.""" @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, - infix_finditer=None, token_match = None): + infix_finditer=None, token_match=None): '''Load a Tokenizer, reading unsupplied components from the path. Arguments: From 3e8d9c772e381f7d4befb4035055c6a867561979 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 31 Dec 2016 00:52:17 +1100 Subject: [PATCH 5/5] Test interaction of token_match and punctuation Check that the new token_match function applies after punctuation is split off. 
---
 spacy/tests/tokenizer/test_urls.py | 64 ++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 5d0654d50..1a964d5e5 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -2,8 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
 
-
-@pytest.mark.parametrize("text", [
+URLS = [
     u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news&_r=0",
     u"www.google.com?q=google",
     u"google.com",
     u"www.red-stars.com",
     pytest.mark.xfail(u"red-stars.com"),
     u"http://foo.com/blah_(wikipedia)#cite-1",
     u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux",
     u"mailto:foo.bar@baz.com",
     u"mailto:foo-bar@baz-co.com"
-])
+]
+
+# Punctuation we want to check is split away before the URL
+PREFIXES = [
+    "(", '"', "...", ":", "<", ">", ")"
+]
+
+# Punctuation we want to check is split away after the URL
+SUFFIXES = [
+    "(", '"', "...", ":", "<", ">"]
+
+@pytest.mark.parametrize("text", URLS)
 def test_simple_url(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert tokens[0].orth_ == text
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_prefixed_url(en_tokenizer, prefix, url):
+    tokens = en_tokenizer(prefix + url)
+    assert tokens[0].text == prefix
+    assert tokens[1].text == url
+    assert len(tokens) == 2
+
+@pytest.mark.parametrize("suffix", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_suffixed_url(en_tokenizer, suffix, url):
+    tokens = en_tokenizer(url + suffix)
+    assert tokens[1].text == suffix
+    assert tokens[0].text == url
+    assert len(tokens) == 2
+
+@pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("suffix", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_surround_url(en_tokenizer, prefix, suffix, url):
+    tokens = en_tokenizer(prefix + url + suffix)
+    assert tokens[0].text == prefix
+    assert tokens[1].text == url
+    assert tokens[2].text == suffix
+    assert len(tokens) == 3
+
+@pytest.mark.parametrize("prefix1", PREFIXES)
+@pytest.mark.parametrize("prefix2", PREFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_two_prefix_url(en_tokenizer, prefix1, prefix2, url):
+    tokens = en_tokenizer(prefix1 + prefix2 + url)
+    assert tokens[0].text == prefix1
+    assert tokens[1].text == prefix2
+    assert tokens[2].text == url
+    assert len(tokens) == 3
+
+@pytest.mark.parametrize("suffix1", SUFFIXES)
+@pytest.mark.parametrize("suffix2", SUFFIXES)
+@pytest.mark.parametrize("url", URLS)
+def test_two_suffix_url(en_tokenizer, suffix1, suffix2, url):
+    tokens = en_tokenizer(url + suffix1 + suffix2)
+    assert tokens[0].text == url
+    assert tokens[1].text == suffix1
+    assert tokens[2].text == suffix2
+    assert len(tokens) == 3
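
Usage sketch (not part of the patch series): the snippet below shows how the token_match hook introduced above can be exercised and swapped for a custom callable. It assumes the spaCy 1.x API of this era (spacy.load('en') with an English model installed); HANDLE_MATCH and custom_token_match are hypothetical names used only for illustration, while TOKEN_MATCH and the tokenizer's public token_match attribute come directly from these patches.

    # Minimal sketch of the token_match hook (assumes spaCy 1.x with an 'en' model).
    from __future__ import unicode_literals
    import re

    import spacy
    from spacy.language_data import TOKEN_MATCH   # re.compile(_URL_PATTERN).match

    # The hook is any callable returning a truthy value for strings that should be
    # kept as single tokens; _tokenize consults it before splitting off affixes.
    assert TOKEN_MATCH(u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux")
    assert TOKEN_MATCH(u"hello") is None

    # Hypothetical extension: also keep Twitter-style @handles as single tokens.
    HANDLE_MATCH = re.compile(r'^@\w{1,15}$').match

    def custom_token_match(string):
        return TOKEN_MATCH(string) or HANDLE_MATCH(string)

    nlp = spacy.load('en')                            # assumes a 1.x English model
    nlp.tokenizer.token_match = custom_token_match    # settable: declared cdef public
    doc = nlp(u"ping @spacy_io at http://www.example.com/wpstyle/?bar=baz&inga=42&quux")
    print([t.text for t in doc])                      # the handle and the URL each stay whole

Because token_match is checked in _tokenize before any prefix or suffix splitting, the whitespace-delimited spans above are kept whole; the tests added in the final patch additionally probe how the hook interacts with surrounding punctuation.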