Add tokenizer option for token match with affixes

To fix slow URL matching in the tokenizer (#4374) and to allow `token_match` to take priority over prefixes and suffixes by default, introduce a new tokenizer option, `token_match_with_affixes`, for a token match pattern that is applied after prefixes and suffixes but before infixes. The URL pattern moves from `token_match` to this new option.
2026-01-04 07:53:24 +03:00 · 2020-05-05 10:35:33 +02:00 · 2020-05-05 10:35:33 +02:00 · 565e0eef73
commit 565e0eef73
parent 792c8af8cf
8 changed files with 46 additions and 26 deletions
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re

 from .punctuation import ELISION, HYPHENS
-from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA

@@ -455,9 +454,6 @@ _regular_exp += [
    for hc in _hyphen_combination
 ]

-# URLs
-_regular_exp.append(URL_PATTERN)
-

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re

 from ..punctuation import ALPHA_LOWER, CURRENCY
-from ..tokenizer_exceptions import URL_PATTERN
 from ...symbols import ORTH


@@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(


 TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 URL_PATTERN = (
    # fmt: off
    r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[" + ALPHA + "\w])"
    # protocol identifier (mods: make optional and expand schemes)
    # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
    r"(?:(?:[\w\+\-\.]{2,})://)?"
@@ -56,13 +54,12 @@ URL_PATTERN = (
    r"(?::\d{2,5})?"
    # resource path
    r"(?:[/?#]\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[" + ALPHA + "\w/])"
    r"$"
    # fmt: on
 ).strip()

-TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
+TOKEN_MATCH = None
+TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match


 BASE_EXCEPTIONS = {}
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
 from .attrs import IS_STOP, LANG
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES
 from .lang.tag_map import TAG_MAP
 from .tokens import Doc
 from .lang.lex_attrs import LEX_ATTRS, is_stop
@@ -86,6 +86,7 @@ class BaseDefaults(object):
    def create_tokenizer(cls, nlp=None):
        rules = cls.tokenizer_exceptions
        token_match = cls.token_match
+        token_match_with_affixes = cls.token_match_with_affixes
        prefix_search = (
            util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
        )
@@ -103,10 +104,12 @@ class BaseDefaults(object):
            suffix_search=suffix_search,
            infix_finditer=infix_finditer,
            token_match=token_match,
+            token_match_with_affixes=token_match_with_affixes,
        )

    pipe_names = ["tagger", "parser", "ner"]
    token_match = TOKEN_MATCH
+    token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [
    pytest.param(
        "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
    ),
-    pytest.param(
-        "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
-    ),
-    pytest.param(
-        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
-    ),
+    "http://foo.com/blah_blah_(wikipedia)",
+    "http://foo.com/blah_blah_(wikipedia)_(again)",
    "http://www.foo.co.uk",
    "http://www.foo.co.uk/",
    "http://www.foo.co.uk/blah/blah",
@@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]

 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is not None
+    assert en_tokenizer.token_match_with_affixes(url) is not None


 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is None
+    assert en_tokenizer.token_match_with_affixes(url) is None


 @pytest.mark.parametrize("url", URLS_BASIC)
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -17,6 +17,7 @@ cdef class Tokenizer:
    cpdef readonly Vocab vocab

    cdef object _token_match
+    cdef object _token_match_with_affixes
    cdef object _prefix_search
    cdef object _suffix_search
    cdef object _infix_finditer
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -30,7 +30,8 @@ cdef class Tokenizer:
    DOCS: https://spacy.io/api/tokenizer
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
-                 suffix_search=None, infix_finditer=None, token_match=None):
+                 suffix_search=None, infix_finditer=None, token_match=None,
+                 token_match_with_affixes=None):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
@@ -43,6 +44,8 @@ cdef class Tokenizer:
            `re.compile(string).finditer` to find infixes.
        token_match (callable): A boolean function matching strings to be
            recognised as tokens.
+        token_match_with_affixes (callable): A boolean function matching strings to be
+            recognised as tokens after considering prefixes and suffixes.
        RETURNS (Tokenizer): The newly constructed object.

        EXAMPLE:
@@ -55,6 +58,7 @@ cdef class Tokenizer:
        self._cache = PreshMap()
        self._specials = PreshMap()
        self.token_match = token_match
+        self.token_match_with_affixes = token_match_with_affixes
        self.prefix_search = prefix_search
        self.suffix_search = suffix_search
        self.infix_finditer = infix_finditer
@@ -70,6 +74,14 @@ cdef class Tokenizer:
            self._token_match = token_match
            self._flush_cache()

+    property token_match_with_affixes:
+        def __get__(self):
+            return self._token_match_with_affixes
+
+        def __set__(self, token_match_with_affixes):
+            self._token_match_with_affixes = token_match_with_affixes
+            self._flush_cache()
+
    property prefix_search:
        def __get__(self):
            return self._prefix_search
@@ -108,11 +120,12 @@ cdef class Tokenizer:

    def __reduce__(self):
        args = (self.vocab,
-                self._rules,
+                self.rules,
                self.prefix_search,
                self.suffix_search,
                self.infix_finditer,
-                self.token_match)
+                self.token_match,
+                self.token_match_with_affixes)
        return (self.__class__, args, None, None)

    cpdef Doc tokens_from_list(self, list strings):
@@ -297,7 +310,9 @@ cdef class Tokenizer:
            cache_hit = self._try_cache(hash_string(string), tokens)
            if cache_hit:
                pass
-            elif self.token_match and self.token_match(string):
+            elif (self.token_match and self.token_match(string)) or \
+                    (self.token_match_with_affixes and \
+                    self.token_match_with_affixes(string)):
                # We're always saying 'no' to spaces here -- the caller will
                # fix up the outermost one, with reference to the original.
                # See Issue #859
@@ -450,6 +465,11 @@ cdef class Tokenizer:
        suffix_search = self.suffix_search
        infix_finditer = self.infix_finditer
        token_match = self.token_match
+        if token_match is None:
+            token_match = re.compile("a^").match
+        token_match_with_affixes = self.token_match_with_affixes
+        if token_match_with_affixes is None:
+            token_match_with_affixes = re.compile("a^").match
        special_cases = {}
        for orth, special_tokens in self.rules.items():
            special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -485,6 +505,9 @@ cdef class Tokenizer:
                if token_match(substring):
                    tokens.append(("TOKEN_MATCH", substring))
                    substring = ''
+                elif token_match_with_affixes(substring):
+                    tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring))
+                    substring = ''
                elif substring in special_cases:
                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                    substring = ''
@@ -549,6 +572,7 @@ cdef class Tokenizer:
            ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
            ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
            ("token_match", lambda: _get_regex_pattern(self.token_match)),
+            ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)),
            ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
        ))
        exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -570,11 +594,12 @@ cdef class Tokenizer:
            ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
            ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
            ("token_match", lambda b: data.setdefault("token_match", b)),
+            ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)),
            ("exceptions", lambda b: data.setdefault("rules", b))
        ))
        exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
        msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
+        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]:
            if key in data:
                data[key] = unescape_unicode(data[key])
        if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -585,6 +610,8 @@ cdef class Tokenizer:
            self.infix_finditer = re.compile(data["infix_finditer"]).finditer
        if "token_match" in data and isinstance(data["token_match"], basestring_):
            self.token_match = re.compile(data["token_match"]).match
+        if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_):
+            self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match
        if "rules" in data and isinstance(data["rules"], dict):
            # make sure to hard reset the cache to remove data from the default exceptions
            self._rules = {}
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -41,7 +41,8 @@ the
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.                                           |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.                                           |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.                                           |
-| `token_match`    | callable    | A function matching the signature of `re.compile(string).match to find token matches.                                         |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.                                         |
+| `token_match_with_affixes`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes.                                         |
 | **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                 |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}