From 1139247532d42ccc16e2e1c548924d83d7615637 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 12:09:41 +0100 Subject: [PATCH 1/4] Revert changes to token_match priority from #4374 * Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns * Add lookahead and potentially slow lookbehind back to the default URL pattern * Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882 * Revert changes to Hungarian tokenizer * Revert (xfail) several URL tests to their status before #4374 * Update `tokenizer.explain()` and docs accordingly --- spacy/lang/hu/punctuation.py | 6 +++--- spacy/lang/tokenizer_exceptions.py | 6 +++++- spacy/tests/tokenizer/test_urls.py | 8 ++++++-- spacy/tokenizer.pyx | 14 +++++++++---- website/docs/usage/linguistic-features.md | 24 +++++++++++++---------- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..42dbc7bac 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" + # in order to support the prefix tokenization (see prefix test cases in test_urls). + r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -54,6 +56,8 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
+ # in order to support the suffix tokenization (see suffix test cases in test_urls), + r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..2d82e213c 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - "http://foo.com/blah_blah_(wikipedia)", - "http://foo.com/blah_blah_(wikipedia)_(again)", + pytest.param( + "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() + ), + pytest.param( + "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() + ), "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..6f7e44061 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -239,6 +239,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -455,6 +457,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -475,12 +481,12 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..60a6699a9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, 
prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we +2. Look for a token match. If there is a match, stop processing and keep this token. +3. Check whether we have an explicitly defined rule for this substring. If we do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. +6. If we can't consume a prefix or a suffix, look for a special case. 7. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. From 0c31f03ec5525cd33224a880b6d678c69019727d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 13:41:01 +0100 Subject: [PATCH 2/4] Update docs [ci skip] --- website/docs/usage/linguistic-features.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 60a6699a9..0ceae4c4f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -838,8 +838,6 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 565e0eef73fab8c394339239cc48e4a83e068dfd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 5 May 2020 10:35:33 +0200 Subject: [PATCH 3/4] Add tokenizer option for token match with affixes To fix the slow tokenizer URL (#4374) and allow `token_match` to take priority over prefixes and suffixes by default, introduce a new tokenizer option for a token match pattern that's applied after prefixes and suffixes but before infixes. 
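As a rough sketch of how the option described above could be used once this patch is applied (the digit pattern, the sample text and the reuse of `URL_PATTERN` are illustrative assumptions, not part of the patch):

    import re
    from spacy.lang.en import English
    from spacy.lang.tokenizer_exceptions import URL_PATTERN

    nlp = English()
    # token_match keeps priority over prefixes, suffixes and infixes
    # (hypothetical pattern for number ranges like "11-22"):
    nlp.tokenizer.token_match = re.compile(r"^\d+-\d+$").match
    # token_match_with_affixes is only consulted after prefixes and
    # suffixes have been split off, so the URL pattern no longer needs
    # lookarounds to cope with surrounding punctuation:
    nlp.tokenizer.token_match_with_affixes = re.compile("(?u)" + URL_PATTERN).match

    doc = nlp('See "https://example.com/about" and pages 11-22.')
    print([t.text for t in doc])
    # expected, roughly: ['See', '"', 'https://example.com/about', '"',
    #                     'and', 'pages', '11-22', '.']

Here the URL pattern only has to match the bare string left after the quotes are removed, which is what lets it drop the slow lookbehind reintroduced in the first patch of this series. The same sketch works after the final patch by writing `url_match` instead of `token_match_with_affixes`.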
--- spacy/lang/fr/tokenizer_exceptions.py | 4 --- spacy/lang/hu/tokenizer_exceptions.py | 3 +-- spacy/lang/tokenizer_exceptions.py | 7 ++--- spacy/language.py | 5 +++- spacy/tests/tokenizer/test_urls.py | 12 +++------ spacy/tokenizer.pxd | 1 + spacy/tokenizer.pyx | 37 +++++++++++++++++++++++---- website/docs/api/tokenizer.md | 3 ++- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index cb1702300..465626d39 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY -from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index f1eabd9aa..6a9a5363f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" - # in order to support the prefix tokenization (see prefix test cases in test_urls). - r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -56,13 +54,12 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
- # in order to support the suffix tokenization (see suffix test cases in test_urls), - r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index e89f80f08..d4f6c78ec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -86,6 +86,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + token_match_with_affixes = cls.token_match_with_affixes prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -103,10 +104,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + token_match_with_affixes=token_match_with_affixes, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2d82e213c..2f76111e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() - ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() - ), + "http://foo.com/blah_blah_(wikipedia)", + "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", @@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.token_match_with_affixes(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.token_match_with_affixes(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..70d49bb39 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _token_match_with_affixes cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 16a2cf27b..cf0421158 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + 
suffix_search=None, infix_finditer=None, token_match=None, + token_match_with_affixes=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. + token_match_with_affixes (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.token_match_with_affixes = token_match_with_affixes self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property token_match_with_affixes: + def __get__(self): + return self._token_match_with_affixes + + def __set__(self, token_match_with_affixes): + self._token_match_with_affixes = token_match_with_affixes + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.token_match_with_affixes) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -297,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.token_match_with_affixes and \ + self.token_match_with_affixes(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -450,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + token_match_with_affixes = self.token_match_with_affixes + if token_match_with_affixes is None: + token_match_with_affixes = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -485,6 +505,9 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif token_match_with_affixes(substring): + tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -549,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -570,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -585,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): + self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..f73e851f7 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -41,7 +41,8 @@ the | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. 
| -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} From e4a1b5dab1f2de60fa0ddbb3e80282b0749635da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 May 2020 12:41:03 +0200 Subject: [PATCH 4/4] Rename to url_match Rename to `url_match` and update docs. --- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 8 ++--- spacy/tests/tokenizer/test_urls.py | 4 +-- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 40 +++++++++++------------ website/docs/api/tokenizer.md | 16 ++++----- website/docs/usage/linguistic-features.md | 23 ++++++++----- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 6a9a5363f..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -59,7 +59,7 @@ URL_PATTERN = ( ).strip() TOKEN_MATCH = None -TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index 2c7f4e2b5..53a788f2a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,7 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - token_match_with_affixes = cls.token_match_with_affixes + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -107,12 +107,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, - token_match_with_affixes=token_match_with_affixes, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH - token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2f76111e5..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is None + assert en_tokenizer.url_match(url) is None 
@pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 70d49bb39..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,7 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match - cdef object _token_match_with_affixes + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index cf0421158..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -31,7 +31,7 @@ cdef class Tokenizer: """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None, - token_match_with_affixes=None): + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -44,7 +44,7 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. - token_match_with_affixes (callable): A boolean function matching strings to be + url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. @@ -58,7 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match - self.token_match_with_affixes = token_match_with_affixes + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -74,12 +74,12 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() - property token_match_with_affixes: + property url_match: def __get__(self): - return self._token_match_with_affixes + return self._url_match - def __set__(self, token_match_with_affixes): - self._token_match_with_affixes = token_match_with_affixes + def __set__(self, url_match): + self._url_match = url_match self._flush_cache() property prefix_search: @@ -125,7 +125,7 @@ cdef class Tokenizer: self.suffix_search, self.infix_finditer, self.token_match, - self.token_match_with_affixes) + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -311,8 +311,8 @@ cdef class Tokenizer: if cache_hit: pass elif (self.token_match and self.token_match(string)) or \ - (self.token_match_with_affixes and \ - self.token_match_with_affixes(string)): + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -467,9 +467,9 @@ cdef class Tokenizer: token_match = self.token_match if token_match is None: token_match = re.compile("a^").match - token_match_with_affixes = self.token_match_with_affixes - if token_match_with_affixes is None: - token_match_with_affixes = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -505,8 +505,8 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' - elif token_match_with_affixes(substring): - tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) @@ -572,7 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), + ("url_match", lambda: _get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -594,12 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), - ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -610,8 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): - self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index f73e851f7..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,15 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | 
----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 91ca1267b..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' elif substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this token. -3. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that the token match and special cases always get priority. +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. 5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -6. If we can't consume a prefix or a suffix, look for a special case. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. 
If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -836,6 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its
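A minimal sketch of the renamed option in use (the custom pattern and sample text are assumptions for illustration; only the `url_match` attribute and its position in the algorithm come from this patch):

    import re
    from spacy.lang.en import English

    nlp = English()
    # Hypothetical stricter pattern: only keep http(s) URLs unsplit.
    # Like the default URL pattern, it is applied only after prefixes
    # and suffixes have been removed from the substring.
    nlp.tokenizer.url_match = re.compile(r"^https?://\S+$").match

    doc = nlp('Docs live at "https://spacy.io/api/tokenizer".')
    print([t.text for t in doc])
    # expected, roughly: ['Docs', 'live', 'at', '"',
    #                     'https://spacy.io/api/tokenizer', '"', '.']

Because `url_match` is consulted only after affix handling, it never has to account for surrounding punctuation itself, while `token_match` remains the hook for strings that must never be split at all.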