Revert changes to token_match priority from #4374

* Revert changes to priority of `token_match` so that it has priority
over all other tokenizer patterns

* Add lookahead and potentially slow lookbehind back to the default URL
pattern

* Expand character classes in URL pattern to improve matching around
lookaheads and lookbehinds related to #4882

* Revert changes to Hungarian tokenizer

* Revert (xfail) several URL tests to their status before #4374

* Update `tokenizer.explain()` and docs accordingly
Author: Adriane Boyd
Date: 2020-03-09 12:09:41 +01:00
Parent: 0345135167
Commit: 1139247532
5 changed files with 38 additions and 20 deletions
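For context, a minimal sketch of the intended end-to-end behavior after this revert (the pipeline, sentence, and URL below are illustrative, not taken from the changeset): with the URL pattern back in `token_match` at top priority, a URL wrapped in punctuation should come out as a single token while the surrounding punctuation is still split off by the affix rules.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("See (http://www.example.com/blah) today.")
# The parentheses and the trailing period are removed by the prefix/suffix
# rules; the URL itself is expected to survive as one token via token_match.
print([t.text for t in doc])
# Roughly: ['See', '(', 'http://www.example.com/blah', ')', 'today', '.']
```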

spacy/lang/hu/punctuation.py

@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
 _prefixes = (
     LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
 )
 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
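As an illustration of the restored Hungarian suffix rules (the example word is made up, and this is a sketch rather than a test from the commit), the re-added bare `\+` suffix means a trailing plus sign should again be split off even when it does not follow a digit:

```python
import spacy

nlp = spacy.blank("hu")
# With [r"\+"] back in _suffixes, a trailing "+" is treated as a suffix even
# after a letter; previously only the digit-conditioned rule (?<=[0-9])\+ applied.
print([t.text for t in nlp("valami+")])
# Expected under these rules: ['valami', '+']
```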

spacy/lang/tokenizer_exceptions.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
-from .char_classes import ALPHA_LOWER
+from .char_classes import ALPHA_LOWER, ALPHA
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
@@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 URL_PATTERN = (
     # fmt: off
     r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls).
+    r"(?=[" + ALPHA + "\w])"
     # protocol identifier (mods: make optional and expand schemes)
     # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
     r"(?:(?:[\w\+\-\.]{2,})://)?"
@@ -54,6 +56,8 @@ URL_PATTERN = (
     r"(?::\d{2,5})?"
     # resource path
     r"(?:[/?#]\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls),
+    r"(?<=[" + ALPHA + "\w/])"
     r"$"
     # fmt: on
 ).strip()
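The effect of the added lookaround assertions is easier to see on a toy pattern. The sketch below is not the full URL_PATTERN, just a simplified stand-in with the same shape (a `^(?=...)` lookahead and a `(?<=...)$` lookbehind, restricted here to ASCII word characters instead of the ALPHA class): a candidate that still carries surrounding punctuation fails to match, so the tokenizer keeps stripping prefixes and suffixes until a clean URL remains.

```python
import re

# Simplified stand-in for URL_PATTERN: must start with a word character and
# end with a word character or "/", with an optional scheme in between.
toy_url = re.compile(r"^(?=\w)(?:\w+://)?\S+(?<=[\w/])$")

for candidate in ['"http://example.com"', 'http://example.com"', "http://example.com"]:
    print(candidate, "->", bool(toy_url.match(candidate)))
# Only the last candidate, with no leftover quotes, matches.
```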

spacy/tests/tokenizer/test_urls.py

@@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [
     pytest.param(
         "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
     ),
-    "http://foo.com/blah_blah_(wikipedia)",
-    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
+    ),
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
+    ),
     "http://www.foo.co.uk",
     "http://www.foo.co.uk/",
     "http://www.foo.co.uk/blah/blah",

spacy/tokenizer.pyx

@@ -239,6 +239,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
                 break
@@ -455,6 +457,10 @@ cdef class Tokenizer:
             suffixes = []
             while substring:
                 while prefix_search(substring) or suffix_search(substring):
+                    if token_match(substring):
+                        tokens.append(("TOKEN_MATCH", substring))
+                        substring = ''
+                        break
                     if substring in special_cases:
                         tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                         substring = ''
@@ -475,12 +481,12 @@ cdef class Tokenizer:
                             break
                         suffixes.append(("SUFFIX", substring[split:]))
                         substring = substring[:split]
-                if substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
-                    substring = ''
-                elif token_match(substring):
+                if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
                     offset = 0
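A hedged usage sketch of what the reordered checks mean in practice (the pattern and text are hypothetical, and the tokenizer below is built with only the default English prefix and suffix rules for brevity): because `token_match` is now consulted before any prefixes or suffixes are stripped, a custom pattern can protect strings that the affix rules would otherwise cut apart.

```python
import re

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    # Hypothetical pattern: keep "(=...=)" strings whole even though "(" and
    # ")" are also prefix/suffix characters.
    token_match=re.compile(r"^\(=[^\s=]+=\)$").match,
)

print([t.text for t in tokenizer("(=happy=) face!")])
# Expected: '(=happy=)' stays a single token because token_match is checked
# before affix splitting, while '!' is still split off 'face' as a suffix.
```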

website/docs/usage/linguistic-features.md

@@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
         suffixes = []
         while substring:
             while prefix_search(substring) or suffix_search(substring):
+                if token_match(substring):
+                    tokens.append(substring)
+                    substring = ''
+                    break
                 if substring in special_cases:
                     tokens.extend(special_cases[substring])
                     substring = ''
@@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
                     split = suffix_search(substring).start()
                     suffixes.append(substring[split:])
                     substring = substring[:split]
-            if substring in special_cases:
-                tokens.extend(special_cases[substring])
-                substring = ''
-            elif token_match(substring):
+            if token_match(substring):
                 tokens.append(substring)
                 substring = ''
+            elif substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ''
             elif list(infix_finditer(substring)):
                 infixes = infix_finditer(substring)
                 offset = 0
@@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
 The algorithm can be summarized as follows:
 1. Iterate over whitespace-separated substrings.
-2. Check whether we have an explicitly defined rule for this substring. If we
+2. Look for a token match. If there is a match, stop processing and keep this token.
+3. Check whether we have an explicitly defined rule for this substring. If we
    do, use it.
-3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
-   so that special cases always get priority.
-4. If we didn't consume a prefix, try to consume a suffix and then go back to
+4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
+   so that the token match and special cases always get priority.
+5. If we didn't consume a prefix, try to consume a suffix and then go back to
    #2.
-5. If we can't consume a prefix or a suffix, look for a special case.
-6. Next, look for a token match.
+6. If we can't consume a prefix or a suffix, look for a special case.
 7. Look for "infixes" — stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 8. Once we can't consume any more of the string, handle it as a single token.
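To see the summarized order on a concrete string, `tokenizer.explain()` (whose output mirrors the pseudocode above) reports the rule that produced each token; the input string here is illustrative:

```python
import spacy

nlp = spacy.blank("en")
for rule, token_text in nlp.tokenizer.explain('("http://example.com/about")'):
    print(rule, token_text)
# Expected shape of the output: '(' and '"' reported as PREFIX, the URL kept
# whole by the token match, and the closing '"' and ')' reported as SUFFIX.
```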