	Ignore prefix in suffix matches (#9155)
* Ignore prefix in suffix matches
Ignore the currently matched prefix when looking for suffix matches in
the tokenizer. Otherwise, a lookbehind in the suffix pattern may match
incorrectly due to the presence of the prefix in the token string.
* Move °[cfkCFK]. to a tokenizer exception
* Adjust exceptions for same tokenization as v3.1
* Also update test accordingly
* Continue to split . after °CFK if ° is not a prefix
* Exclude new ° exceptions for pl
* Switch back to default tokenization of "° C ."
* Revert "Exclude new ° exceptions for pl"
This reverts commit 952013a5b4.
* Add exceptions for °C for hu
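
To illustrate the failure mode, here is a minimal sketch built around the prefix/suffix
patterns from the regression test added in this commit (Tokenizer, compile_prefix_regex
and compile_suffix_regex are the public spaCy APIs that test uses; the example string is
illustrative):

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer
    from spacy.util import compile_prefix_regex, compile_suffix_regex

    nlp = English()
    # Prefix "a", plus a suffix pattern whose lookbehind expects an "a" right before it.
    prefix_re = compile_prefix_regex(["a(?=.)"])
    suffix_re = compile_suffix_regex([r"(?<=\w)\.", r"(?<=a)\d+\."])
    tokenizer = Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    # Previously the suffix search still saw the already-matched prefix "a", so the
    # lookbehind (?<=a) fired and "10." was split off as a single suffix ("a", "10.").
    # With the prefix excluded from the suffix search, the expected split is restored:
    print([t.text for t in tokenizer("a10.")])  # ['a', '10', '.']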
			
			
parent 4170110ce7
commit 2ea9b58006

@@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 )
 
 
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

@@ -250,3 +250,9 @@ o.0
 
 for orth in emoticons:
     BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfkCFK":
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]

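Because °C/°F/°K are now special cases rather than suffix matches, the end-user
tokenization stays the same as in v3.1. A quick illustrative check, assuming a blank
English pipeline and a made-up input string:

    from spacy.lang.en import English

    nlp = English()
    # "°C." is handled by the tokenizer exception above instead of a suffix
    # lookbehind, but the resulting tokens match the v3.1 behaviour.
    print([t.text for t in nlp("20 °C.")])  # ['20', '°', 'C', '.']
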
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
     if sys.maxunicode >= 1114111:
         tokens = tokenizer(text)
         assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+    for u in "cfkCFK":
+        assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]

@@ -2,7 +2,7 @@ import pytest
 import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
-from spacy.util import ensure_path
+from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex
 from spacy.lang.en import English
 
 

@@ -212,3 +212,20 @@ def test_tokenizer_flush_specials(en_vocab):
     assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
     tokenizer1.rules = {}
     assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
+
+
+def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
+    # the prefix and suffix matches overlap in the suffix lookbehind
+    prefixes = ['a(?=.)']
+    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefix_re = compile_prefix_regex(prefixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        prefix_search=prefix_re.search,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("a10.")]
+    assert tokens == ["a", "10", "."]
+    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
+    assert tokens == explain_tokens

@@ -408,7 +408,7 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
-            suf_len = self.find_suffix(string)
+            suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
                 suffix = string[-suf_len:]
                 minus_suf = string[:-suf_len]
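
The one-line change above is the heart of the fix: the suffix is now searched only in
the part of the string left over after the matched prefix, so a suffix lookbehind can
no longer anchor on characters that already belong to the prefix. A standalone sketch
with plain re, reusing the lookbehind pattern from the new test:

    import re

    # Suffix pattern from the new test: the lookbehind expects an "a" before the digits.
    suffix_re = re.compile(r"(?<=a)\d+\.$")

    string = "a10."
    pre_len = 1  # length of the matched prefix "a"

    # Old behaviour: the search runs over the full string, the lookbehind sees the
    # prefix "a", and the whole of "10." is claimed as one suffix.
    print(suffix_re.search(string))            # <re.Match ...; match='10.'>

    # New behaviour: the prefix is excluded from the suffix search, the lookbehind
    # has nothing to anchor on, and the over-greedy match disappears.
    print(suffix_re.search(string[pre_len:]))  # None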