Move °[cfkCFK]. to a tokenizer exception

This commit is contained in:
Adriane Boyd 2021-10-11 13:08:07 +02:00
parent 96d50a3cb3
commit 97fb19d7d3
3 changed files with 13 additions and 1 deletions

View File

@ -21,7 +21,6 @@ TOKENIZER_SUFFIXES = (
+ ["'s", "'S", "s", "S", "", ""]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(

View File

@ -250,3 +250,10 @@ o.0
# Register every emoticon as a single-token exception.
for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]

# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds: "°c." / "°C." (and f/F, k/K) must split into the degree
# unit plus a trailing period.
for u in "cfk":
    for degree in (f"°{u}", f"°{u.upper()}"):
        BASE_EXCEPTIONS[f"{degree}."] = [{ORTH: degree}, {ORTH: "."}]

View File

@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
if sys.maxunicode >= 1114111:
tokens = tokenizer(text)
assert len(tokens) == length
def test_tokenizer_degree(tokenizer):
    """Degree abbreviations such as "°c." tokenize as the unit plus a period."""
    for unit in "cfkCFK":
        text = f"°{unit}."
        expected = [f"°{unit}", "."]
        # Both the actual tokenization and the explain() debug trace
        # should agree on the split.
        assert [token.text for token in tokenizer(text)] == expected
        assert [item[1] for item in tokenizer.explain(text)] == expected