Continue to split "." after °C/°F/°K if ° is not a prefix

2025-10-18 09:44:16 +03:00 · 2021-10-13 15:31:26 +02:00 · 2021-10-13 15:31:26 +02:00 · ee6acd7e9d
commit ee6acd7e9d
parent c710abe6d7
3 changed files with 5 additions and 3 deletions
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -21,6 +21,7 @@ TOKENIZER_SUFFIXES = (
    + ["'s", "'S", "’s", "’S", "—", "–"]
    + [
        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -255,4 +255,5 @@ for orth in emoticons:
 # Moved from a suffix setting due to #9155 removing prefixes from consideration
 # for lookbehinds
 for u in "cfkCFK":
-    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
+    BASE_EXCEPTIONS[f"°{u}"] = [{ORTH: f"°{u}"}]
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -49,5 +49,5 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):

 def test_tokenizer_degree(tokenizer):
    for u in "cfkCFK":
-        assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
-        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]
+        assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]