From 97fb19d7d3b13255ff1e1d0fe2db4dcd3d2c62e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 11 Oct 2021 13:08:07 +0200
Subject: [PATCH] =?UTF-8?q?Move=20=C2=B0[cfkCFK].=20to=20a=20tokenizer=20e?=
 =?UTF-8?q?xception?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spacy/lang/punctuation.py                | 1 -
 spacy/lang/tokenizer_exceptions.py       | 7 +++++++
 spacy/tests/tokenizer/test_exceptions.py | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index e712e71d6..64165907f 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -21,7 +21,6 @@ TOKENIZER_SUFFIXES = (
     + ["'s", "'S", "’s", "’S", "—", "–"]
     + [
         r"(?<=[0-9])\+",
-        r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index e41db911f..e505827b8 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -250,3 +250,10 @@ o.0
 
 for orth in emoticons:
     BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfk":
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+    BASE_EXCEPTIONS[f"°{u.upper()}."] = [{ORTH: f"°{u.upper()}"}, {ORTH: "."}]
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index 9a98e049e..1f5852572 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
     if sys.maxunicode >= 1114111:
         tokens = tokenizer(text)
         assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+    for u in "cfkCFK":
+        assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]