From 97fb19d7d3b13255ff1e1d0fe2db4dcd3d2c62e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 11 Oct 2021 13:08:07 +0200
Subject: [PATCH] =?UTF-8?q?Move=20=C2=B0[cfkCFK].=20to=20a=20tokenizer=20e?=
 =?UTF-8?q?xception?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spacy/lang/punctuation.py                | 1 -
 spacy/lang/tokenizer_exceptions.py       | 7 +++++++
 spacy/tests/tokenizer/test_exceptions.py | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index e712e71d6..64165907f 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -21,7 +21,6 @@ TOKENIZER_SUFFIXES = (
     + ["'s", "'S", "’s", "’S", "—", "–"]
     + [
         r"(?<=[0-9])\+",
-        r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index e41db911f..e505827b8 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -250,3 +250,10 @@ o.0
 
 for orth in emoticons:
     BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfk":
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+    BASE_EXCEPTIONS[f"°{u.upper()}."] = [{ORTH: f"°{u.upper()}"}, {ORTH: "."}]
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index 9a98e049e..1f5852572 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
     if sys.maxunicode >= 1114111:
         tokens = tokenizer(text)
         assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+    for u in "cfkCFK":
+        assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]