From ee6acd7e9db41c8c8072a8f25eb20d39c26e5802 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 13 Oct 2021 15:31:26 +0200 Subject: [PATCH] =?UTF-8?q?Continue=20to=20split=20.=20after=20=C2=B0CFK?= =?UTF-8?q?=20if=20=C2=B0=20is=20not=20a=20prefix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spacy/lang/punctuation.py | 1 + spacy/lang/tokenizer_exceptions.py | 3 ++- spacy/tests/tokenizer/test_exceptions.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 64165907f..e712e71d6 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -21,6 +21,7 @@ TOKENIZER_SUFFIXES = ( + ["'s", "'S", "’s", "’S", "—", "–"] + [ r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index d76fe4262..18a61d89d 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -255,4 +255,5 @@ for orth in emoticons: # Moved from a suffix setting due to #9155 removing prefixes from consideration # for lookbehinds for u in "cfkCFK": - BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}] + BASE_EXCEPTIONS[f"°{u}"] = [{ORTH: f"°{u}"}] + BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}] diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 85716377a..1f5852572 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -49,5 +49,5 @@ def test_tokenizer_handles_emoji(tokenizer, text, length): def test_tokenizer_degree(tokenizer): for u in "cfkCFK": - assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."] - assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."] + assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."] + assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]