Continue to split "." after °C, °F and °K when "°" is not treated as a prefix

This commit is contained in:
Adriane Boyd 2021-10-13 15:31:26 +02:00
parent c710abe6d7
commit ee6acd7e9d
3 changed files with 5 additions and 3 deletions

View File

@@ -21,6 +21,7 @@ TOKENIZER_SUFFIXES = (
+ ["'s", "'S", "’s", "’S", "—", "–"] + ["'s", "'S", "’s", "’S", "—", "–"]
+ [ + [
r"(?<=[0-9])\+", r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(

View File

@@ -255,4 +255,5 @@ for orth in emoticons:
# Moved from a suffix setting due to #9155 removing prefixes from consideration # Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds # for lookbehinds
for u in "cfkCFK": for u in "cfkCFK":
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}] BASE_EXCEPTIONS[f"°{u}"] = [{ORTH: f"°{u}"}]
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]

View File

@@ -49,5 +49,5 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
def test_tokenizer_degree(tokenizer): def test_tokenizer_degree(tokenizer):
for u in "cfkCFK": for u in "cfkCFK":
assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."] assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."]
assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."] assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]