Mirror of https://github.com/explosion/spaCy.git
Continue to split . after °CFK if ° is not a prefix
This commit is contained in:
parent c710abe6d7
commit ee6acd7e9d
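For orientation, a minimal sketch of the behaviour the change targets, assuming a spaCy build that already contains this commit and using a blank English pipeline purely as an example:

import spacy

# Illustrative only: mirrors what the updated test below asserts.
nlp = spacy.blank("en")
for u in "cfkCFK":
    # "°C", "°F", "°K" stay whole; the trailing "." is split off.
    print([t.text for t in nlp.tokenizer(f"°{u}.")])  # expected: [f"°{u}", "."]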
@@ -21,6 +21,7 @@ TOKENIZER_SUFFIXES = (
     + ["'s", "'S", "’s", "’S", "—", "–"]
     + [
         r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
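For reference, a small standalone sketch (plain re, outside spaCy) of what the new lookbehind suffix pattern matches; the sample strings are made up for illustration:

import re

# Split a trailing "." only when it immediately follows °C, °F or °K
# (either letter case), i.e. when "°" was not stripped off as a prefix.
degree_period = re.compile(r"(?<=°[FfCcKk])\.")

print(degree_period.search("°C."))    # matches the final "."
print(degree_period.search("20°f."))  # matches: lookbehind sees "°f"
print(degree_period.search("No."))    # None: no degree sign before the "."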
@@ -255,4 +255,5 @@ for orth in emoticons:
 # Moved from a suffix setting due to #9155 removing prefixes from consideration
 # for lookbehinds
 for u in "cfkCFK":
-    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
+    BASE_EXCEPTIONS[f"°{u}"] = [{ORTH: f"°{u}"}]
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
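As an aside, the BASE_EXCEPTIONS entries above end up as tokenizer special cases in every language's defaults; here is a hedged sketch of the equivalent effect through the public add_special_case API, with a blank English pipeline assumed for illustration:

import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("en")

# Same shape as the new exceptions, written out for a single unit letter:
# "°C" stays one token, and "°C." tokenizes as "°C" followed by ".".
nlp.tokenizer.add_special_case("°C", [{ORTH: "°C"}])
nlp.tokenizer.add_special_case("°C.", [{ORTH: "°C"}, {ORTH: "."}])

print([t.text for t in nlp.tokenizer("°C.")])  # expected: ["°C", "."]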
@@ -49,5 +49,5 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
 
 def test_tokenizer_degree(tokenizer):
     for u in "cfkCFK":
-        assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
-        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]
+        assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."]
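For context on the new assertions, Tokenizer.explain yields (rule label, substring) pairs, which is why the test compares t[1] rather than the whole tuple; a quick sketch, again assuming a blank English pipeline:

import spacy

nlp = spacy.blank("en")

# Each pair names the pattern or special case that produced the piece of text.
for rule, text in nlp.tokenizer.explain("°C."):
    print(rule, repr(text))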