mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-04 20:03:13 +03:00
Move °[cfkCFK]. to a tokenizer exception
This commit is contained in:
parent
96d50a3cb3
commit
97fb19d7d3
|
@ -21,7 +21,6 @@ TOKENIZER_SUFFIXES = (
|
||||||
+ ["'s", "'S", "’s", "’S", "—", "–"]
|
+ ["'s", "'S", "’s", "’S", "—", "–"]
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||||
|
|
|
@ -250,3 +250,10 @@ o.0
|
||||||
|
|
||||||
for orth in emoticons:
|
for orth in emoticons:
|
||||||
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
# Degree units followed by a period ("°c.", "°F.", ...) are registered as
# explicit exceptions that split into the unit and the period. This was
# previously a suffix rule; it moved here because #9155 removed prefixes
# from consideration for lookbehinds.
for u in "cfk":
    # Register the lowercase form first, then the uppercase one.
    for unit in (f"°{u}", f"°{u.upper()}"):
        BASE_EXCEPTIONS[f"{unit}."] = [{ORTH: unit}, {ORTH: "."}]
|
||||||
|
|
|
@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
|
||||||
if sys.maxunicode >= 1114111:
|
if sys.maxunicode >= 1114111:
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_degree(tokenizer):
    """Check that "°<unit>." is split into the degree unit and a period.

    Covers the units c, f and k in both lowercase and uppercase, and
    verifies that the second item of each ``tokenizer.explain`` entry
    matches the same split.
    """
    for unit in "cfkCFK":
        text = f"°{unit}."
        expected = [f"°{unit}", "."]
        token_texts = [token.text for token in tokenizer(text)]
        assert token_texts == expected
        explained = [entry[1] for entry in tokenizer.explain(text)]
        assert explained == expected
|
||||||
|
|
Loading…
Reference in New Issue
Block a user