mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
💫 Add en/em dash to prefixes and suffixes (#3281)
* Auto-format
* Add en/em dash to prefixes and suffixes
This commit is contained in:
parent
5651a0d052
commit
c31a9dabd5
|
@ -1,14 +1,13 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
|
||||
from .char_classes import HYPHENS
|
||||
from .char_classes import CURRENCY, UNITS
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
||||
|
||||
_prefixes = (
|
||||
["§", "%", "=", r"\+(?![0-9])"]
|
||||
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
|
||||
+ LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
|
@ -22,13 +21,15 @@ _suffixes = (
|
|||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_ICONS
|
||||
+ ["'s", "'S", "’s", "’S"]
|
||||
+ ["'s", "'S", "’s", "’S", "—", "–"]
|
||||
+ [
|
||||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
|
||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||
]
|
||||
)
|
||||
|
@ -40,8 +41,8 @@ _infixes = (
|
|||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
11
spacy/tests/regression/test_issue3277.py
Normal file
11
spacy/tests/regression/test_issue3277.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def test_issue3277(es_tokenizer):
    """Check that em and en dashes attached to words become separate tokens.

    The sample sentence has a leading em dash, an en dash directly before a
    word and an en dash directly after a word.
    """
    text = "—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez."
    tokens = es_tokenizer(text)
    assert len(tokens) == 14
    # (token index, expected token text) for each dash in the sentence.
    expected_dashes = (
        (0, "\u2014"),
        (5, "\u2013"),
        (9, "\u2013"),
    )
    for index, dash in expected_dashes:
        assert tokens[index].text == dash
|
Loading…
Reference in New Issue
Block a user