💫 Add en/em dash to prefixes and suffixes (#3281)

* Auto-format

* Add en/em dash to prefixes and suffixes
This commit is contained in:
Ines Montani 2019-02-15 10:29:59 +01:00 committed by Matthew Honnibal
parent 5651a0d052
commit c31a9dabd5
2 changed files with 20 additions and 8 deletions

View File

@ -1,14 +1,13 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import HYPHENS from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_prefixes = ( _prefixes = (
["§", "%", "=", r"\+(?![0-9])"] ["§", "%", "=", "", "", r"\+(?![0-9])"]
+ LIST_PUNCT + LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
@ -22,13 +21,15 @@ _suffixes = (
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
+ LIST_ICONS + LIST_ICONS
+ ["'s", "'S", "s", "S"] + ["'s", "'S", "s", "S", "", ""]
+ [ + [
r"(?<=[0-9])\+", r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.", r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
] ]
) )
@ -40,8 +41,8 @@ _infixes = (
r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA), r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
def test_issue3277(es_tokenizer):
"""Test that hyphens are split correctly as prefixes."""
doc = es_tokenizer("—Yo me llamo... murmuró el niño Emilio Sánchez Pérez.")
assert len(doc) == 14
assert doc[0].text == "\u2014"
assert doc[5].text == "\u2013"
assert doc[9].text == "\u2013"