diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 2ec8c3e43..17e20fa0c 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,14 +1,13 @@ # coding: utf8 from __future__ import unicode_literals -from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS -from .char_classes import HYPHENS -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY +from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA _prefixes = ( - ["§", "%", "=", r"\+(?![0-9])"] + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES @@ -22,13 +21,15 @@ _suffixes = ( + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS - + ["'s", "'S", "’s", "’S"] + + ["'s", "'S", "’s", "’S", "—", "–"] + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), - r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), + r"(?<=[0-9{al}{e}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES + ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), ] ) @@ -40,8 +41,8 @@ _infixes = ( r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), - r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py new file mode 100644 index 000000000..88ea67774 --- /dev/null +++ b/spacy/tests/regression/test_issue3277.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +def test_issue3277(es_tokenizer): + """Test that hyphens are 
split correctly as prefixes and suffixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013"