From c31a9dabd53de47aa3bda065d95944bb61ffec78 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 15 Feb 2019 10:29:59 +0100
Subject: [PATCH] 💫 Add en/em dash to prefixes and suffixes (#3281)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Auto-format

* Add en/em dash to prefixes and suffixes
---
 spacy/lang/punctuation.py                | 17 +++++++++--------
 spacy/tests/regression/test_issue3277.py | 11 +++++++++++
 2 files changed, 20 insertions(+), 8 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue3277.py

diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index 2ec8c3e43..17e20fa0c 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,14 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
-from .char_classes import HYPHENS
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 
 
 _prefixes = (
-    ["§", "%", "=", r"\+(?![0-9])"]
+    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
     + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
@@ -22,13 +21,15 @@ _suffixes = (
     + LIST_ELLIPSES
     + LIST_QUOTES
     + LIST_ICONS
-    + ["'s", "'S", "’s", "’S"]
+    + ["'s", "'S", "’s", "’S", "—", "–"]
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
 )
@@ -40,8 +41,8 @@ _infixes = (
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
-        r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py
new file mode 100644
index 000000000..88ea67774
--- /dev/null
+++ b/spacy/tests/regression/test_issue3277.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+def test_issue3277(es_tokenizer):
+    """Test that en/em dashes are split off correctly as prefixes and suffixes."""
+    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
+    assert len(doc) == 14
+    assert doc[0].text == "\u2014"
+    assert doc[5].text == "\u2013"
+    assert doc[9].text == "\u2013"
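
Not part of the patch itself: a minimal sketch of how the new behaviour can be checked outside the test suite, assuming a spaCy install that already includes this change. It mirrors the regression test above but uses the public Spanish language class instead of the es_tokenizer test fixture; the sample sentence and token indices are taken straight from test_issue3277.

    # Minimal sketch (assumption: a spaCy build with this patch applied).
    # The shared punctuation rules are picked up by the Spanish tokenizer,
    # so the leading em dash and both en dashes become their own tokens.
    from spacy.lang.es import Spanish

    nlp = Spanish()
    doc = nlp("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    print([t.text for t in doc])

    assert len(doc) == 14
    assert doc[0].text == "\u2014"  # em dash split off as a prefix
    assert doc[5].text == "\u2013"  # en dash split off as a prefix
    assert doc[9].text == "\u2013"  # en dash split off as a suffix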