From c31a9dabd53de47aa3bda065d95944bb61ffec78 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 15 Feb 2019 10:29:59 +0100
Subject: [PATCH] 💫 Add en/em dash to prefixes and suffixes (#3281)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Auto-format

* Add en/em dash to prefixes and suffixes
---
 spacy/lang/punctuation.py                | 17 +++++++++--------
 spacy/tests/regression/test_issue3277.py | 11 +++++++++++
 2 files changed, 20 insertions(+), 8 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue3277.py

diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index 2ec8c3e43..17e20fa0c 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,14 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
-from .char_classes import HYPHENS
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 
 
 _prefixes = (
-    ["§", "%", "=", r"\+(?![0-9])"]
+    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
     + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
@@ -22,13 +21,15 @@ _suffixes = (
     + LIST_ELLIPSES
     + LIST_QUOTES
     + LIST_ICONS
-    + ["'s", "'S", "’s", "’S"]
+    + ["'s", "'S", "’s", "’S", "—", "–"]
     + [
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
 )
@@ -40,8 +41,8 @@ _infixes = (
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
-        r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
 
diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py
new file mode 100644
index 000000000..88ea67774
--- /dev/null
+++ b/spacy/tests/regression/test_issue3277.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+def test_issue3277(es_tokenizer):
+    """Test that en/em dashes are split off correctly as prefixes and suffixes."""
+    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
+    assert len(doc) == 14
+    assert doc[0].text == "\u2014"
+    assert doc[5].text == "\u2013"
+    assert doc[9].text == "\u2013"
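
Not part of the patch itself: a minimal sketch of how the new behaviour can be checked outside the test suite, assuming a spaCy install that already includes this change. It mirrors the regression test above but uses the public Spanish language class instead of the es_tokenizer test fixture; the sample sentence and token indices are taken straight from test_issue3277.

    # Minimal sketch (assumption: a spaCy build with this patch applied).
    # The shared punctuation rules are picked up by the Spanish tokenizer,
    # so the leading em dash and both en dashes become their own tokens.
    from spacy.lang.es import Spanish

    nlp = Spanish()
    doc = nlp("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    print([t.text for t in doc])

    assert len(doc) == 14
    assert doc[0].text == "\u2014"  # em dash split off as a prefix
    assert doc[5].text == "\u2013"  # en dash split off as a prefix
    assert doc[9].text == "\u2013"  # en dash split off as a suffix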