Merge branch 'master' into spacy.io

Ines Montani 2019-03-22 15:17:51 +01:00
commit 680eafab94
9 changed files with 276 additions and 18 deletions

View File

@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
thinc>=7.0.2,<7.1.0
blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.1.3,<1.1.0
wasabi>=0.2.0,<1.1.0
srsly>=0.0.5,<1.1.0
# Third party dependencies
numpy>=1.15.0

View File

@@ -232,7 +232,7 @@ def setup_package():
"plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.12,<1.1.0",
"wasabi>=0.2.0,<1.1.0",
"srsly>=0.0.5,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],

View File

@@ -4,7 +4,7 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.1.1"
__version__ = "2.1.2"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"

View File

@@ -11,6 +11,7 @@ from __future__ import unicode_literals
import os
import sys
import itertools
import ast
from thinc.neural.util import copy_array
@@ -150,3 +151,26 @@ def import_file(name, loc):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def unescape_unicode(string):
"""Python2.7's re module chokes when compiling patterns that have ranges
between escaped unicode codepoints if the two codepoints are unrecognised
in the unicode database. For instance:
re.compile('[\\uAA77-\\uAA79]').findall("hello")
Ends up matching every character (on Python 2). This problem doesn't occur
if we're dealing with unicode literals.
"""
if string is None:
return string
# We only want to unescape the unicode, so we first must protect the other
# backslashes.
string = string.replace("\\", "\\\\")
# Now we remove that protection for the unicode.
string = string.replace("\\\\u", "\\u")
string = string.replace("\\\\U", "\\U")
# Now we unescape by evaluating the string as a unicode literal with
# ast.literal_eval. This can't execute code -- it only evaluates the
# literal representation.
return ast.literal_eval("u'''" + string + "'''")
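A minimal usage sketch of the new helper (not part of the diff; the escaped pattern is a hypothetical value, and the import path assumes the function lives in spacy.compat, as the tokenizer import further below suggests):

from spacy.compat import unescape_unicode

escaped = "[\\uAA77-\\uAA79]"         # backslash escapes, e.g. as read from serialized tokenizer data
literal = unescape_unicode(escaped)   # the escapes become real codepoints U+AA77 and U+AA79
assert literal == u"[\uAA77-\uAA79]"  # now safe to pass to re.compile on Python 2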

View File

@@ -1,13 +1,97 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import LEMMA, PRON_LEMMA
from ...symbols import LEMMA, PRON_LEMMA, AUX
_subordinating_conjunctions = [
"that",
"if",
"as",
"because",
"of",
"for",
"before",
"in",
"while",
"after",
"since",
"like",
"with",
"so",
"to",
"by",
"on",
"about",
"than",
"whether",
"although",
"from",
"though",
"until",
"unless",
"once",
"without",
"at",
"into",
"cause",
"over",
"upon",
"till",
"whereas",
"beyond",
"whilst",
"except",
"despite",
"wether",
"then",
"but",
"becuse",
"whie",
"below",
"against",
"it",
"w/out",
"toward",
"albeit",
"save",
"besides",
"becouse",
"coz",
"til",
"ask",
"i'd",
"out",
"near",
"seince",
"towards",
"tho",
"sice",
"will",
]
_relative_pronouns = ["this", "that", "those", "these"]
MORPH_RULES = {
"DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
"IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
"NN": {
"something": {"POS": "PRON"},
"anyone": {"POS": "PRON"},
"anything": {"POS": "PRON"},
"nothing": {"POS": "PRON"},
"someone": {"POS": "PRON"},
"everything": {"POS": "PRON"},
"everyone": {"POS": "PRON"},
"everybody": {"POS": "PRON"},
"nobody": {"POS": "PRON"},
"somebody": {"POS": "PRON"},
"anybody": {"POS": "PRON"},
"any1": {"POS": "PRON"},
},
"PRP": {
"I": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
@@ -15,14 +99,16 @@ MORPH_RULES = {
},
"me": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
"Case": "Acc",
},
"you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
"you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
"he": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -31,6 +117,7 @@ MORPH_RULES = {
},
"him": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -39,6 +126,7 @@ MORPH_RULES = {
},
"she": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -47,6 +135,7 @@ MORPH_RULES = {
},
"her": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -55,6 +144,7 @@ MORPH_RULES = {
},
"it": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -62,6 +152,7 @@ MORPH_RULES = {
},
"we": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
@@ -69,6 +160,7 @@ MORPH_RULES = {
},
"us": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
@@ -76,6 +168,7 @@ MORPH_RULES = {
},
"they": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
@@ -83,6 +176,7 @@ MORPH_RULES = {
},
"them": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
@@ -90,6 +184,7 @@ MORPH_RULES = {
},
"mine": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
@@ -98,6 +193,7 @@ MORPH_RULES = {
},
"his": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -107,6 +203,7 @@ MORPH_RULES = {
},
"hers": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -116,6 +213,7 @@ MORPH_RULES = {
},
"its": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -125,6 +223,7 @@ MORPH_RULES = {
},
"ours": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
@@ -133,6 +232,7 @@ MORPH_RULES = {
},
"yours": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Number": "Plur",
@@ -141,6 +241,7 @@ MORPH_RULES = {
},
"theirs": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
@@ -149,6 +250,7 @@ MORPH_RULES = {
},
"myself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Sing",
@@ -157,6 +259,7 @@ MORPH_RULES = {
},
"yourself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Case": "Acc",
@@ -164,6 +267,7 @@ MORPH_RULES = {
},
"himself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -173,6 +277,7 @@ MORPH_RULES = {
},
"herself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -182,6 +287,7 @@ MORPH_RULES = {
},
"itself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -191,6 +297,7 @@ MORPH_RULES = {
},
"themself": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Sing",
@@ -199,6 +306,7 @@ MORPH_RULES = {
},
"ourselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "One",
"Number": "Plur",
@@ -207,6 +315,7 @@ MORPH_RULES = {
},
"yourselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Two",
"Case": "Acc",
@@ -214,6 +323,7 @@ MORPH_RULES = {
},
"themselves": {
LEMMA: PRON_LEMMA,
"POS": "PRON",
"PronType": "Prs",
"Person": "Three",
"Number": "Plur",
@@ -269,9 +379,17 @@ MORPH_RULES = {
"Poss": "Yes",
},
},
"RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "n’t"]},
"VB": {
word: {"POS": "AUX"}
for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
},
"VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
"VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
"VBZ": {
"am": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "One",
"Tense": "Pres",
@@ -279,6 +397,7 @@ MORPH_RULES = {
},
"are": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Two",
"Tense": "Pres",
@@ -286,6 +405,7 @@ MORPH_RULES = {
},
"is": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Three",
"Tense": "Pres",
@@ -293,6 +413,7 @@ MORPH_RULES = {
},
"'re": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Two",
"Tense": "Pres",
@@ -300,26 +421,65 @@ MORPH_RULES = {
},
"'s": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "Three",
"Tense": "Pres",
"Mood": "Ind",
},
"has": {LEMMA: "have", "POS": "AUX"},
"does": {LEMMA: "do", "POS": "AUX"},
},
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"are": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Pres",
"Mood": "Ind",
},
"'re": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Pres",
"Mood": "Ind",
},
"am": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Person": "One",
"Tense": "Pres",
"Mood": "Ind",
},
"do": {"POS": "AUX"},
"have": {"POS": "AUX"},
"'m": {"POS": "AUX", LEMMA: "be"},
"'ve": {"POS": "AUX"},
"'re": {"POS": "AUX", LEMMA: "be"},
"'s": {"POS": "AUX"},
"is": {"POS": "AUX"},
"'d": {"POS": "AUX"},
},
"VBD": {
"was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"},
"was": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Past",
"Number": "Sing",
},
"were": {
LEMMA: "be",
"POS": "AUX",
"VerbForm": "Fin",
"Tense": "Past",
"Number": "Plur",
},
"did": {LEMMA: "do", "POS": "AUX"},
"had": {LEMMA: "have", "POS": "AUX"},
"'d": {LEMMA: "have", "POS": "AUX"},
},
}
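A quick sanity check of the overrides above (a sketch against the dict defined in this file, not part of the diff): the rules map a fine-grained tag plus a word form to the attributes the tagger will set, so the new AUX and SCONJ assignments can be read off directly.

assert MORPH_RULES["VBZ"]["has"] == {LEMMA: "have", "POS": "AUX"}
assert MORPH_RULES["IN"]["because"] == {"POS": "SCONJ"}
assert MORPH_RULES["RB"]["n't"] == {"POS": "PART"}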

View File

@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
TAG_MAP = {
@@ -20,15 +20,15 @@ TAG_MAP = {
"CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},
"EX": {POS: PRON, "AdvType": "ex"},
"FW": {POS: X, "Foreign": "yes"},
"HYPH": {POS: PUNCT, "PunctType": "dash"},
"IN": {POS: ADP},
"JJ": {POS: ADJ, "Degree": "pos"},
"JJR": {POS: ADJ, "Degree": "comp"},
"JJS": {POS: ADJ, "Degree": "sup"},
"LS": {POS: PUNCT, "NumType": "ord"},
"MD": {POS: VERB, "VerbType": "mod"},
"LS": {POS: X, "NumType": "ord"},
"MD": {POS: AUX, "VerbType": "mod"},
"NIL": {POS: ""},
"NN": {POS: NOUN, "Number": "sing"},
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
@@ -37,11 +37,11 @@ TAG_MAP = {
"PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
"PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
"RB": {POS: ADV, "Degree": "pos"},
"RBR": {POS: ADV, "Degree": "comp"},
"RBS": {POS: ADV, "Degree": "sup"},
"RP": {POS: PART},
"RP": {POS: ADP},
"SP": {POS: SPACE},
"SYM": {POS: SYM},
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
@@ -58,9 +58,9 @@ TAG_MAP = {
"Number": "sing",
"Person": 3,
},
"WDT": {POS: DET, "PronType": "int|rel"},
"WDT": {POS: PRON, "PronType": "int|rel"},
"WP": {POS: PRON, "PronType": "int|rel"},
"WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"},
"WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"},
"ADD": {POS: X},
"NFP": {POS: PUNCT},

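The practical effect of the MD and WDT changes can be sketched as lookups against the full TAG_MAP defined in this file (illustrative assertions, not part of the diff):

assert TAG_MAP["MD"] == {POS: AUX, "VerbType": "mod"}        # modals now map to UD AUX rather than VERB
assert TAG_MAP["WDT"] == {POS: PRON, "PronType": "int|rel"}  # wh-determiners now map to PRON rather than DET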
View File

@@ -0,0 +1,70 @@
import pytest
import re
from ... import compat
prefix_search = (
b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])"
b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?"
b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}"
b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|"
b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|"
b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|"
b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|"
b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|"
b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|"
b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|"
b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|"
b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|"
b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|"
b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|"
b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F"
b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8"
b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17"
b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC"
b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940"
b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103"
b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125"
b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F"
b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4"
b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5"
b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B"
b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440"
b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2"
b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800"
b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76"
b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80"
b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004"
b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191"
b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250"
b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
b"\\U0001FA60-\\U0001FA6D]"
)
if compat.is_python2:
# If we have this test in Python 3, pytest chokes, as it can't print the
# string above in the xpass message.
def test_issue3356():
pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
assert not pattern.search(u"hello")

View File

@@ -14,6 +14,7 @@ import re
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .compat import unescape_unicode
from .errors import Errors, Warnings, deprecation_warning
from . import util
@@ -428,6 +429,9 @@ cdef class Tokenizer:
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer"]:
if key in data:
data[key] = unescape_unicode(data[key])
if data.get("prefix_search"):
self.prefix_search = re.compile(data["prefix_search"]).search
if data.get("suffix_search"):
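The unescaping hook above sits in the tokenizer's deserialization path, so a plain bytes round trip should exercise it; a hedged sketch, assuming a blank English pipeline (not taken from the diff):

import spacy

nlp = spacy.blank("en")
tokenizer_bytes = nlp.tokenizer.to_bytes()
nlp.tokenizer.from_bytes(tokenizer_bytes)  # prefix/suffix/infix patterns are unescaped before re.compile
assert [t.text for t in nlp("hello world")] == ["hello", "world"]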

View File

@@ -218,7 +218,7 @@ const Landing = ({ data }) => {
<H2>Benchmarks</H2>
<p>
In 2015, independent researchers from Emory University and Yahoo! Labs
showed that spaCy offered the
showed that spaCy offered the{' '}
<strong>fastest syntactic parser in the world</strong> and that its accuracy
was <strong>within 1% of the best</strong> available (
<Link to="https://aclweb.org/anthology/P/P15/P15-1038.pdf">