From e65b5bb9a027af0c290c41a90b89728e0e77f9ae Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 Mar 2019 13:42:47 +0100
Subject: [PATCH] Fix tokenizer on Python2.7 (#3460)

spaCy v2.1 switched to the built-in re module, whereas v2.0 had been using
the third-party regex library. When the tokenizer was deserialized on
Python2.7, the `re.compile()` function was called with expressions that
featured escaped unicode codepoints that were not in Python2.7's unicode
database.

Problems occurred when we had a range between two of these unknown
codepoints, like this:

```
'[\\uAA77-\\uAA79]'
```

On Python2.7, the unknown codepoints are not unescaped correctly, resulting
in arbitrary out-of-range characters being matched by the expression. This
problem does not occur if we instead have a range between two unicode
literals, rather than the escape sequences.

To fix the bug, we therefore add a new compat function that unescapes
unicode sequences using the `ast.literal_eval()` function. Care is taken
to ensure we do not also unescape non-unicode sequences.
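For illustration, the new helper is expected to behave roughly as follows
once the patch is applied (a sketch, not part of the patch itself; the
example pattern is the one from the bug report):

```
from spacy.compat import unescape_unicode

# The serialized pattern arrives with escaped codepoints.
pattern = u"[\\uAA77-\\uAA79]"

# After unescaping, the escape sequences become actual unicode literals,
# so re.compile() builds the intended three-character range on Python2.7.
assert unescape_unicode(pattern) == u"[\uaa77-\uaa79]"

# Other backslash sequences (e.g. \d) are protected and left untouched.
assert unescape_unicode(u"\\d+") == u"\\d+"

# None passes through unchanged, since the helper guards against
# missing patterns.
assert unescape_unicode(None) is None
```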
Closes #3356.

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 spacy/compat.py                          | 24 ++++++++++++++++++++++++
 spacy/tests/regression/test_issue3356.py |  3 +--
 spacy/tokenizer.pyx                      |  4 ++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 8af49f254..997e8787b 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import itertools
+import ast
 
 from thinc.neural.util import copy_array
 
@@ -150,3 +151,26 @@ def import_file(name, loc):
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
+def unescape_unicode(string):
+    """Python2.7's re module chokes when compiling patterns that have ranges
+    between escaped unicode codepoints if the two codepoints are unrecognised
+    in the unicode database. For instance:
+
+        re.compile('[\\uAA77-\\uAA79]').findall("hello")
+
+    Ends up matching every character (on Python 2). This problem doesn't occur
+    if we're dealing with unicode literals.
+    """
+    if string is None:
+        return string
+    # We only want to unescape the unicode, so we first must protect the other
+    # backslashes.
+    string = string.replace("\\", "\\\\")
+    # Now we remove that protection for the unicode.
+    string = string.replace("\\\\u", "\\u")
+    string = string.replace("\\\\U", "\\U")
+    # Now we unescape by evaling the string with the AST. This can't execute
+    # code -- it only does the representational level.
+    return ast.literal_eval("u'''" + string + "'''")
diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py
index c14fa8525..4e27055c7 100644
--- a/spacy/tests/regression/test_issue3356.py
+++ b/spacy/tests/regression/test_issue3356.py
@@ -65,7 +65,6 @@ prefix_search = (
 if compat.is_python2:
     # If we have this test in Python 3, pytest chokes, as it can't print the
     # string above in the xpass message.
-    @pytest.mark.xfail
     def test_issue3356():
-        pattern = re.compile(prefix_search.decode("utf8"))
+        pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
         assert not pattern.search(u"hello")
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 86c2d6ad3..e390a72b9 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -14,6 +14,7 @@ import re
 
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
+from .compat import unescape_unicode
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
 
@@ -428,6 +429,9 @@ cdef class Tokenizer:
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
+        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
+            if key in data:
+                data[key] = unescape_unicode(data[key])
         if data.get("prefix_search"):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if data.get("suffix_search"):