Merge remote-tracking branch 'upstream/master' into feature/el-framework

This commit is contained in:
svlandeg 2019-03-22 13:47:08 +01:00
commit ec3e860b44
4 changed files with 30 additions and 3 deletions

View File

@@ -4,7 +4,7 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.1.1"
__version__ = "2.1.2"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"

View File

@@ -11,6 +11,7 @@ from __future__ import unicode_literals
import os
import sys
import itertools
import ast
from thinc.neural.util import copy_array
@@ -150,3 +151,26 @@ def import_file(name, loc):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def unescape_unicode(string):
    """Turn escaped ``\\u`` / ``\\U`` sequences into real unicode characters.

    Python 2.7's re module chokes when compiling patterns that contain
    ranges between escaped unicode codepoints unknown to the unicode
    database, e.g.::

        re.compile('[\\uAA77-\\uAA79]').findall("hello")

    ends up matching every character on Python 2. The problem does not
    occur with real unicode literals, so we convert the escapes here.
    Returns None unchanged.
    """
    if string is None:
        return None
    # Double every backslash so that all *other* escape sequences survive
    # the round-trip through literal_eval unchanged...
    protected = string.replace("\\", "\\\\")
    # ...then undo the doubling for the unicode escapes only, so those are
    # the only sequences literal_eval will actually interpret.
    unicode_only = protected.replace("\\\\u", "\\u").replace("\\\\U", "\\U")
    # literal_eval works purely at the representational level -- it cannot
    # execute code, unlike eval().
    return ast.literal_eval("u'''" + unicode_only + "'''")

View File

@@ -65,7 +65,6 @@ prefix_search = (
if compat.is_python2:
# If we have this test in Python 3, pytest chokes, as it can't print the
# string above in the xpass message.
@pytest.mark.xfail
def test_issue3356():
pattern = re.compile(prefix_search.decode("utf8"))
pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
assert not pattern.search(u"hello")

View File

@@ -14,6 +14,7 @@ import re
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .compat import unescape_unicode
from .errors import Errors, Warnings, deprecation_warning
from . import util
@@ -428,6 +429,9 @@ cdef class Tokenizer:
))
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for key in ["prefix_search", "suffix_search", "infix_finditer"]:
if key in data:
data[key] = unescape_unicode(data[key])
if data.get("prefix_search"):
self.prefix_search = re.compile(data["prefix_search"]).search
if data.get("suffix_search"):