From e65b5bb9a027af0c290c41a90b89728e0e77f9ae Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 Mar 2019 13:42:47 +0100
Subject: [PATCH] Fix tokenizer on Python2.7 (#3460)

spaCy v2.1 switched to the built-in re module, whereas v2.0 had been using
the third-party regex library. When the tokenizer was deserialized on
Python2.7, the `re.compile()` function was called with expressions that
featured escaped unicode codepoints that were not in Python2.7's unicode
database.

Problems occurred when we had a range between two of these unknown
codepoints, like this:

```
'[\\uAA77-\\uAA79]'
```

On Python2.7, the unknown codepoints are not unescaped correctly, resulting
in arbitrary out-of-range characters being matched by the expression. This
problem does not occur if we instead have a range between two unicode
literals, rather than the escape sequences.

To fix the bug, we therefore add a new compat function that unescapes
unicode sequences using the `ast.literal_eval()` function. Care is taken
to ensure we do not also unescape non-unicode sequences.
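For illustration, the new helper is expected to behave roughly as follows
once the patch is applied (a sketch, not part of the patch itself; the
example pattern is the one from the bug report):

```
from spacy.compat import unescape_unicode

# The serialized pattern arrives with escaped codepoints.
pattern = u"[\\uAA77-\\uAA79]"

# After unescaping, the escape sequences become actual unicode literals,
# so re.compile() builds the intended three-character range on Python2.7.
assert unescape_unicode(pattern) == u"[\uaa77-\uaa79]"

# Other backslash sequences (e.g. \d) are protected and left untouched.
assert unescape_unicode(u"\\d+") == u"\\d+"

# None passes through unchanged, since the helper guards against
# missing patterns.
assert unescape_unicode(None) is None
```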
Closes #3356.

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 spacy/compat.py                          | 24 ++++++++++++++++++++++++
 spacy/tests/regression/test_issue3356.py |  3 +--
 spacy/tokenizer.pyx                      |  4 ++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/spacy/compat.py b/spacy/compat.py
index 8af49f254..997e8787b 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import itertools
+import ast
 
 from thinc.neural.util import copy_array
 
@@ -150,3 +151,26 @@ def import_file(name, loc):
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
+def unescape_unicode(string):
+    """Python2.7's re module chokes when compiling patterns that have ranges
+    between escaped unicode codepoints if the two codepoints are unrecognised
+    in the unicode database. For instance:
+
+        re.compile('[\\uAA77-\\uAA79]').findall("hello")
+
+    Ends up matching every character (on Python 2). This problem doesn't occur
+    if we're dealing with unicode literals.
+    """
+    if string is None:
+        return string
+    # We only want to unescape the unicode, so we first must protect the other
+    # backslashes.
+    string = string.replace("\\", "\\\\")
+    # Now we remove that protection for the unicode.
+    string = string.replace("\\\\u", "\\u")
+    string = string.replace("\\\\U", "\\U")
+    # Now we unescape by evaling the string with the AST. This can't execute
+    # code -- it only does the representational level.
+    return ast.literal_eval("u'''" + string + "'''")
diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py
index c14fa8525..4e27055c7 100644
--- a/spacy/tests/regression/test_issue3356.py
+++ b/spacy/tests/regression/test_issue3356.py
@@ -65,7 +65,6 @@ prefix_search = (
 if compat.is_python2:
     # If we have this test in Python 3, pytest chokes, as it can't print the
     # string above in the xpass message.
-    @pytest.mark.xfail
     def test_issue3356():
-        pattern = re.compile(prefix_search.decode("utf8"))
+        pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
         assert not pattern.search(u"hello")
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 86c2d6ad3..e390a72b9 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -14,6 +14,7 @@ import re
 
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
+from .compat import unescape_unicode
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
 
@@ -428,6 +429,9 @@ cdef class Tokenizer:
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
+        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
+            if key in data:
+                data[key] = unescape_unicode(data[key])
         if data.get("prefix_search"):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if data.get("suffix_search"):