mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Merge remote-tracking branch 'upstream/master' into feature/el-framework
This commit is contained in:
commit
ec3e860b44
|
@ -4,7 +4,7 @@
|
|||
# fmt: off
|
||||
|
||||
__title__ = "spacy"
|
||||
__version__ = "2.1.1"
|
||||
__version__ = "2.1.2"
|
||||
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
|
||||
__uri__ = "https://spacy.io"
|
||||
__author__ = "Explosion AI"
|
||||
|
|
|
@ -11,6 +11,7 @@ from __future__ import unicode_literals
|
|||
import os
|
||||
import sys
|
||||
import itertools
|
||||
import ast
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
|
@ -150,3 +151,26 @@ def import_file(name, loc):
|
|||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def unescape_unicode(string):
    """Decode escaped unicode codepoints in a pattern into real characters.

    Python2.7's re module chokes when compiling patterns that have ranges
    between escaped unicode codepoints if the two codepoints are unrecognised
    in the unicode database. For instance:

        re.compile('[\\uAA77-\\uAA79]').findall("hello")

    Ends up matching every character (on Python 2). This problem doesn't occur
    if we're dealing with unicode literals.

    Only the 4-digit (lowercase u) and 8-digit (uppercase U) escapes are
    decoded. Every other backslash sequence, and every quote character, comes
    back exactly as it went in.

    string (unicode): The pattern to unescape. None is passed through as-is.
    RETURNS (unicode): The pattern with its unicode escapes decoded, or None.
    RAISES (SyntaxError): If an escape has too few hex digits to form a valid
        literal.
    """
    if string is None:
        return string
    # We only want to unescape the unicode, so we first must protect the other
    # backslashes.
    string = string.replace("\\", "\\\\")
    # Now we remove that protection for the unicode.
    # NOTE: an escaped backslash directly followed by "u" or "U" is
    # indistinguishable from a real escape at this point and is decoded too.
    string = string.replace("\\\\u", "\\u")
    string = string.replace("\\\\U", "\\U")
    # The pattern is spliced into a triple-single-quoted literal below, so a
    # trailing quote or an embedded run of three quotes would terminate that
    # literal early and raise a SyntaxError. Escaping each quote keeps it
    # inside the literal; it evaluates back to a plain quote.
    string = string.replace("'", "\\'")
    # Now we unescape by evaling the string with the AST. This can't execute
    # code -- it only does the representational level.
    return ast.literal_eval("u'''" + string + "'''")
|
||||
|
|
|
@ -65,7 +65,6 @@ prefix_search = (
|
|||
if compat.is_python2:
|
||||
# If we have this test in Python 3, pytest chokes, as it can't print the
|
||||
# string above in the xpass message.
|
||||
@pytest.mark.xfail
|
||||
def test_issue3356():
|
||||
pattern = re.compile(prefix_search.decode("utf8"))
|
||||
pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
|
||||
assert not pattern.search(u"hello")
|
||||
|
|
|
@ -14,6 +14,7 @@ import re
|
|||
|
||||
from .tokens.doc cimport Doc
|
||||
from .strings cimport hash_string
|
||||
from .compat import unescape_unicode
|
||||
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from . import util
|
||||
|
@ -428,6 +429,9 @@ cdef class Tokenizer:
|
|||
))
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
for key in ["prefix_search", "suffix_search", "infix_finditer"]:
|
||||
if key in data:
|
||||
data[key] = unescape_unicode(data[key])
|
||||
if data.get("prefix_search"):
|
||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||
if data.get("suffix_search"):
|
||||
|
|
Loading…
Reference in New Issue
Block a user