Merge branch 'master' into spacy.io

Ines Montani 2019-03-22 15:17:51 +01:00
commit 680eafab94
9 changed files with 276 additions and 18 deletions

@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
 thinc>=7.0.2,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.1.3,<1.1.0
+wasabi>=0.2.0,<1.1.0
 srsly>=0.0.5,<1.1.0
 # Third party dependencies
 numpy>=1.15.0

@@ -232,7 +232,7 @@ def setup_package():
 "plac<1.0.0,>=0.9.6",
 "requests>=2.13.0,<3.0.0",
 "jsonschema>=2.6.0,<3.0.0",
-"wasabi>=0.0.12,<1.1.0",
+"wasabi>=0.2.0,<1.1.0",
 "srsly>=0.0.5,<1.1.0",
 'pathlib==1.0.1; python_version < "3.4"',
 ],

@@ -4,7 +4,7 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.1"
+__version__ = "2.1.2"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"

@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import itertools
+import ast
 from thinc.neural.util import copy_array
@@ -150,3 +151,26 @@ def import_file(name, loc):
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
+def unescape_unicode(string):
+    """Python2.7's re module chokes when compiling patterns that have ranges
+    between escaped unicode codepoints if the two codepoints are unrecognised
+    in the unicode database. For instance:
+
+        re.compile('[\\uAA77-\\uAA79]').findall("hello")
+
+    Ends up matching every character (on Python 2). This problem doesn't occur
+    if we're dealing with unicode literals.
+    """
+    if string is None:
+        return string
+    # We only want to unescape the unicode, so we first must protect the other
+    # backslashes.
+    string = string.replace("\\", "\\\\")
+    # Now we remove that protection for the unicode.
+    string = string.replace("\\\\u", "\\u")
+    string = string.replace("\\\\U", "\\U")
+    # Now we unescape by evaling the string with the AST. This can't execute
+    # code -- it only does the representational level.
+    return ast.literal_eval("u'''" + string + "'''")
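
A minimal usage sketch of the new helper (assuming it lands in spacy.compat, as the regression test further down suggests): escaped patterns read from disk are converted back to real unicode literals before compiling, so Python 2's re module no longer mishandles ranges between unknown codepoints.

    import re
    from spacy.compat import unescape_unicode

    escaped = "[\\uAA77-\\uAA79]"  # plain ASCII escapes, e.g. as stored on disk
    pattern = re.compile(unescape_unicode(escaped))
    assert not pattern.search(u"hello")  # no longer matches every character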

@@ -1,13 +1,97 @@
 # coding: utf8
 from __future__ import unicode_literals
-from ...symbols import LEMMA, PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA, AUX
+_subordinating_conjunctions = [
+    "that",
+    "if",
+    "as",
+    "because",
+    "of",
+    "for",
+    "before",
+    "in",
+    "while",
+    "after",
+    "since",
+    "like",
+    "with",
+    "so",
+    "to",
+    "by",
+    "on",
+    "about",
+    "than",
+    "whether",
+    "although",
+    "from",
+    "though",
+    "until",
+    "unless",
+    "once",
+    "without",
+    "at",
+    "into",
+    "cause",
+    "over",
+    "upon",
+    "till",
+    "whereas",
+    "beyond",
+    "whilst",
+    "except",
+    "despite",
+    "wether",
+    "then",
+    "but",
+    "becuse",
+    "whie",
+    "below",
+    "against",
+    "it",
+    "w/out",
+    "toward",
+    "albeit",
+    "save",
+    "besides",
+    "becouse",
+    "coz",
+    "til",
+    "ask",
+    "i'd",
+    "out",
+    "near",
+    "seince",
+    "towards",
+    "tho",
+    "sice",
+    "will",
+]
+_relative_pronouns = ["this", "that", "those", "these"]
 MORPH_RULES = {
+    "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
+    "NN": {
+        "something": {"POS": "PRON"},
+        "anyone": {"POS": "PRON"},
+        "anything": {"POS": "PRON"},
+        "nothing": {"POS": "PRON"},
+        "someone": {"POS": "PRON"},
+        "everything": {"POS": "PRON"},
+        "everyone": {"POS": "PRON"},
+        "everybody": {"POS": "PRON"},
+        "nobody": {"POS": "PRON"},
+        "somebody": {"POS": "PRON"},
+        "anybody": {"POS": "PRON"},
+        "any1": {"POS": "PRON"},
+    },
     "PRP": {
         "I": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -15,14 +99,16 @@ MORPH_RULES = {
         },
         "me": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
             "Case": "Acc",
         },
-        "you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
+        "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
         "he": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -31,6 +117,7 @@ MORPH_RULES = {
         },
         "him": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -39,6 +126,7 @@ MORPH_RULES = {
         },
         "she": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -47,6 +135,7 @@ MORPH_RULES = {
         },
         "her": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -55,6 +144,7 @@ MORPH_RULES = {
         },
         "it": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -62,6 +152,7 @@ MORPH_RULES = {
         },
         "we": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -69,6 +160,7 @@ MORPH_RULES = {
         },
         "us": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -76,6 +168,7 @@ MORPH_RULES = {
         },
         "they": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -83,6 +176,7 @@ MORPH_RULES = {
         },
         "them": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -90,6 +184,7 @@ MORPH_RULES = {
         },
         "mine": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -98,6 +193,7 @@ MORPH_RULES = {
         },
         "his": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -107,6 +203,7 @@ MORPH_RULES = {
         },
         "hers": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -116,6 +213,7 @@ MORPH_RULES = {
         },
         "its": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -125,6 +223,7 @@ MORPH_RULES = {
         },
         "ours": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -133,6 +232,7 @@ MORPH_RULES = {
         },
         "yours": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Number": "Plur",
@@ -141,6 +241,7 @@ MORPH_RULES = {
         },
         "theirs": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -149,6 +250,7 @@ MORPH_RULES = {
         },
         "myself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -157,6 +259,7 @@ MORPH_RULES = {
         },
         "yourself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Case": "Acc",
@@ -164,6 +267,7 @@ MORPH_RULES = {
         },
         "himself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -173,6 +277,7 @@ MORPH_RULES = {
         },
         "herself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -182,6 +287,7 @@ MORPH_RULES = {
         },
         "itself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -191,6 +297,7 @@ MORPH_RULES = {
         },
         "themself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -199,6 +306,7 @@ MORPH_RULES = {
         },
         "ourselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -207,6 +315,7 @@ MORPH_RULES = {
         },
         "yourselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Case": "Acc",
@@ -214,6 +323,7 @@ MORPH_RULES = {
         },
         "themselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -269,9 +379,17 @@ MORPH_RULES = {
             "Poss": "Yes",
         },
     },
+    "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "nt"]},
+    "VB": {
+        word: {"POS": "AUX"}
+        for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
+    },
+    "VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
+    "VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
     "VBZ": {
         "am": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "One",
             "Tense": "Pres",
@@ -279,6 +397,7 @@ MORPH_RULES = {
         },
         "are": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Two",
             "Tense": "Pres",
@@ -286,6 +405,7 @@ MORPH_RULES = {
         },
         "is": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Three",
             "Tense": "Pres",
@@ -293,6 +413,7 @@ MORPH_RULES = {
         },
         "'re": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Two",
             "Tense": "Pres",
@@ -300,26 +421,65 @@ MORPH_RULES = {
         },
         "'s": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Three",
             "Tense": "Pres",
             "Mood": "Ind",
         },
+        "has": {LEMMA: "have", "POS": "AUX"},
+        "does": {LEMMA: "do", "POS": "AUX"},
     },
     "VBP": {
-        "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "are": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Pres",
+            "Mood": "Ind",
+        },
+        "'re": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Pres",
+            "Mood": "Ind",
+        },
         "am": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "One",
             "Tense": "Pres",
             "Mood": "Ind",
         },
+        "do": {"POS": "AUX"},
+        "have": {"POS": "AUX"},
+        "'m": {"POS": "AUX", LEMMA: "be"},
+        "'ve": {"POS": "AUX"},
+        "'re": {"POS": "AUX", LEMMA: "be"},
+        "'s": {"POS": "AUX"},
+        "is": {"POS": "AUX"},
+        "'d": {"POS": "AUX"},
     },
     "VBD": {
-        "was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
-        "were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"},
+        "was": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Past",
+            "Number": "Sing",
+        },
+        "were": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Past",
+            "Number": "Plur",
+        },
+        "did": {LEMMA: "do", "POS": "AUX"},
+        "had": {LEMMA: "have", "POS": "AUX"},
+        "'d": {LEMMA: "have", "POS": "AUX"},
     },
 }
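
The effect of the new morph rules can be checked directly against the shipped data (a minimal sketch, assuming spaCy 2.1.2 is installed; MORPH_RULES is keyed by fine-grained tag, then by token text):

    from spacy.lang.en.morph_rules import MORPH_RULES
    from spacy.symbols import LEMMA

    # "has" tagged as VBZ is now treated as an auxiliary with lemma "have".
    attrs = MORPH_RULES["VBZ"]["has"]
    assert attrs["POS"] == "AUX"
    assert attrs[LEMMA] == "have"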

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
 TAG_MAP = {
@@ -20,15 +20,15 @@ TAG_MAP = {
     "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
-    "EX": {POS: ADV, "AdvType": "ex"},
+    "EX": {POS: PRON, "AdvType": "ex"},
     "FW": {POS: X, "Foreign": "yes"},
     "HYPH": {POS: PUNCT, "PunctType": "dash"},
     "IN": {POS: ADP},
     "JJ": {POS: ADJ, "Degree": "pos"},
     "JJR": {POS: ADJ, "Degree": "comp"},
     "JJS": {POS: ADJ, "Degree": "sup"},
-    "LS": {POS: PUNCT, "NumType": "ord"},
-    "MD": {POS: VERB, "VerbType": "mod"},
+    "LS": {POS: X, "NumType": "ord"},
+    "MD": {POS: AUX, "VerbType": "mod"},
     "NIL": {POS: ""},
     "NN": {POS: NOUN, "Number": "sing"},
     "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
@@ -37,11 +37,11 @@ TAG_MAP = {
     "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"},
     "POS": {POS: PART, "Poss": "yes"},
     "PRP": {POS: PRON, "PronType": "prs"},
-    "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
+    "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
     "RB": {POS: ADV, "Degree": "pos"},
     "RBR": {POS: ADV, "Degree": "comp"},
     "RBS": {POS: ADV, "Degree": "sup"},
-    "RP": {POS: PART},
+    "RP": {POS: ADP},
     "SP": {POS: SPACE},
     "SYM": {POS: SYM},
     "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
@@ -58,9 +58,9 @@ TAG_MAP = {
         "Number": "sing",
         "Person": 3,
     },
-    "WDT": {POS: DET, "PronType": "int|rel"},
+    "WDT": {POS: PRON, "PronType": "int|rel"},
     "WP": {POS: PRON, "PronType": "int|rel"},
-    "WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"},
+    "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"},
     "WRB": {POS: ADV, "PronType": "int|rel"},
     "ADD": {POS: X},
     "NFP": {POS: PUNCT},

@@ -0,0 +1,70 @@
+import pytest
+import re
+from ... import compat
+prefix_search = (
+    b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])"
+    b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?"
+    b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}"
+    b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|"
+    b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|"
+    b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|"
+    b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|"
+    b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|"
+    b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|"
+    b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|"
+    b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|"
+    b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|"
+    b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|"
+    b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|"
+    b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F"
+    b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8"
+    b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17"
+    b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC"
+    b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940"
+    b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103"
+    b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125"
+    b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F"
+    b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4"
+    b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5"
+    b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B"
+    b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440"
+    b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2"
+    b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800"
+    b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76"
+    b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80"
+    b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004"
+    b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191"
+    b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250"
+    b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
+    b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
+    b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
+    b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
+    b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
+    b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
+    b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
+    b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
+    b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
+    b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
+    b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
+    b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
+    b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
+    b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
+    b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
+    b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
+    b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
+    b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
+    b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
+    b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
+    b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
+    b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
+    b"\\U0001FA60-\\U0001FA6D]"
+)
+if compat.is_python2:
+    # If we have this test in Python 3, pytest chokes, as it can't print the
+    # string above in the xpass message.
+    def test_issue3356():
+        pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
+        assert not pattern.search(u"hello")

@@ -14,6 +14,7 @@ import re
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
+from .compat import unescape_unicode
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
@@ -428,6 +429,9 @@ cdef class Tokenizer:
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
+        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
+            if key in data:
+                data[key] = unescape_unicode(data[key])
         if data.get("prefix_search"):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if data.get("suffix_search"):

@@ -218,7 +218,7 @@ const Landing = ({ data }) => {
 <H2>Benchmarks</H2>
 <p>
 In 2015, independent researchers from Emory University and Yahoo! Labs
-showed that spaCy offered the
+showed that spaCy offered the{' '}
 <strong>fastest syntactic parser in the world</strong> and that its accuracy
 was <strong>within 1% of the best</strong> available (
 <Link to="https://aclweb.org/anthology/P/P15/P15-1038.pdf">