Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Commit 680eafab94: Merge branch 'master' into spacy.io
requirements.txt

@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
 thinc>=7.0.2,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.1.3,<1.1.0
+wasabi>=0.2.0,<1.1.0
 srsly>=0.0.5,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
setup.py (2 changed lines)

@@ -232,7 +232,7 @@ def setup_package():
             "plac<1.0.0,>=0.9.6",
             "requests>=2.13.0,<3.0.0",
             "jsonschema>=2.6.0,<3.0.0",
-            "wasabi>=0.0.12,<1.1.0",
+            "wasabi>=0.2.0,<1.1.0",
             "srsly>=0.0.5,<1.1.0",
             'pathlib==1.0.1; python_version < "3.4"',
         ],
spacy/about.py

@@ -4,7 +4,7 @@
 # fmt: off

 __title__ = "spacy"
-__version__ = "2.1.1"
+__version__ = "2.1.2"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
spacy/compat.py

@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import itertools
+import ast

 from thinc.neural.util import copy_array

@@ -150,3 +151,26 @@ def import_file(name, loc):
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
+def unescape_unicode(string):
+    """Python2.7's re module chokes when compiling patterns that have ranges
+    between escaped unicode codepoints if the two codepoints are unrecognised
+    in the unicode database. For instance:
+
+        re.compile('[\\uAA77-\\uAA79]').findall("hello")
+
+    Ends up matching every character (on Python 2). This problem doesn't occur
+    if we're dealing with unicode literals.
+    """
+    if string is None:
+        return string
+    # We only want to unescape the unicode, so we first must protect the other
+    # backslashes.
+    string = string.replace("\\", "\\\\")
+    # Now we remove that protection for the unicode.
+    string = string.replace("\\\\u", "\\u")
+    string = string.replace("\\\\U", "\\U")
+    # Now we unescape by evaling the string with the AST. This can't execute
+    # code -- it only does the representational level.
+    return ast.literal_eval("u'''" + string + "'''")
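Note: a minimal usage sketch of the new helper (an assumed illustration, not part of the commit). It turns escaped \uXXXX sequences into real codepoints before the pattern is recompiled, which is what keeps the range below from matching everything on Python 2.7.

    # Assumed sketch, not part of the diff: unescaping a pattern before compiling it.
    import re

    from spacy.compat import unescape_unicode

    escaped = "[\\uAA77-\\uAA79]"              # escaped range, e.g. loaded from serialized data
    pattern = re.compile(unescape_unicode(escaped))
    assert not pattern.search("hello")         # the range no longer matches arbitrary text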
spacy/lang/en/morph_rules.py

@@ -1,13 +1,97 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import LEMMA, PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA, AUX

+_subordinating_conjunctions = [
+    "that",
+    "if",
+    "as",
+    "because",
+    "of",
+    "for",
+    "before",
+    "in",
+    "while",
+    "after",
+    "since",
+    "like",
+    "with",
+    "so",
+    "to",
+    "by",
+    "on",
+    "about",
+    "than",
+    "whether",
+    "although",
+    "from",
+    "though",
+    "until",
+    "unless",
+    "once",
+    "without",
+    "at",
+    "into",
+    "cause",
+    "over",
+    "upon",
+    "till",
+    "whereas",
+    "beyond",
+    "whilst",
+    "except",
+    "despite",
+    "wether",
+    "then",
+    "but",
+    "becuse",
+    "whie",
+    "below",
+    "against",
+    "it",
+    "w/out",
+    "toward",
+    "albeit",
+    "save",
+    "besides",
+    "becouse",
+    "coz",
+    "til",
+    "ask",
+    "i'd",
+    "out",
+    "near",
+    "seince",
+    "towards",
+    "tho",
+    "sice",
+    "will",
+]
+
+_relative_pronouns = ["this", "that", "those", "these"]

 MORPH_RULES = {
+    "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
+    "NN": {
+        "something": {"POS": "PRON"},
+        "anyone": {"POS": "PRON"},
+        "anything": {"POS": "PRON"},
+        "nothing": {"POS": "PRON"},
+        "someone": {"POS": "PRON"},
+        "everything": {"POS": "PRON"},
+        "everyone": {"POS": "PRON"},
+        "everybody": {"POS": "PRON"},
+        "nobody": {"POS": "PRON"},
+        "somebody": {"POS": "PRON"},
+        "anybody": {"POS": "PRON"},
+        "any1": {"POS": "PRON"},
+    },
     "PRP": {
         "I": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -15,14 +99,16 @@ MORPH_RULES = {
         },
         "me": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
             "Case": "Acc",
         },
-        "you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
+        "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
         "he": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -31,6 +117,7 @@ MORPH_RULES = {
         },
         "him": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -39,6 +126,7 @@ MORPH_RULES = {
         },
         "she": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -47,6 +135,7 @@ MORPH_RULES = {
         },
         "her": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -55,6 +144,7 @@ MORPH_RULES = {
         },
         "it": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -62,6 +152,7 @@ MORPH_RULES = {
         },
         "we": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -69,6 +160,7 @@ MORPH_RULES = {
         },
         "us": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -76,6 +168,7 @@ MORPH_RULES = {
         },
         "they": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -83,6 +176,7 @@ MORPH_RULES = {
         },
         "them": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -90,6 +184,7 @@ MORPH_RULES = {
         },
         "mine": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -98,6 +193,7 @@ MORPH_RULES = {
         },
         "his": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -107,6 +203,7 @@ MORPH_RULES = {
         },
         "hers": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -116,6 +213,7 @@ MORPH_RULES = {
         },
         "its": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -125,6 +223,7 @@ MORPH_RULES = {
         },
         "ours": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -133,6 +232,7 @@ MORPH_RULES = {
         },
         "yours": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Number": "Plur",
@@ -141,6 +241,7 @@ MORPH_RULES = {
         },
         "theirs": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -149,6 +250,7 @@ MORPH_RULES = {
         },
         "myself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -157,6 +259,7 @@ MORPH_RULES = {
         },
         "yourself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Case": "Acc",
@@ -164,6 +267,7 @@ MORPH_RULES = {
         },
         "himself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -173,6 +277,7 @@ MORPH_RULES = {
         },
         "herself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -182,6 +287,7 @@ MORPH_RULES = {
         },
         "itself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -191,6 +297,7 @@ MORPH_RULES = {
         },
         "themself": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -199,6 +306,7 @@ MORPH_RULES = {
         },
         "ourselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -207,6 +315,7 @@ MORPH_RULES = {
         },
         "yourselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Two",
             "Case": "Acc",
@@ -214,6 +323,7 @@ MORPH_RULES = {
         },
         "themselves": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -269,9 +379,17 @@ MORPH_RULES = {
             "Poss": "Yes",
         },
     },
+    "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "n’t"]},
+    "VB": {
+        word: {"POS": "AUX"}
+        for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
+    },
+    "VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
+    "VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
     "VBZ": {
         "am": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "One",
             "Tense": "Pres",
@@ -279,6 +397,7 @@ MORPH_RULES = {
         },
         "are": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Two",
             "Tense": "Pres",
@@ -286,6 +405,7 @@ MORPH_RULES = {
         },
         "is": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Three",
             "Tense": "Pres",
@@ -293,6 +413,7 @@ MORPH_RULES = {
         },
         "'re": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Two",
             "Tense": "Pres",
@@ -300,26 +421,65 @@ MORPH_RULES = {
         },
         "'s": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "Three",
             "Tense": "Pres",
             "Mood": "Ind",
         },
+        "has": {LEMMA: "have", "POS": "AUX"},
+        "does": {LEMMA: "do", "POS": "AUX"},
     },
     "VBP": {
-        "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "are": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Pres",
+            "Mood": "Ind",
+        },
+        "'re": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Pres",
+            "Mood": "Ind",
+        },
         "am": {
             LEMMA: "be",
+            "POS": "AUX",
             "VerbForm": "Fin",
             "Person": "One",
             "Tense": "Pres",
             "Mood": "Ind",
         },
+        "do": {"POS": "AUX"},
+        "have": {"POS": "AUX"},
+        "'m": {"POS": "AUX", LEMMA: "be"},
+        "'ve": {"POS": "AUX"},
+        "'re": {"POS": "AUX", LEMMA: "be"},
+        "'s": {"POS": "AUX"},
+        "is": {"POS": "AUX"},
+        "'d": {"POS": "AUX"},
     },
     "VBD": {
-        "was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
-        "were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"},
+        "was": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Past",
+            "Number": "Sing",
+        },
+        "were": {
+            LEMMA: "be",
+            "POS": "AUX",
+            "VerbForm": "Fin",
+            "Tense": "Past",
+            "Number": "Plur",
+        },
+        "did": {LEMMA: "do", "POS": "AUX"},
+        "had": {LEMMA: "have", "POS": "AUX"},
+        "'d": {LEMMA: "have", "POS": "AUX"},
     },
 }
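Note: MORPH_RULES maps a fine-grained tag plus a token text to a feature dictionary, and the "POS" keys added above are what let these closed-class forms override the coarse-grained tag. A small lookup sketch (assumed illustration, not part of the diff):

    # Assumed sketch, not part of the diff: reading one of the entries that the
    # change above extends with a "POS" key.
    from spacy.lang.en.morph_rules import MORPH_RULES

    features = MORPH_RULES["PRP"]["them"]
    print(features["POS"])        # -> "PRON" (added by this change)
    print(features["Person"])     # -> "Three"
    print(features["Number"])     # -> "Plur"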
spacy/lang/en/tag_map.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX


 TAG_MAP = {
@@ -20,15 +20,15 @@ TAG_MAP = {
     "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
-    "EX": {POS: ADV, "AdvType": "ex"},
+    "EX": {POS: PRON, "AdvType": "ex"},
     "FW": {POS: X, "Foreign": "yes"},
     "HYPH": {POS: PUNCT, "PunctType": "dash"},
     "IN": {POS: ADP},
     "JJ": {POS: ADJ, "Degree": "pos"},
     "JJR": {POS: ADJ, "Degree": "comp"},
     "JJS": {POS: ADJ, "Degree": "sup"},
-    "LS": {POS: PUNCT, "NumType": "ord"},
-    "MD": {POS: VERB, "VerbType": "mod"},
+    "LS": {POS: X, "NumType": "ord"},
+    "MD": {POS: AUX, "VerbType": "mod"},
     "NIL": {POS: ""},
     "NN": {POS: NOUN, "Number": "sing"},
     "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
@@ -37,11 +37,11 @@ TAG_MAP = {
     "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"},
     "POS": {POS: PART, "Poss": "yes"},
     "PRP": {POS: PRON, "PronType": "prs"},
-    "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
+    "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
     "RB": {POS: ADV, "Degree": "pos"},
     "RBR": {POS: ADV, "Degree": "comp"},
     "RBS": {POS: ADV, "Degree": "sup"},
-    "RP": {POS: PART},
+    "RP": {POS: ADP},
     "SP": {POS: SPACE},
     "SYM": {POS: SYM},
     "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
@@ -58,9 +58,9 @@ TAG_MAP = {
         "Number": "sing",
         "Person": 3,
     },
-    "WDT": {POS: DET, "PronType": "int|rel"},
+    "WDT": {POS: PRON, "PronType": "int|rel"},
     "WP": {POS: PRON, "PronType": "int|rel"},
-    "WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"},
+    "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"},
     "WRB": {POS: ADV, "PronType": "int|rel"},
     "ADD": {POS: X},
     "NFP": {POS: PUNCT},
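Note: TAG_MAP maps each fine-grained (Penn Treebank) tag to a coarse universal POS plus morphology; the edits above retag EX, LS, MD, PRP$, RP, WDT and WP$. A quick check of the new mapping (assumed illustration, not part of the diff):

    # Assumed sketch, not part of the diff: the MD (modal) tag now maps to the
    # coarse-grained AUX part of speech instead of VERB.
    from spacy.lang.en.tag_map import TAG_MAP
    from spacy.symbols import POS, AUX, PRON

    assert TAG_MAP["MD"][POS] == AUX       # modals are auxiliaries after this change
    assert TAG_MAP["WDT"][POS] == PRON     # wh-determiners now coarse-tagged as PRON
    print(TAG_MAP["MD"].get("VerbType"))   # -> "mod"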
spacy/tests/regression/test_issue3356.py (new file, 70 lines)

@@ -0,0 +1,70 @@
+import pytest
+import re
+from ... import compat
+
+prefix_search = (
+    b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])"
+    b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?"
+    b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}"
+    b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|"
+    b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|"
+    b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|"
+    b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|"
+    b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|"
+    b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|"
+    b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|"
+    b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|"
+    b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|"
+    b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|"
+    b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|"
+    b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F"
+    b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8"
+    b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17"
+    b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC"
+    b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940"
+    b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103"
+    b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125"
+    b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F"
+    b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4"
+    b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5"
+    b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B"
+    b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440"
+    b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2"
+    b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800"
+    b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76"
+    b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80"
+    b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004"
+    b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191"
+    b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250"
+    b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0"
+    b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77"
+    b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137"
+    b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E"
+    b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877"
+    b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45"
+    b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129"
+    b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C"
+    b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245"
+    b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A"
+    b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86"
+    b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0"
+    b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1"
+    b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6"
+    b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250"
+    b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400"
+    b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700"
+    b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810"
+    b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890"
+    b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940"
+    b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2"
+    b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF"
+    b"\\U0001FA60-\\U0001FA6D]"
+)
+
+
+if compat.is_python2:
+    # If we have this test in Python 3, pytest chokes, as it can't print the
+    # string above in the xpass message.
+    def test_issue3356():
+        pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8")))
+        assert not pattern.search(u"hello")
spacy/tokenizer.pyx

@@ -14,6 +14,7 @@ import re

 from .tokens.doc cimport Doc
 from .strings cimport hash_string
+from .compat import unescape_unicode

 from .errors import Errors, Warnings, deprecation_warning
 from . import util

@@ -428,6 +429,9 @@ cdef class Tokenizer:
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
+        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
+            if key in data:
+                data[key] = unescape_unicode(data[key])
         if data.get("prefix_search"):
             self.prefix_search = re.compile(data["prefix_search"]).search
         if data.get("suffix_search"):
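Note: the tokenizer change applies unescape_unicode to the serialized prefix, suffix and infix patterns before they are recompiled in from_bytes, which is where the Python 2 regex problem surfaced. A round-trip sketch (assumed illustration, not part of the commit):

    # Assumed sketch, not part of the commit: the serialization round trip that
    # the change above touches; stored patterns are unescaped before re.compile.
    from spacy.lang.en import English

    nlp = English()
    tok_bytes = nlp.tokenizer.to_bytes()
    nlp.tokenizer.from_bytes(tok_bytes)    # prefix/suffix/infix patterns recompiled
    doc = nlp(u"hello")
    assert [t.text for t in doc] == [u"hello"]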
website: landing page component

@@ -218,7 +218,7 @@ const Landing = ({ data }) => {
         <H2>Benchmarks</H2>
         <p>
             In 2015, independent researchers from Emory University and Yahoo! Labs
-            showed that spaCy offered the
+            showed that spaCy offered the{' '}
             <strong>fastest syntactic parser in the world</strong> and that its accuracy
             was <strong>within 1% of the best</strong> available (
             <Link to="https://aclweb.org/anthology/P/P15/P15-1038.pdf">