Merge branch 'develop' into spacy.io

Ines Montani 2019-02-21 12:34:51 +01:00
commit ab8392eda3
34 changed files with 456 additions and 189 deletions

View File

@ -10,12 +10,13 @@ Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals
from toolz import partition_all
from pathlib import Path
from joblib import Parallel, delayed
from functools import partial
import thinc.extra.datasets
import plac
import spacy
from spacy.util import minibatch
@plac.annotations(
@ -35,10 +36,10 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
data, _ = thinc.extra.datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
partitions = partition_all(batch_size, texts)
executor = Parallel(n_jobs=n_jobs)
do = delayed(transform_texts)
tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
partitions = minibatch(texts, size=batch_size)
executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
do = delayed(partial(transform_texts, nlp))
tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
executor(tasks)
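For readers following along, here is a minimal, self-contained sketch of the pattern the hunk above switches to: `spacy.util.minibatch` for batching plus joblib's multiprocessing backend, with `functools.partial` binding the `nlp` object so the delayed callable pickles cleanly. The model, texts, batch size and worker body below are illustrative stand-ins, not part of the commit.

```python
from functools import partial

import spacy
from joblib import Parallel, delayed
from spacy.util import minibatch


def transform_texts(nlp, batch_id, texts):
    # Illustrative worker: process one batch and return token counts.
    return [len(doc) for doc in nlp.pipe(texts)]


if __name__ == "__main__":
    nlp = spacy.blank("en")              # stand-in for a real model such as en_core_web_sm
    texts = ["This is a sentence."] * 100
    partitions = minibatch(texts, size=25)
    executor = Parallel(n_jobs=2, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch) for i, batch in enumerate(partitions))
    results = executor(tasks)            # one list of token counts per batch
    print(sum(len(r) for r in results))  # 100
```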

View File

@ -31,7 +31,7 @@ TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']})
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),

fabfile.py vendored
View File

@ -59,6 +59,7 @@ def make():
def sdist():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
local('python -m pip install -U setuptools')
local('python setup.py sdist')
def wheel():

View File

@ -4,7 +4,7 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "2.1.0a7"
__version__ = "2.1.0a8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"

View File

@ -5,41 +5,143 @@ split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "")
# used https://unicode.org/cldr/utility/list-unicodeset.jsp to convert categories into code points
# \p{L}&&\p{Bengali}
# https://en.wikipedia.org/wiki/Bengali_(Unicode_block)
_bengali = r"\u0980-\u09FF"
# \p{L}&&\p{Hebrew}
# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet
_hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
# \p{Ll}&&\p{Latin}
_latin_lower = r"a-z\u00DF-\u00F6\u00F8-\u00FF\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175\u0177\u017A\u017C\u017E-\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242\u0247\u0249\u024B\u024D\u024F-\u0293\u0295-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF\u214E\u2184\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA\uAB30-\uAB5A\uAB60-\uAB64\uFB00-\uFB06\uFF41-\uFF5A"
# \p{Lu}&&\p{Latin}
_latin_upper = r"A-Z\u00C0-\u00D6\u00D8-\u00DE\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176\u0178\u0179\u017B\u017D\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE\u212A\u212B\u2132\u2183\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8\uFF21-\uFF3A"
# [\p{Ll}||\p{Lu}]&&\p{Latin}
latin = r"A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u01BA\u01BC-\u01BF\u01C4\u01C6\u01C7\u01C9\u01CA\u01CC-\u01F1\u01F3-\u0293\u0295-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A\u1E00-\u1EFF\u212A\u212B\u2132\u214E\u2183\u2184\u2C60-\u2C7B\u2C7E\u2C7F\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA\uAB30-\uAB5A\uAB60-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A"
# Latin standard
_latin_u_standard = r"A-Z"
_latin_l_standard = r"a-z"
_latin_standard = _latin_u_standard + _latin_l_standard
# \p{L}&&\p{Arabic}
_persian = r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB"
_latin_u_standard_fullwidth = r"\uFF21-\uFF3A"
_latin_l_standard_fullwidth = r"\uFF41-\uFF5A"
_latin_standard_fullwidth = _latin_u_standard_fullwidth + _latin_l_standard_fullwidth
# letters with diacritics - French, German, Icelandic, Spanish
_latin_u_supplement = r"\u00C0-\u00D6\u00D8-\u00DE"
_latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF"
_latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF"
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" \
r"\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B" \
r"\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158" \
r"\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176" \
r"\u0178\u0179\u017B\u017D"
_latin_l_extendedA = r"\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D" \
r"\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A" \
r"\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157" \
r"\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175" \
r"\u0177\u017A\u017C\u017E\u017F"
_latin_extendedA = r"\u0100-\u017F"
# special characters - Khoisan, Pan-Nigerian, Pinyin, Romanian
# those that are a combination of both upper and lower letters are only included in the group _latin_extendedB
_latin_u_extendedB = r"\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C" \
r"\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5" \
r"\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB" \
r"\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA" \
r"\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216" \
r"\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232" \
r"\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E"
_latin_l_extendedB = r"\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5" \
r"\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC" \
r"\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7" \
r"\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205" \
r"\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221" \
r"\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242" \
r"\u0247\u0249\u024B\u024D\u024F"
_latin_extendedB = r"\u0180-\u01BF\u01C4-\u024F"
# special characters - Uighur, Uralic Phonetic
_latin_u_extendedC = r"\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F"
_latin_l_extendedC = r"\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B"
_latin_extendedC = r"\u2C60-\u2C7B\u2C7E\u2C7F"
# special characters - phonetic, Mayan, Medieval
_latin_u_extendedD = r"\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C" \
r"\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758" \
r"\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D" \
r"\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E" \
r"\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8"
_latin_l_extendedD = r"\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D" \
r"\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759" \
r"\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A" \
r"\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B" \
r"\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA"
_latin_extendedD = r"\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA"
# special characters - phonetic Teuthonista and Sakha
_latin_l_extendedE = r"\uAB30-\uAB5A\uAB60-\uAB64"
_latin_extendedE = _latin_l_extendedE
# phonetic letters - Greek, Latin, Cyrillic
_latin_l_phonetic = r"\u0250-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A"
_latin_phonetic = _latin_l_phonetic
# letters with multiple diacritics - Vietnamese
_latin_u_diacritics = r"\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A" \
r"\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36" \
r"\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52" \
r"\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E" \
r"\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A" \
r"\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE" \
r"\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8" \
r"\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4" \
r"\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE"
_latin_l_diacritics = r"\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B" \
r"\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37" \
r"\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53" \
r"\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F" \
r"\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B" \
r"\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD" \
r"\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9" \
r"\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5" \
r"\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF"
_latin_diacritics = r"\u1E00-\u1EFF"
# all lower latin classes
LATIN_LOWER_BASIC = _latin_l_standard + _latin_l_standard_fullwidth + _latin_l_supplement + _latin_l_extendedA
LATIN_LOWER = LATIN_LOWER_BASIC + _latin_l_extendedB + _latin_l_extendedC + _latin_l_extendedD + _latin_l_extendedE \
+ _latin_l_phonetic + _latin_l_diacritics
# all upper latin classes
LATIN_UPPER_BASIC = _latin_u_standard + _latin_u_standard_fullwidth + _latin_u_supplement + _latin_u_extendedA
LATIN_UPPER = LATIN_UPPER_BASIC + _latin_u_extendedB + _latin_u_extendedC + _latin_u_extendedD + _latin_u_diacritics
# all latin classes
LATIN_BASIC = _latin_standard + _latin_standard_fullwidth + _latin_supplement + _latin_extendedA
LATIN = LATIN_BASIC + _latin_extendedB + _latin_extendedC + _latin_extendedD + _latin_extendedE \
+ _latin_phonetic + _latin_diacritics
_persian = r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD" \
r"\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB"
_russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я"
# \p{L}&&\p{Sinhala}
_russian = r"ёа-яЁА-Я"
_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
_tatar_lower = r"әөүҗңһ"
_tatar_upper = r"ӘӨҮҖҢҺ"
_tatar = r"әөүҗңһӘӨҮҖҢҺ"
_greek_lower = r"α-ωάέίόώήύ"
_greek_upper = r"Α-ΩΆΈΊΌΏΉΎ"
_greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ"
_ukrainian_lower = r"а-щюяіїєґ"
_ukrainian_upper = r"А-ЩЮЯІЇЄҐ"
_ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_upper = _latin_upper + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = _latin_lower + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = _bengali + _hebrew + _persian + _sinhala
ALPHA = group_chars(_upper + _lower + _uncased)
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased)
@ -63,7 +165,33 @@ _hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So
_other_symbols = r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D"
_other_symbols = r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70" \
r"\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34" \
r"\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399" \
r"\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116" \
r"\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B" \
r"\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3" \
r"\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB" \
r"\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E" \
r"\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95" \
r"\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB" \
r"\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3" \
r"\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF" \
r"\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC" \
r"\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B" \
r"\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F" \
r"\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164" \
r"\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8" \
r"\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A" \
r"\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B" \
r"\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF" \
r"\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202" \
r"\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265" \
r"\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9" \
r"\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847" \
r"\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B" \
r"\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2" \
r"\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D"
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)

View File

@ -7076,14 +7076,6 @@ FR_BASE_EXCEPTIONS = [
"au-lof",
"au-tour",
"aube-vigne",
"audio-numérique",
"audio-numériques",
"audio-prothésiste",
"audio-prothésistes",
"audio-visuel",
"audio-visuelle",
"audio-visuelles",
"audio-visuels",
"aujourd'hui",
"aulnaie-frênaie",
"aulnaies-frênaies",
@ -14400,7 +14392,6 @@ FR_BASE_EXCEPTIONS = [
"attaques surprises",
"attaques-surprises",
"attrape-con",
"audio-oral",
"auriculo-cardiaque",
"auriculo-temporal",
"austro-bavarois",

View File

@ -3,12 +3,15 @@ from __future__ import unicode_literals
import re
from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
def upper_first_letter(text):
if len(text) == 0:
@ -128,6 +131,7 @@ _hyphen_prefix = [
"arcs?",
"archi",
"arrières?",
"audio",
"avant",
"avion",
"auto",

View File

@ -636,17 +636,17 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
_ord_num_or_date = "([A-Z0-9]+[./-])*(\d+\.?)"
_num = "[+\-]?\d+([,.]\d+)*"
_ops = "[=<>+\-\*/^()÷%²]"
_suffixes = "-[{al}]+".format(al=ALPHA_LOWER)
_numeric_exp = "({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)
_time_exp = "\d+(:\d+)*(\.\d+)?"
_ord_num_or_date = r"([A-Z0-9]+[./-])*(\d+\.?)"
_num = r"[+\-]?\d+([,.]\d+)*"
_ops = r"[=<>+\-\*/^()÷%²]"
_suffixes = r"-[{al}]+".format(al=ALPHA_LOWER)
_numeric_exp = r"({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)
_time_exp = r"\d+(:\d+)*(\.\d+)?"
_nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
_nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
ne=_numeric_exp, t=_time_exp, on=_ord_num_or_date, c=CURRENCY, s=_suffixes
)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -20,7 +21,7 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
tag_map = TAG_MAP

View File

@ -0,0 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
_exc = {
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
}
TOKENIZER_EXCEPTIONS = _exc

View File

@ -39,10 +39,10 @@ _infixes = (
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

View File

@ -44,7 +44,7 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
cdef Pool mem = Pool()
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
if extensions is not None and len(extensions) >= 1:
nr_extra_attr = max(extensions.values())
nr_extra_attr = max(extensions.values()) + 1
extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
else:
nr_extra_attr = 0
@ -60,9 +60,8 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
for i in range(doc.length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache,
transition_states(states, matches, &predicate_cache[i],
doc[i], extra_attr_values, predicates)
predicate_cache += nr_predicate
extra_attr_values += nr_extra_attr
# Handle matches that end in 0-width patterns
finish_states(matches, states)
@ -74,6 +73,7 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
matches[i].start,
matches[i].start+matches[i].length
)
# We need to deduplicate, because we could otherwise arrive at the same
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
# first .?, or the second .? -- it doesn't matter, it's just one match.
@ -89,7 +89,8 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
# showed this wasn't the case when we had a reject-and-continue before a
# match. I still don't really understand what's going on here, but this
# workaround does resolve the issue.
while pattern.attrs.attr != ID and pattern.nr_attr > 0:
while pattern.attrs.attr != ID and \
(pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0):
pattern += 1
return pattern.attrs.value
@ -101,13 +102,17 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
cdef vector[PatternStateC] new_states
cdef int nr_predicate = len(py_predicates)
for i in range(states.size()):
if states[i].pattern.nr_py != 0:
if states[i].pattern.nr_py >= 1:
update_predicate_cache(cached_py_predicates,
states[i].pattern, token, py_predicates)
for i in range(states.size()):
action = get_action(states[i], token.c, extra_attrs,
cached_py_predicates, nr_predicate)
cached_py_predicates)
if action == REJECT:
continue
# Keep only a subset of states (the active ones). Index q is the
# states which are still alive. If we reject a state, we overwrite
# it in the states list, because q doesn't advance.
state = states[i]
states[q] = state
while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
@ -126,7 +131,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
update_predicate_cache(cached_py_predicates,
states[q].pattern, token, py_predicates)
action = get_action(states[q], token.c, extra_attrs,
cached_py_predicates, nr_predicate)
cached_py_predicates)
if action == REJECT:
pass
elif action == ADVANCE:
@ -154,8 +159,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
states.push_back(new_states[i])
cdef void update_predicate_cache(char* cache,
const TokenPatternC* pattern, Token token, predicates):
cdef int update_predicate_cache(char* cache,
const TokenPatternC* pattern, Token token, predicates) except -1:
# If the state references any extra predicates, check whether they match.
# These are cached, so that we don't call these potentially expensive
# Python functions more than we need to.
@ -192,7 +197,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
cdef action_t get_action(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs,
const char* predicate_matches, int nr_predicate) nogil:
const char* predicate_matches) nogil:
'''We need to consider:
a) Does the token match the specification? [Yes, No]
@ -252,7 +257,7 @@ cdef action_t get_action(PatternStateC state,
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
cdef char is_match
is_match = get_is_match(state, token, extra_attrs, predicate_matches, nr_predicate)
is_match = get_is_match(state, token, extra_attrs, predicate_matches)
quantifier = get_quantifier(state)
is_final = get_is_final(state)
if quantifier == ZERO:
@ -303,9 +308,9 @@ cdef action_t get_action(PatternStateC state,
cdef char get_is_match(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs,
const char* predicate_matches, int nr_predicate) nogil:
for i in range(nr_predicate):
if predicate_matches[i] == -1:
const char* predicate_matches) nogil:
for i in range(state.pattern.nr_py):
if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0
spec = state.pattern
for attr in spec.attrs[:spec.nr_attr]:
@ -333,7 +338,7 @@ DEF PADDING = 5
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
cdef int i
cdef int i, index
for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
pattern[i].quantifier = quantifier
pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
@ -356,11 +361,13 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
pattern[i].nr_attr = 0
pattern[i].nr_extra_attr = 0
pattern[i].nr_py = 0
return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
while pattern.nr_attr != 0:
while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
if id_attr.attr != ID:
@ -384,7 +391,6 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
extra_predicates.
"""
tokens = []
seen_predicates = {}
for spec in token_specs:
if not spec:
# Signifier for 'any token'
@ -393,7 +399,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates, seen_predicates)
predicates = _get_extra_predicates(spec, extra_predicates)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
return tokens
@ -430,6 +436,7 @@ class _RegexPredicate(object):
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate == 'REGEX'
def __call__(self, Token token):
@ -447,6 +454,7 @@ class _SetMemberPredicate(object):
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate in ('IN', 'NOT_IN')
def __call__(self, Token token):
@ -459,6 +467,9 @@ class _SetMemberPredicate(object):
else:
return value not in self.value
def __repr__(self):
return repr(('SetMemberPredicate', self.i, self.attr, self.value, self.predicate))
class _ComparisonPredicate(object):
def __init__(self, i, attr, value, predicate, is_extension=False):
@ -467,6 +478,7 @@ class _ComparisonPredicate(object):
self.value = value
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate in ('==', '!=', '>=', '<=', '>', '<')
def __call__(self, Token token):
@ -488,7 +500,7 @@ class _ComparisonPredicate(object):
return value < self.value
def _get_extra_predicates(spec, extra_predicates, seen_predicates):
def _get_extra_predicates(spec, extra_predicates):
predicate_types = {
'REGEX': _RegexPredicate,
'IN': _SetMemberPredicate,
@ -499,6 +511,7 @@ def _get_extra_predicates(spec, extra_predicates, seen_predicates):
'>': _ComparisonPredicate,
'<': _ComparisonPredicate,
}
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
for attr, value in spec.items():
if isinstance(attr, basestring):
@ -516,16 +529,15 @@ def _get_extra_predicates(spec, extra_predicates, seen_predicates):
if isinstance(value, dict):
for type_, cls in predicate_types.items():
if type_ in value:
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
predicate = cls(len(extra_predicates), attr, value[type_], type_)
# Don't create redundant predicates.
# This helps with efficiency, as we're caching the results.
if key in seen_predicates:
output.append(seen_predicates[key])
if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key])
else:
predicate = cls(len(extra_predicates), attr, value[type_], type_)
extra_predicates.append(predicate)
output.append(predicate.i)
seen_predicates[key] = predicate.i
seen_predicates[predicate.key] = predicate.i
return output
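The refactor above moves deduplication onto a `key` attribute computed by each predicate class, and rebuilds `seen_predicates` from the shared `extra_predicates` list. A rough standalone sketch of that pattern, using the stdlib `json` in place of `srsly` and a single illustrative predicate class:

```python
import json


class RegexPredicate(object):
    def __init__(self, i, attr, value):
        self.i = i
        self.value = value
        # Stable identity: same attribute + same serialized value -> same key.
        self.key = (attr, "REGEX", json.dumps(value, sort_keys=True))


def get_extra_predicates(specs, extra_predicates):
    seen = {pred.key: pred.i for pred in extra_predicates}
    output = []
    for attr, value in specs:
        predicate = RegexPredicate(len(extra_predicates), attr, value)
        if predicate.key in seen:
            output.append(seen[predicate.key])   # reuse the cached predicate's index
        else:
            extra_predicates.append(predicate)
            output.append(predicate.i)
            seen[predicate.key] = predicate.i
    return output


predicates = []
print(get_extra_predicates([("ORTH", "foo.*"), ("ORTH", "foo.*")], predicates))  # [0, 0]
print(len(predicates))  # 1 -- the duplicate spec did not create a second predicate
```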

View File

@ -145,9 +145,7 @@ class Pipe(object):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict()
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
if self.model in (True, False, None):
serialize["model"] = lambda: self.model
else:
if self.model not in (True, False, None):
serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes
return util.to_bytes(serialize, exclude)
@ -538,9 +536,7 @@ class Tagger(Pipe):
def to_bytes(self, **exclude):
serialize = OrderedDict()
if self.model in (None, True, False):
serialize['model'] = lambda: self.model
else:
if self.model not in (None, True, False):
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)

View File

@ -261,13 +261,14 @@ class ParserStepModel(Model):
# lowest score.
# numpy's nan_to_num function doesn't take a value, and nan is replaced
# by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
self.ops.xp.nan_to_num(scores, copy=False)
# Note that scores is always a numpy array! Should fix #3112
scores[numpy.isnan(scores)] = -numpy.inf
numpy.nan_to_num(scores, copy=False)
def backprop_parser_step(d_scores, sgd=None):
# If we have a non-zero gradient for a previously unseen class,
# replace the weight with 0.
new_classes = self.ops.xp.logical_and(
new_classes = self.vec2scores.ops.xp.logical_and(
self.vec2scores.ops.xp.isnan(self.vec2scores.b),
d_scores.any(axis=0)
)
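The comment in the hunk above explains the NaN handling: older numpy's `nan_to_num` has no argument for the replacement value, so NaNs are first mapped to `-inf`, which `nan_to_num` then clips to the dtype minimum. A tiny standalone illustration with made-up scores:

```python
import numpy

scores = numpy.asarray([[0.2, numpy.nan, 1.5]])
scores[numpy.isnan(scores)] = -numpy.inf   # NaN -> -inf first ...
numpy.nan_to_num(scores, copy=False)       # ... then -inf -> most negative finite float
print(scores)  # the NaN slot is now the most negative float64 (about -1.8e308)
```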

View File

@ -13,10 +13,10 @@ def custom_en_tokenizer(en_vocab):
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
custom_infixes = [
"\.\.\.+",
"(?<=[0-9])-(?=[0-9])",
"[0-9]+(,[0-9]+)+",
"[\[\]!&:,()\*—–\/-]",
r"\.\.\.+",
r"(?<=[0-9])-(?=[0-9])",
r"[0-9]+(,[0-9]+)+",
r"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
return Tokenizer(

View File

@ -22,9 +22,9 @@ import pytest
u"11-septembre",
u"11-Septembre",
u"refox-trottâmes",
u"K-POP",
u"K-Pop",
u"K-pop",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
u"z'yeutes",
u"black-outeront",
u"états-unienne",

View File

@ -207,14 +207,13 @@ def test_matcher_set_value(en_vocab):
assert len(matches) == 0
@pytest.mark.xfail
def test_matcher_set_value_operator(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
matcher.add("DET_HOUSE", None, pattern)
doc = Doc(en_vocab, words=["In", "a", "house"])
matches = matcher(doc)
assert len(matches) == 1
assert len(matches) == 2
doc = Doc(en_vocab, words=["my", "house"])
matches = matcher(doc)
assert len(matches) == 1

View File

@ -358,6 +358,7 @@ def test_issue850_basic():
assert end == 4
@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)

View File

@ -13,6 +13,22 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@pytest.mark.xfail(
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
testwords = u'e2g 2g 52g'
doc = nlp(testwords)
assert len(doc) == 5
assert doc[0].text == "e2g"
assert doc[1].text == "2"
assert doc[2].text == "g"
assert doc[3].text == "52"
assert doc[4].text == "g"
def test_issue1242():
nlp = English()
doc = nlp("")

View File

@ -6,7 +6,6 @@ from spacy.matcher import Matcher
from spacy.tokens import Token, Doc
@pytest.mark.xfail
def test_issue1971(en_vocab):
# Possibly related to #2675 and #2671?
matcher = Matcher(en_vocab)
@ -22,21 +21,20 @@ def test_issue1971(en_vocab):
# We could also assert length 1 here, but this is more conclusive, because
# the real problem here is that it returns a duplicate match for a match_id
# that's not actually in the vocab!
assert all(match_id in en_vocab.strings for match_id, start, end in matcher(doc))
matches = matcher(doc)
assert all([match_id in en_vocab.strings for match_id, start, end in matches])
@pytest.mark.xfail
def test_issue_1971_2(en_vocab):
matcher = Matcher(en_vocab)
pattern1 = [{"LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
pattern2 = list(reversed(pattern1))
pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] #{"IN": ["EUR"]}}]
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
matcher.add("TEST", None, pattern1, pattern2)
matcher.add("TEST1", None, pattern1, pattern2)
matches = matcher(doc)
assert len(matches) == 2
@pytest.mark.xfail
def test_issue_1971_3(en_vocab):
"""Test that pattern matches correctly for multiple extension attributes."""
Token.set_extension("a", default=1)
@ -50,7 +48,6 @@ def test_issue_1971_3(en_vocab):
assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
# @pytest.mark.xfail
def test_issue_1971_4(en_vocab):
"""Test that pattern matches correctly with multiple extension attribute
values on a single token.

View File

@ -7,10 +7,22 @@ from spacy.displacy import render
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy
from spacy.lang.en import English
from ..util import add_vecs_to_vocab, get_doc
@pytest.mark.xfail(
reason="The dot is now properly split off, but the prefix/suffix rules are not applied again afterwards."
"This means that the quote will still be attached to the remaining token."
)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled appropriately."""
nlp = English()
doc = nlp('First sentence."A quoted sentence" he said ...')
assert len(doc) == 11
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()

View File

@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
def test_issue2656():
""" Test that tokenizer correctly splits of punctuation after numbers with decimal points """
text = "I went for 40.3, and got home by 10.0."
nlp = English()
doc = nlp(text)
assert len(doc) == 11
assert doc[0].text == "I"
assert doc[1].text == "went"
assert doc[2].text == "for"
assert doc[3].text == "40.3"
assert doc[4].text == ","
assert doc[5].text == "and"
assert doc[6].text == "got"
assert doc[7].text == "home"
assert doc[8].text == "by"
assert doc[9].text == "10.0"
assert doc[10].text == "."

View File

@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian
def test_issue2822():
""" Test that the abbreviation of poco is kept as one word """
nlp = Italian()
text = "Vuoi un po' di zucchero?"
doc = nlp(text)
assert len(doc) == 6
assert doc[0].text == "Vuoi"
assert doc[1].text == "un"
assert doc[2].text == "po'"
assert doc[2].lemma_ == "poco"
assert doc[3].text == "di"
assert doc[4].text == "zucchero"
assert doc[5].text == "?"

View File

@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French
def test_issue2926():
""" Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
nlp = French()
text = "Learn html5/css3/javascript/jquery"
doc = nlp(text)
assert len(doc) == 8
assert doc[0].text == "Learn"
assert doc[1].text == "html5"
assert doc[2].text == "/"
assert doc[3].text == "css3"
assert doc[4].text == "/"
assert doc[5].text == "javascript"
assert doc[6].text == "/"
assert doc[7].text == "jquery"

View File

@ -0,0 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.de import German
def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl')
assert len(doc) == 5

View File

@ -8,7 +8,6 @@ from spacy import displacy
from ..util import get_doc
@pytest.mark.xfail
def test_issue3288(en_vocab):
"""Test that retokenization works correctly via displaCy when punctuation
is merged onto the preceding token and tensor is resized."""

View File

@ -5,7 +5,6 @@ import pytest
from spacy.lang.en import English
@pytest.mark.xfail
def test_issue3289():
"""Test that Language.to_bytes handles serializing a pipeline component
with an uninitialized model."""

View File

@ -9,104 +9,104 @@ import pytest
NAUGHTY_STRINGS = [
# ASCII punctuation
",./;'[]\-=",
'<>?:"{}|_+',
'!@#$%^&*()`~"',
r",./;'[]\-=",
r'<>?:"{}|_+',
r'!@#$%^&*()`~"',
# Unicode additional control characters, byte order marks
"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪",
"",
r"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪",
r"",
# Unicode Symbols
"Ω≈ç√∫˜µ≤≥÷",
"åß∂ƒ©˙∆˚¬…æ",
r"Ω≈ç√∫˜µ≤≥÷",
r"åß∂ƒ©˙∆˚¬…æ",
"œ∑´®†¥¨ˆøπ“‘",
"¡™£¢∞§¶•ªº–≠",
"¸˛Ç◊ı˜Â¯˘¿",
"ÅÍÎÏ˝ÓÔÒÚÆ☃",
"Œ„´‰ˇÁ¨ˆØ∏”’",
"`⁄€‹›fifl‡°·‚—±",
"⅛⅜⅝⅞",
"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
"٠١٢٣٤٥٦٧٨٩",
r"¡™£¢∞§¶•ªº–≠",
r"¸˛Ç◊ı˜Â¯˘¿",
r"ÅÍÎÏ˝ÓÔÒÚÆ☃",
r"Œ„´‰ˇÁ¨ˆØ∏”’",
r"`⁄€‹›fifl‡°·‚—±",
r"⅛⅜⅝⅞",
r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
r"٠١٢٣٤٥٦٧٨٩",
# Unicode Subscript/Superscript/Accents
"⁰⁴⁵",
"₀₁₂",
"⁰⁴⁵₀₁₂",
"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
r"⁰⁴⁵",
r"₀₁₂",
r"⁰⁴⁵₀₁₂",
r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
# Two-Byte Characters
"田中さんにあげて下さい",
"パーティーへ行かないか",
"和製漢語",
"部落格",
"사회과학원 어학연구소",
"찦차를 타고 온 펲시맨과 쑛다리 똠방각하",
"社會科學院語學研究所",
"울란바토르",
"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
r"田中さんにあげて下さい",
r"パーティーへ行かないか",
r"和製漢語",
r"部落格",
r"사회과학원 어학연구소",
r"찦차를 타고 온 펲시맨과 쑛다리 똠방각하",
r"社會科學院語學研究所",
r"울란바토르",
r"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
# Japanese Emoticons
"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
"(。◕ ∀ ◕。)",
"`ィ(´∀`∩",
"__ロ(,_,*)",
"・( ̄∀ ̄)・:*:",
"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
"┬─┬ノ( º _ ºノ)",
"( ͡° ͜ʖ ͡°)",
r"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
r"(。◕ ∀ ◕。)",
r"`ィ(´∀`∩",
r"__ロ(,_,*)",
r"・( ̄∀ ̄)・:*:",
r"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
r",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
r"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
r"┬─┬ノ( º _ ºノ)",
r"( ͡° ͜ʖ ͡°)",
# Emoji
"😍",
"👩🏽",
"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍",
"🐵 🙈 🙉 🙊",
"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙",
"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
"0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟",
r"😍",
r"👩🏽",
r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍",
r"🐵 🙈 🙉 🙊",
r"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙",
r"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
r"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
r"0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟",
# Regional Indicator Symbols
"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
"🇺🇸🇷🇺🇸🇦",
r"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
r"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
r"🇺🇸🇷🇺🇸🇦",
# Unicode Numbers
"",
"١٢٣",
r"",
r"١٢٣",
# Right-To-Left Strings
"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
"إيو.",
"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
"הָיְתָהtestالصفحات التّحول",
"",
"",
"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
r"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
r"إيو.",
r"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
r"הָיְתָהtestالصفحات التّحول",
r"",
r"",
r"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
# Trick Unicode
"test",
"test",
"test",
"testtest",
"test",
r"test",
r"test",
r"test",
r"testtest",
r"test",
# Zalgo Text
"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
r"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
r"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
r"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
r"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
# Unicode Upsidedown
"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
"00˙Ɩ$-",
r"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
r"00˙Ɩ$-",
# Unicode font
" ",
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌",
"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈",
"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰",
"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
r" ",
r"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
r"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌",
r"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈",
r"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰",
r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
r"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
r"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
# File paths
"../../../../../../../../../../../etc/passwd%00",
"../../../../../../../../../../../etc/hosts",
r"../../../../../../../../../../../etc/passwd%00",
r"../../../../../../../../../../../etc/hosts",
# iOS Vulnerabilities
"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
"🏳0🌈",
r"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
r"🏳0🌈",
]

View File

@ -407,7 +407,7 @@ cdef class Tokenizer:
if data.get('infix_finditer'):
self.infix_finditer = re.compile(data['infix_finditer']).finditer
if data.get('token_match'):
self.token_match = re.compile(data['token_match']).search
self.token_match = re.compile(data['token_match']).match
for string, substrings in data.get('rules', {}).items():
self.add_special_case(string, substrings)
return self
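The hunk above restores `.match` (anchored at the start of the string) in place of `.search` when deserializing `token_match`. A tiny illustration of the behavioural difference, using a made-up pattern rather than spaCy's actual token-match regex:

```python
import re

token_match = re.compile(r"\d+")           # illustrative pattern only
print(bool(token_match.search("abc123")))  # True  -- found anywhere in the string
print(bool(token_match.match("abc123")))   # False -- must match from the first character
```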

View File

@ -222,7 +222,7 @@ def _bulk_merge(Doc doc, merges):
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor,
[(m[1][0].start, m[1][0].end) for m in merges])
[(m[0].start, m[0].end) for m in merges])
# Memorize span roots and sets dependencies of the newly merged
# tokens to the dependencies of their roots.
span_roots = []

View File

@ -102,11 +102,12 @@ language and training a language model.
In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs
to know the language's character set. If the language you're adding uses
non-latin characters, you might need to add the required character classes to
non-latin characters, you might need to define the required character classes in
the global
[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py).
spaCy uses the [`regex` library](https://pypi.python.org/pypi/regex/) to keep
this simple and readable. If the language requires very specific punctuation
For efficiency, spaCy uses hard-coded unicode ranges to define character classes,
the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block).
If the language requires very specific punctuation
rules, you should consider overwriting the default regular expressions with your
own in the language's `Defaults`.
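To make the convention concrete: each character class in `char_classes.py` is a plain string of escaped code-point ranges, grouped with small helpers like `group_chars` (shown in the hunk further up) and interpolated into tokenizer regexes. A minimal, hypothetical sketch; the class name, the Bengali range and the infix rule below are illustrative only:

```python
import re

group_chars = lambda char: char.strip().replace(" ", "")

# Hypothetical class: one escaped unicode range per script block.
_my_script = r"\u0980-\u09FF"        # the Bengali block, as an example
ALPHA_DEMO = group_chars(_my_script)

# Used the same way as the punctuation rules: split a hyphen only when it
# sits between two alphabetic characters of that class.
infix_re = re.compile(r"(?<=[{a}])-(?=[{a}])".format(a=ALPHA_DEMO))
print(bool(infix_re.search("লাল-নীল")))   # True
```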

View File

@ -215,6 +215,22 @@ if all of your models are up to date, you can run the
means that the `Matcher` in v2.1.x may produce different results compared to
the `Matcher` in v2.0.x.
- The deprecated [`Doc.merge`](/api/doc#merge) and
[`Span.merge`](/api/span#merge) methods still work, but you may notice that
they now run slower when merging many objects in a row. That's because the
merging engine was rewritten to be more reliable and to support more efficient
merging **in bulk**. To take advantage of this, you should rewrite your logic
to use the [`Doc.retokenize`](/api/doc#retokenize) context manager and perform
as many merges as possible together in the `with` block.
```diff
- doc[1:5].merge()
- doc[6:8].merge()
+ with doc.retokenize() as retokenizer:
+ retokenizer.merge(doc[1:5])
+ retokenizer.merge(doc[6:8])
```
- For better compatibility with the Universal Dependencies data, the lemmatizer
now preserves capitalization, e.g. for proper nouns. See
[this issue](https://github.com/explosion/spaCy/issues/3256) for details.
@ -227,6 +243,11 @@ if all of your models are up to date, you can run the
+ sentence_splitter = nlp.create_pipe("sentencizer")
```
- The keyword argument `n_threads` on the `.pipe` methods is now deprecated, as
the v2.x models cannot release the global interpreter lock. (Future versions
may introduce an `n_process` argument for parallel inference via
multiprocessing.)
- The `Doc.print_tree` method is now deprecated. If you need a custom nested
JSON representation of a `Doc` object, you might want to write your own helper
function. For a simple and consistent JSON representation of the `Doc` object

View File

@ -1,4 +1,7 @@
redirects = [
# Netlify
# TODO: uncomment once the site is switched over
# {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat" },
# Old demos
{from = "/demos/*", to = "https://explosion.ai/demos/:splat"},
# Old blog

View File

@ -71,11 +71,11 @@ const SEO = ({ description, lang, title, section, sectionTitle, bodyClass }) =>
},
{
name: 'twitter:creator',
content: `@${data.site.siteMetadata.social.twitter}`,
content: `@${siteMetadata.social.twitter}`,
},
{
name: 'twitter:site',
content: `@${data.site.siteMetadata.social.twitter}`,
content: `@${siteMetadata.social.twitter}`,
},
{
name: 'twitter:title',
@ -126,8 +126,6 @@ const query = graphql`
title
description
slogan
siteUrl
email
social {
twitter
}