Merge branch 'develop' into spacy.io

Ines Montani 2019-02-21 12:34:51 +01:00
commit ab8392eda3
34 changed files with 456 additions and 189 deletions


@ -10,12 +10,13 @@ Compatible with: spaCy v2.0.0+
""" """
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
from toolz import partition_all
from pathlib import Path from pathlib import Path
from joblib import Parallel, delayed from joblib import Parallel, delayed
from functools import partial
import thinc.extra.datasets import thinc.extra.datasets
import plac import plac
import spacy import spacy
from spacy.util import minibatch
@plac.annotations( @plac.annotations(
@ -35,10 +36,10 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
data, _ = thinc.extra.datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
-partitions = partition_all(batch_size, texts)
+partitions = minibatch(texts, size=batch_size)
-executor = Parallel(n_jobs=n_jobs)
+executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
-do = delayed(transform_texts)
+do = delayed(partial(transform_texts, nlp))
-tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
+tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
executor(tasks)
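For reference, a minimal runnable sketch of the updated parallelisation pattern in this hunk, assuming the en_core_web_sm model is installed; the transform_texts worker body and the output directory below are placeholders standing in for the ones in the example script:

from functools import partial
from pathlib import Path

import spacy
from joblib import Parallel, delayed
from spacy.util import minibatch


def transform_texts(nlp, batch_id, texts, output_dir):
    # Placeholder worker: write one whitespace-joined line per processed doc.
    out_path = Path(output_dir) / ("batch_{}.txt".format(batch_id))
    with out_path.open("w", encoding="utf8") as f:
        for doc in nlp.pipe(texts):
            f.write(" ".join(token.text for token in doc) + "\n")


if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")  # assumption: small English model installed
    texts = ["This is a text about movies.", "And this is another one."] * 100
    output_dir = Path("parallel_output")  # hypothetical output location
    output_dir.mkdir(exist_ok=True)
    partitions = minibatch(texts, size=50)
    executor = Parallel(n_jobs=2, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
    executor(tasks)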


@ -31,7 +31,7 @@ TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
-# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']})
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),

fabfile.py

@ -59,6 +59,7 @@ def make():
def sdist():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
+local('python -m pip install -U setuptools')
local('python setup.py sdist')
def wheel():


@ -4,7 +4,7 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "2.1.0a7"
+__version__ = "2.1.0a8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"


@ -5,41 +5,143 @@ split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|") merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "") group_chars = lambda char: char.strip().replace(" ", "")
# used https://unicode.org/cldr/utility/list-unicodeset.jsp to convert categories into code points
# \p{L}&&\p{Bengali}
# https://en.wikipedia.org/wiki/Bengali_(Unicode_block)
_bengali = r"\u0980-\u09FF" _bengali = r"\u0980-\u09FF"
# \p{L}&&\p{Hebrew}
# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet
_hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
# \p{Ll}&&\p{Latin} # Latin standard
_latin_lower = r"a-z\u00DF-\u00F6\u00F8-\u00FF\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175\u0177\u017A\u017C\u017E-\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242\u0247\u0249\u024B\u024D\u024F-\u0293\u0295-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF\u214E\u2184\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA\uAB30-\uAB5A\uAB60-\uAB64\uFB00-\uFB06\uFF41-\uFF5A" _latin_u_standard = r"A-Z"
# \p{Lu}&&\p{Latin} _latin_l_standard = r"a-z"
_latin_upper = r"A-Z\u00C0-\u00D6\u00D8-\u00DE\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176\u0178\u0179\u017B\u017D\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE\u212A\u212B\u2132\u2183\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8\uFF21-\uFF3A" _latin_standard = _latin_u_standard + _latin_l_standard
# [\p{Ll}||\p{Lu}]&&\p{Latin}
latin = r"A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u01BA\u01BC-\u01BF\u01C4\u01C6\u01C7\u01C9\u01CA\u01CC-\u01F1\u01F3-\u0293\u0295-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A\u1E00-\u1EFF\u212A\u212B\u2132\u214E\u2183\u2184\u2C60-\u2C7B\u2C7E\u2C7F\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA\uAB30-\uAB5A\uAB60-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A"
# \p{L}&&\p{Arabic} _latin_u_standard_fullwidth = r"\uFF21-\uFF3A"
_persian = r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB" _latin_l_standard_fullwidth = r"\uFF41-\uFF5A"
_latin_standard_fullwidth = _latin_u_standard_fullwidth + _latin_l_standard_fullwidth
# letters with diacritics - French, German, Icelandic, Spanish
_latin_u_supplement = r"\u00C0-\u00D6\u00D8-\u00DE"
_latin_l_supplement = r"\u00DF-\u00F6\u00F8-\u00FF"
_latin_supplement = r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF"
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" \
r"\u011E\u0120\u0122\u0124\u0126\u0128\u012A\u012C\u012E\u0130\u0132\u0134\u0136\u0139\u013B" \
r"\u013D\u013F\u0141\u0143\u0145\u0147\u014A\u014C\u014E\u0150\u0152\u0154\u0156\u0158" \
r"\u015A\u015C\u015E\u0160\u0162\u0164\u0166\u0168\u016A\u016C\u016E\u0170\u0172\u0174\u0176" \
r"\u0178\u0179\u017B\u017D"
_latin_l_extendedA = r"\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D" \
r"\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A" \
r"\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157" \
r"\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175" \
r"\u0177\u017A\u017C\u017E\u017F"
_latin_extendedA = r"\u0100-\u017F"
# special characters - Khoisan, Pan-Nigerian, Pinyin, Romanian
# those that are a combination of both upper and lower letters are only included in the group _latin_extendedB
_latin_u_extendedB = r"\u0181\u0182\u0184\u0186\u0187\u0189-\u018B\u018E-\u0191\u0193\u0194\u0196-\u0198\u019C" \
r"\u019D\u019F\u01A0\u01A2\u01A4\u01A6\u01A7\u01A9\u01AC\u01AE\u01AF\u01B1-\u01B3\u01B5" \
r"\u01B7\u01B8\u01BC\u01C4\u01C7\u01CA\u01CD\u01CF\u01D1\u01D3\u01D5\u01D7\u01D9\u01DB" \
r"\u01DE\u01E0\u01E2\u01E4\u01E6\u01E8\u01EA\u01EC\u01EE\u01F1\u01F4\u01F6-\u01F8\u01FA" \
r"\u01FC\u01FE\u0200\u0202\u0204\u0206\u0208\u020A\u020C\u020E\u0210\u0212\u0214\u0216" \
r"\u0218\u021A\u021C\u021E\u0220\u0222\u0224\u0226\u0228\u022A\u022C\u022E\u0230\u0232" \
r"\u023A\u023B\u023D\u023E\u0241\u0243-\u0246\u0248\u024A\u024C\u024E"
_latin_l_extendedB = r"\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5" \
r"\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9\u01BA\u01BD-\u01BF\u01C6\u01C9\u01CC" \
r"\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7" \
r"\u01E9\u01EB\u01ED\u01EF\u01F0\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205" \
r"\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221" \
r"\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242" \
r"\u0247\u0249\u024B\u024D\u024F"
_latin_extendedB = r"\u0180-\u01BF\u01C4-\u024F"
# special characters - Uighur, Uralic Phonetic
_latin_u_extendedC = r"\u2C60\u2C62-\u2C64\u2C67\u2C69\u2C6B\u2C6D-\u2C70\u2C72\u2C75\u2C7E\u2C7F"
_latin_l_extendedC = r"\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7B"
_latin_extendedC = r"\u2C60-\u2C7B\u2C7E\u2C7F"
# special characters - phonetic, Mayan, Medieval
_latin_u_extendedD = r"\uA722\uA724\uA726\uA728\uA72A\uA72C\uA72E\uA732\uA734\uA736\uA738\uA73A\uA73C" \
r"\uA73E\uA740\uA742\uA744\uA746\uA748\uA74A\uA74C\uA74E\uA750\uA752\uA754\uA756\uA758" \
r"\uA75A\uA75C\uA75E\uA760\uA762\uA764\uA766\uA768\uA76A\uA76C\uA76E\uA779\uA77B\uA77D" \
r"\uA77E\uA780\uA782\uA784\uA786\uA78B\uA78D\uA790\uA792\uA796\uA798\uA79A\uA79C\uA79E" \
r"\uA7A0\uA7A2\uA7A4\uA7A6\uA7A8\uA7AA-\uA7AE\uA7B0-\uA7B4\uA7B6\uA7B8"
_latin_l_extendedD = r"\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D" \
r"\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759" \
r"\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F\uA771-\uA778\uA77A" \
r"\uA77C\uA77F\uA781\uA783\uA785\uA787\uA78C\uA78E\uA791\uA793-\uA795\uA797\uA799\uA79B" \
r"\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AF\uA7B5\uA7B7\uA7B9\uA7FA"
_latin_extendedD = r"\uA722-\uA76F\uA771-\uA787\uA78B-\uA78E\uA790-\uA7B9\uA7FA"
# special characters - phonetic Teuthonista and Sakha
_latin_l_extendedE = r"\uAB30-\uAB5A\uAB60-\uAB64"
_latin_extendedE = _latin_l_extendedE
# phonetic letters - Greek, Latin, Cyrillic
_latin_l_phonetic = r"\u0250-\u02AF\u1D00-\u1D25\u1D6B-\u1D77\u1D79-\u1D9A"
_latin_phonetic = _latin_l_phonetic
# letters with multiple diacritics - Vietnamese
_latin_u_diacritics = r"\u1E00\u1E02\u1E04\u1E06\u1E08\u1E0A\u1E0C\u1E0E\u1E10\u1E12\u1E14\u1E16\u1E18\u1E1A" \
r"\u1E1C\u1E1E\u1E20\u1E22\u1E24\u1E26\u1E28\u1E2A\u1E2C\u1E2E\u1E30\u1E32\u1E34\u1E36" \
r"\u1E38\u1E3A\u1E3C\u1E3E\u1E40\u1E42\u1E44\u1E46\u1E48\u1E4A\u1E4C\u1E4E\u1E50\u1E52" \
r"\u1E54\u1E56\u1E58\u1E5A\u1E5C\u1E5E\u1E60\u1E62\u1E64\u1E66\u1E68\u1E6A\u1E6C\u1E6E" \
r"\u1E70\u1E72\u1E74\u1E76\u1E78\u1E7A\u1E7C\u1E7E\u1E80\u1E82\u1E84\u1E86\u1E88\u1E8A" \
r"\u1E8C\u1E8E\u1E90\u1E92\u1E94\u1E9E\u1EA0\u1EA2\u1EA4\u1EA6\u1EA8\u1EAA\u1EAC\u1EAE" \
r"\u1EB0\u1EB2\u1EB4\u1EB6\u1EB8\u1EBA\u1EBC\u1EBE\u1EC0\u1EC2\u1EC4\u1EC6\u1EC8" \
r"\u1ECA\u1ECC\u1ECE\u1ED0\u1ED2\u1ED4\u1ED6\u1ED8\u1EDA\u1EDC\u1EDE\u1EE0\u1EE2\u1EE4" \
r"\u1EE6\u1EE8\u1EEA\u1EEC\u1EEE\u1EF0\u1EF2\u1EF4\u1EF6\u1EF8\u1EFA\u1EFC\u1EFE"
_latin_l_diacritics = r"\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B" \
r"\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37" \
r"\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53" \
r"\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F" \
r"\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B" \
r"\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD" \
r"\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9" \
r"\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5" \
r"\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF"
_latin_diacritics = r"\u1E00-\u1EFF"
# all lower latin classes
LATIN_LOWER_BASIC = _latin_l_standard + _latin_l_standard_fullwidth + _latin_l_supplement + _latin_l_extendedA
LATIN_LOWER = LATIN_LOWER_BASIC + _latin_l_extendedB + _latin_l_extendedC + _latin_l_extendedD + _latin_l_extendedE \
+ _latin_l_phonetic + _latin_l_diacritics
# all upper latin classes
LATIN_UPPER_BASIC = _latin_u_standard + _latin_u_standard_fullwidth + _latin_u_supplement + _latin_u_extendedA
LATIN_UPPER = LATIN_UPPER_BASIC + _latin_u_extendedB + _latin_u_extendedC + _latin_u_extendedD + _latin_u_diacritics
# all latin classes
LATIN_BASIC = _latin_standard + _latin_standard_fullwidth + _latin_supplement + _latin_extendedA
LATIN = LATIN_BASIC + _latin_extendedB + _latin_extendedC + _latin_extendedD + _latin_extendedE \
+ _latin_phonetic + _latin_diacritics
_persian = r"\u0620-\u064A\u066E-\u06D5\u06E5-\u06FF\u0750-\u077F\u08A0-\u08BD" \
r"\uFB50-\uFBB1\uFBD3-\uFD3D\uFD50-\uFDC7\uFDF0-\uFDFB\uFE70-\uFEFC\U0001EE00-\U0001EEBB"
_russian_lower = r"ёа-я" _russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я" _russian_upper = r"ЁА-Я"
# \p{L}&&\p{Sinhala} _russian = r"ёа-яЁА-Я"
_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6" _sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
_tatar_lower = r"әөүҗңһ" _tatar_lower = r"әөүҗңһ"
_tatar_upper = r"ӘӨҮҖҢҺ" _tatar_upper = r"ӘӨҮҖҢҺ"
_tatar = r"әөүҗңһӘӨҮҖҢҺ"
_greek_lower = r"α-ωάέίόώήύ" _greek_lower = r"α-ωάέίόώήύ"
_greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ"
_greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ"
_ukrainian_lower = r"а-щюяіїєґ" _ukrainian_lower = r"а-щюяіїєґ"
_ukrainian_upper = r"А-ЩЮЯІЇЄҐ" _ukrainian_upper = r"А-ЩЮЯІЇЄҐ"
_ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_upper = _latin_upper + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = _latin_lower + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = _bengali + _hebrew + _persian + _sinhala _uncased = _bengali + _hebrew + _persian + _sinhala
ALPHA = group_chars(_upper + _lower + _uncased) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased)
@ -63,7 +165,33 @@ _hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji # Various symbols like dingbats, but also emoji
# Details: https://www.compart.com/en/unicode/category/So # Details: https://www.compart.com/en/unicode/category/So
_other_symbols = r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D" _other_symbols = r"\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70" \
r"\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34" \
r"\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399" \
r"\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116" \
r"\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B" \
r"\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3" \
r"\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB" \
r"\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E" \
r"\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95" \
r"\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB" \
r"\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3" \
r"\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF" \
r"\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC" \
r"\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B" \
r"\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F" \
r"\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164" \
r"\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8" \
r"\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A" \
r"\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B" \
r"\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF" \
r"\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202" \
r"\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265" \
r"\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9" \
r"\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847" \
r"\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B" \
r"\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2" \
r"\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D"
UNITS = merge_chars(_units) UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency) CURRENCY = merge_chars(_currency)
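As a quick illustration of how these grouped classes are meant to be consumed (a sketch, not the library's own punctuation rules), the merged strings drop straight into regex character sets:

import re

from spacy.lang.char_classes import ALPHA

# A token-internal hyphen between two alphabetic characters, composed the same
# way the infix rules build their patterns from the grouped classes.
infix_hyphen = re.compile(r"(?<=[{a}])-(?=[{a}])".format(a=ALPHA))

print(bool(infix_hyphen.search("co-operate")))   # True
print(bool(infix_hyphen.search("co- operate")))  # False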


@ -7076,14 +7076,6 @@ FR_BASE_EXCEPTIONS = [
"au-lof", "au-lof",
"au-tour", "au-tour",
"aube-vigne", "aube-vigne",
"audio-numérique",
"audio-numériques",
"audio-prothésiste",
"audio-prothésistes",
"audio-visuel",
"audio-visuelle",
"audio-visuelles",
"audio-visuels",
"aujourd'hui", "aujourd'hui",
"aulnaie-frênaie", "aulnaie-frênaie",
"aulnaies-frênaies", "aulnaies-frênaies",
@ -14400,7 +14392,6 @@ FR_BASE_EXCEPTIONS = [
"attaques surprises", "attaques surprises",
"attaques-surprises", "attaques-surprises",
"attrape-con", "attrape-con",
"audio-oral",
"auriculo-cardiaque", "auriculo-cardiaque",
"auriculo-temporal", "auriculo-temporal",
"austro-bavarois", "austro-bavarois",


@ -3,12 +3,15 @@ from __future__ import unicode_literals
import re
-from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG
+# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
+# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
def upper_first_letter(text):
if len(text) == 0:
@ -128,6 +131,7 @@ _hyphen_prefix = [
"arcs?", "arcs?",
"archi", "archi",
"arrières?", "arrières?",
"audio",
"avant", "avant",
"avion", "avion",
"auto", "auto",


@ -636,17 +636,17 @@ for orth in [
_exc[orth] = [{ORTH: orth}]
-_ord_num_or_date = "([A-Z0-9]+[./-])*(\d+\.?)"
+_ord_num_or_date = r"([A-Z0-9]+[./-])*(\d+\.?)"
-_num = "[+\-]?\d+([,.]\d+)*"
+_num = r"[+\-]?\d+([,.]\d+)*"
-_ops = "[=<>+\-\*/^()÷%²]"
+_ops = r"[=<>+\-\*/^()÷%²]"
-_suffixes = "-[{al}]+".format(al=ALPHA_LOWER)
+_suffixes = r"-[{al}]+".format(al=ALPHA_LOWER)
-_numeric_exp = "({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)
+_numeric_exp = r"({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)
-_time_exp = "\d+(:\d+)*(\.\d+)?"
+_time_exp = r"\d+(:\d+)*(\.\d+)?"
-_nums = "(({ne})|({t})|({on})|({c}))({s})?".format(
+_nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
ne=_numeric_exp, t=_time_exp, on=_ord_num_or_date, c=CURRENCY, s=_suffixes
)
TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match


@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -20,7 +21,7 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
-tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
tag_map = TAG_MAP
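A small sketch of what the merge above produces, assuming a spaCy version that ships the new Italian tokenizer_exceptions module:

from spacy.lang.it.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.util import update_exc

exc = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
print("po'" in exc)  # the new special case ends up in the merged exceptions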


@ -0,0 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
_exc = {
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
}
TOKENIZER_EXCEPTIONS = _exc


@ -39,10 +39,10 @@ _infixes = (
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
+r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)


@ -44,7 +44,7 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
cdef Pool mem = Pool()
predicate_cache = <char*>mem.alloc(doc.length * len(predicates), sizeof(char))
if extensions is not None and len(extensions) >= 1:
-nr_extra_attr = max(extensions.values())
+nr_extra_attr = max(extensions.values()) + 1
extra_attr_values = <attr_t*>mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
else:
nr_extra_attr = 0
@ -60,9 +60,8 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
for i in range(doc.length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
-transition_states(states, matches, predicate_cache,
+transition_states(states, matches, &predicate_cache[i],
doc[i], extra_attr_values, predicates)
-predicate_cache += nr_predicate
extra_attr_values += nr_extra_attr
# Handle matches that end in 0-width patterns
finish_states(matches, states)
@ -74,6 +73,7 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
matches[i].start,
matches[i].start+matches[i].length
)
+
# We need to deduplicate, because we could otherwise arrive at the same
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
# first .?, or the second .? -- it doesn't matter, it's just one match.
@ -89,7 +89,8 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
# showed this wasn't the case when we had a reject-and-continue before a
# match. I still don't really understand what's going on here, but this
# workaround does resolve the issue.
-while pattern.attrs.attr != ID and pattern.nr_attr > 0:
+while pattern.attrs.attr != ID and \
+(pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0):
pattern += 1
return pattern.attrs.value
@ -101,13 +102,17 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
cdef vector[PatternStateC] new_states
cdef int nr_predicate = len(py_predicates)
for i in range(states.size()):
-if states[i].pattern.nr_py != 0:
+if states[i].pattern.nr_py >= 1:
update_predicate_cache(cached_py_predicates,
states[i].pattern, token, py_predicates)
+for i in range(states.size()):
action = get_action(states[i], token.c, extra_attrs,
-cached_py_predicates, nr_predicate)
+cached_py_predicates)
if action == REJECT:
continue
+# Keep only a subset of states (the active ones). Index q is the
+# states which are still alive. If we reject a state, we overwrite
+# it in the states list, because q doesn't advance.
state = states[i]
states[q] = state
while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
@ -126,7 +131,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
update_predicate_cache(cached_py_predicates,
states[q].pattern, token, py_predicates)
action = get_action(states[q], token.c, extra_attrs,
-cached_py_predicates, nr_predicate)
+cached_py_predicates)
if action == REJECT:
pass
elif action == ADVANCE:
@ -154,8 +159,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
states.push_back(new_states[i])
-cdef void update_predicate_cache(char* cache,
+cdef int update_predicate_cache(char* cache,
-const TokenPatternC* pattern, Token token, predicates):
+const TokenPatternC* pattern, Token token, predicates) except -1:
# If the state references any extra predicates, check whether they match.
# These are cached, so that we don't call these potentially expensive
# Python functions more than we need to.
@ -192,7 +197,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states)
cdef action_t get_action(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs,
-const char* predicate_matches, int nr_predicate) nogil:
+const char* predicate_matches) nogil:
'''We need to consider:
a) Does the token match the specification? [Yes, No]
@ -252,7 +257,7 @@ cdef action_t get_action(PatternStateC state,
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
cdef char is_match
-is_match = get_is_match(state, token, extra_attrs, predicate_matches, nr_predicate)
+is_match = get_is_match(state, token, extra_attrs, predicate_matches)
quantifier = get_quantifier(state)
is_final = get_is_final(state)
if quantifier == ZERO:
@ -303,9 +308,9 @@ cdef action_t get_action(PatternStateC state,
cdef char get_is_match(PatternStateC state,
const TokenC* token, const attr_t* extra_attrs,
-const char* predicate_matches, int nr_predicate) nogil:
+const char* predicate_matches) nogil:
-for i in range(nr_predicate):
+for i in range(state.pattern.nr_py):
-if predicate_matches[i] == -1:
+if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0
spec = state.pattern
for attr in spec.attrs[:spec.nr_attr]:
@ -333,7 +338,7 @@ DEF PADDING = 5
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
-cdef int i
+cdef int i, index
for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
pattern[i].quantifier = quantifier
pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
@ -356,11 +361,13 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
pattern[i].nr_attr = 0
+pattern[i].nr_extra_attr = 0
+pattern[i].nr_py = 0
return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
-while pattern.nr_attr != 0:
+while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
if id_attr.attr != ID:
@ -384,7 +391,6 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
extra_predicates.
"""
tokens = []
-seen_predicates = {}
for spec in token_specs:
if not spec:
# Signifier for 'any token'
@ -393,7 +399,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
-predicates = _get_extra_predicates(spec, extra_predicates, seen_predicates)
+predicates = _get_extra_predicates(spec, extra_predicates)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
return tokens
@ -430,6 +436,7 @@ class _RegexPredicate(object):
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
+self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate == 'REGEX'
def __call__(self, Token token):
@ -447,6 +454,7 @@ class _SetMemberPredicate(object):
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
+self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate in ('IN', 'NOT_IN')
def __call__(self, Token token):
@ -459,6 +467,9 @@ class _SetMemberPredicate(object):
else:
return value not in self.value
+def __repr__(self):
+return repr(('SetMemberPredicate', self.i, self.attr, self.value, self.predicate))
class _ComparisonPredicate(object):
def __init__(self, i, attr, value, predicate, is_extension=False):
@ -467,6 +478,7 @@ class _ComparisonPredicate(object):
self.value = value
self.predicate = predicate
self.is_extension = is_extension
+self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
assert self.predicate in ('==', '!=', '>=', '<=', '>', '<')
def __call__(self, Token token):
@ -488,7 +500,7 @@ class _ComparisonPredicate(object):
return value < self.value
-def _get_extra_predicates(spec, extra_predicates, seen_predicates):
+def _get_extra_predicates(spec, extra_predicates):
predicate_types = {
'REGEX': _RegexPredicate,
'IN': _SetMemberPredicate,
@ -499,6 +511,7 @@ def _get_extra_predicates(spec, extra_predicates, seen_predicates):
'>': _ComparisonPredicate,
'<': _ComparisonPredicate,
}
+seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
for attr, value in spec.items():
if isinstance(attr, basestring):
@ -516,16 +529,15 @@ def _get_extra_predicates(spec, extra_predicates, seen_predicates):
if isinstance(value, dict):
for type_, cls in predicate_types.items():
if type_ in value:
-key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+predicate = cls(len(extra_predicates), attr, value[type_], type_)
# Don't create a redundant predicates.
# This helps with efficiency, as we're caching the results.
-if key in seen_predicates:
+if predicate.key in seen_predicates:
-output.append(seen_predicates[key])
+output.append(seen_predicates[predicate.key])
else:
-predicate = cls(len(extra_predicates), attr, value[type_], type_)
extra_predicates.append(predicate)
output.append(predicate.i)
-seen_predicates[key] = predicate.i
+seen_predicates[predicate.key] = predicate.i
return output
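The deduplication above hinges on each predicate exposing a stable key. A pure-Python sketch of the same idea (simplified to set-membership predicates only, using json instead of srsly):

import json


class SetMemberPredicate(object):
    def __init__(self, i, attr, value, predicate):
        self.i = i
        self.attr = attr
        self.value = set(value)
        self.predicate = predicate
        # Equal predicates produce equal keys, so they can share one slot.
        self.key = (attr, predicate, json.dumps(value, sort_keys=True))

    def __call__(self, token_value):
        found = token_value in self.value
        return found if self.predicate == "IN" else not found


def get_extra_predicates(spec, extra_predicates):
    # Rebuild the "seen" map from the predicates collected so far, as the
    # updated _get_extra_predicates does, instead of threading it through.
    seen_predicates = {pred.key: pred.i for pred in extra_predicates}
    output = []
    for attr, value in spec.items():
        if isinstance(value, dict):
            for type_ in ("IN", "NOT_IN"):
                if type_ in value:
                    predicate = SetMemberPredicate(len(extra_predicates), attr, value[type_], type_)
                    if predicate.key in seen_predicates:
                        output.append(seen_predicates[predicate.key])
                    else:
                        extra_predicates.append(predicate)
                        output.append(predicate.i)
                        seen_predicates[predicate.key] = predicate.i
    return output


preds = []
print(get_extra_predicates({"ORTH": {"IN": ["a", "the"]}}, preds))  # [0]
print(get_extra_predicates({"ORTH": {"IN": ["a", "the"]}}, preds))  # [0], reused
print(len(preds))  # 1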


@ -145,9 +145,7 @@ class Pipe(object):
"""Serialize the pipe to a bytestring.""" """Serialize the pipe to a bytestring."""
serialize = OrderedDict() serialize = OrderedDict()
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
if self.model in (True, False, None): if self.model not in (True, False, None):
serialize["model"] = lambda: self.model
else:
serialize["model"] = self.model.to_bytes serialize["model"] = self.model.to_bytes
serialize["vocab"] = self.vocab.to_bytes serialize["vocab"] = self.vocab.to_bytes
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
@ -538,9 +536,7 @@ class Tagger(Pipe):
def to_bytes(self, **exclude):
serialize = OrderedDict()
-if self.model in (None, True, False):
+if self.model not in (None, True, False):
-serialize['model'] = lambda: self.model
-else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
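With the inverted condition, a component whose model is still uninitialized (True/False/None) simply skips the model entry instead of trying to serialize it. A short check mirroring the regression test re-enabled later in this commit (test_issue3289):

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("textcat"))  # model not initialized yet
data = nlp.to_bytes()  # this used to fail while the model was still just `True`
print(len(data) > 0)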


@ -261,13 +261,14 @@ class ParserStepModel(Model):
# lowest score.
# numpy's nan_to_num function doesn't take a value, and nan is replaced
# by 0...-inf is replaced by minimum, so we go via that. Ugly to the max.
-scores[self.ops.xp.isnan(scores)] = -self.ops.xp.inf
-self.ops.xp.nan_to_num(scores, copy=False)
+# Note that scores is always a numpy array! Should fix #3112
+scores[numpy.isnan(scores)] = -numpy.inf
+numpy.nan_to_num(scores, copy=False)
def backprop_parser_step(d_scores, sgd=None):
# If we have a non-zero gradient for a previously unseen class,
# replace the weight with 0.
-new_classes = self.ops.xp.logical_and(
+new_classes = self.vec2scores.ops.xp.logical_and(
self.vec2scores.ops.xp.isnan(self.vec2scores.b),
d_scores.any(axis=0)
)


@ -13,10 +13,10 @@ def custom_en_tokenizer(en_vocab):
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
custom_infixes = [
-"\.\.\.+",
+r"\.\.\.+",
-"(?<=[0-9])-(?=[0-9])",
+r"(?<=[0-9])-(?=[0-9])",
-"[0-9]+(,[0-9]+)+",
+r"[0-9]+(,[0-9]+)+",
-"[\[\]!&:,()\*—–\/-]",
+r"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
return Tokenizer(


@ -22,9 +22,9 @@ import pytest
u"11-septembre", u"11-septembre",
u"11-Septembre", u"11-Septembre",
u"refox-trottâmes", u"refox-trottâmes",
u"K-POP", # u"K-POP",
u"K-Pop", # u"K-Pop",
u"K-pop", # u"K-pop",
u"z'yeutes", u"z'yeutes",
u"black-outeront", u"black-outeront",
u"états-unienne", u"états-unienne",


@ -207,14 +207,13 @@ def test_matcher_set_value(en_vocab):
assert len(matches) == 0
-@pytest.mark.xfail
def test_matcher_set_value_operator(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
matcher.add("DET_HOUSE", None, pattern)
doc = Doc(en_vocab, words=["In", "a", "house"])
matches = matcher(doc)
-assert len(matches) == 1
+assert len(matches) == 2
doc = Doc(en_vocab, words=["my", "house"])
matches = matcher(doc)
assert len(matches) == 1


@ -358,6 +358,7 @@ def test_issue850_basic():
assert end == 4
+@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)


@ -13,6 +13,22 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
+@pytest.mark.xfail(
+reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
+)
+def test_issue1235():
+"""Test that g is not split of if preceded by a number and a letter"""
+nlp = English()
+testwords = u'e2g 2g 52g'
+doc = nlp(testwords)
+assert len(doc) == 5
+assert doc[0].text == "e2g"
+assert doc[1].text == "2"
+assert doc[2].text == "g"
+assert doc[3].text == "52"
+assert doc[4].text == "g"
def test_issue1242():
nlp = English()
doc = nlp("")


@ -6,7 +6,6 @@ from spacy.matcher import Matcher
from spacy.tokens import Token, Doc
-@pytest.mark.xfail
def test_issue1971(en_vocab):
# Possibly related to #2675 and #2671?
matcher = Matcher(en_vocab)
@ -22,21 +21,20 @@ def test_issue1971(en_vocab):
# We could also assert length 1 here, but this is more conclusive, because
# the real problem here is that it returns a duplicate match for a match_id
# that's not actually in the vocab!
-assert all(match_id in en_vocab.strings for match_id, start, end in matcher(doc))
+matches = matcher(doc)
+assert all([match_id in en_vocab.strings for match_id, start, end in matches])
-@pytest.mark.xfail
def test_issue_1971_2(en_vocab):
matcher = Matcher(en_vocab)
-pattern1 = [{"LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
+pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
-pattern2 = list(reversed(pattern1))
+pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] #{"IN": ["EUR"]}}]
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
-matcher.add("TEST", None, pattern1, pattern2)
+matcher.add("TEST1", None, pattern1, pattern2)
matches = matcher(doc)
assert len(matches) == 2
-@pytest.mark.xfail
def test_issue_1971_3(en_vocab):
"""Test that pattern matches correctly for multiple extension attributes."""
Token.set_extension("a", default=1)
@ -50,7 +48,6 @@ def test_issue_1971_3(en_vocab):
assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
-# @pytest.mark.xfail
def test_issue_1971_4(en_vocab):
"""Test that pattern matches correctly with multiple extension attribute
values on a single token.


@ -7,10 +7,22 @@ from spacy.displacy import render
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy
+from spacy.lang.en import English
from ..util import add_vecs_to_vocab, get_doc
+@pytest.mark.xfail(
+reason="The dot is now properly split off, but the prefix/suffix rules are not applied again afterwards."
+"This means that the quote will still be attached to the remaining token."
+)
+def test_issue2070():
+"""Test that checks that a dot followed by a quote is handled appropriately."""
+nlp = English()
+doc = nlp('First sentence."A quoted sentence" he said ...')
+assert len(doc) == 11
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()


@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
def test_issue2656():
""" Test that tokenizer correctly splits of punctuation after numbers with decimal points """
text = "I went for 40.3, and got home by 10.0."
nlp = English()
doc = nlp(text)
assert len(doc) == 11
assert doc[0].text == "I"
assert doc[1].text == "went"
assert doc[2].text == "for"
assert doc[3].text == "40.3"
assert doc[4].text == ","
assert doc[5].text == "and"
assert doc[6].text == "got"
assert doc[7].text == "home"
assert doc[8].text == "by"
assert doc[9].text == "10.0"
assert doc[10].text == "."


@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian
def test_issue2822():
""" Test that the abbreviation of poco is kept as one word """
nlp = Italian()
text = "Vuoi un po' di zucchero?"
doc = nlp(text)
assert len(doc) == 6
assert doc[0].text == "Vuoi"
assert doc[1].text == "un"
assert doc[2].text == "po'"
assert doc[2].lemma_ == "poco"
assert doc[3].text == "di"
assert doc[4].text == "zucchero"
assert doc[5].text == "?"


@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French
def test_issue2926():
""" Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
nlp = French()
text = "Learn html5/css3/javascript/jquery"
doc = nlp(text)
assert len(doc) == 8
assert doc[0].text == "Learn"
assert doc[1].text == "html5"
assert doc[2].text == "/"
assert doc[3].text == "css3"
assert doc[4].text == "/"
assert doc[5].text == "javascript"
assert doc[6].text == "/"
assert doc[7].text == "jquery"


@ -0,0 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.de import German
def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl')
assert len(doc) == 5


@ -8,7 +8,6 @@ from spacy import displacy
from ..util import get_doc
-@pytest.mark.xfail
def test_issue3288(en_vocab):
"""Test that retokenization works correctly via displaCy when punctuation
is merged onto the preceeding token and tensor is resized."""


@ -5,7 +5,6 @@ import pytest
from spacy.lang.en import English
-@pytest.mark.xfail
def test_issue3289():
"""Test that Language.to_bytes handles serializing a pipeline component
with an uninitialized model."""


@ -9,104 +9,104 @@ import pytest
NAUGHTY_STRINGS = [ NAUGHTY_STRINGS = [
# ASCII punctuation # ASCII punctuation
",./;'[]\-=", r",./;'[]\-=",
'<>?:"{}|_+', r'<>?:"{}|_+',
'!@#$%^&*()`~"', r'!@#$%^&*()`~"',
# Unicode additional control characters, byte order marks # Unicode additional control characters, byte order marks
"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪", r"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪",
"", r"",
# Unicode Symbols # Unicode Symbols
"Ω≈ç√∫˜µ≤≥÷", r"Ω≈ç√∫˜µ≤≥÷",
"åß∂ƒ©˙∆˚¬…æ", r"åß∂ƒ©˙∆˚¬…æ",
"œ∑´®†¥¨ˆøπ“‘", "œ∑´®†¥¨ˆøπ“‘",
"¡™£¢∞§¶•ªº–≠", r"¡™£¢∞§¶•ªº–≠",
"¸˛Ç◊ı˜Â¯˘¿", r"¸˛Ç◊ı˜Â¯˘¿",
"ÅÍÎÏ˝ÓÔÒÚÆ☃", r"ÅÍÎÏ˝ÓÔÒÚÆ☃",
"Œ„´‰ˇÁ¨ˆØ∏”’", r"Œ„´‰ˇÁ¨ˆØ∏”’",
"`⁄€‹›fifl‡°·‚—±", r"`⁄€‹›fifl‡°·‚—±",
"⅛⅜⅝⅞", r"⅛⅜⅝⅞",
"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
"٠١٢٣٤٥٦٧٨٩", r"٠١٢٣٤٥٦٧٨٩",
# Unicode Subscript/Superscript/Accents # Unicode Subscript/Superscript/Accents
"⁰⁴⁵", r"⁰⁴⁵",
"₀₁₂", r"₀₁₂",
"⁰⁴⁵₀₁₂", r"⁰⁴⁵₀₁₂",
"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
# Two-Byte Characters # Two-Byte Characters
"田中さんにあげて下さい", r"田中さんにあげて下さい",
"パーティーへ行かないか", r"パーティーへ行かないか",
"和製漢語", r"和製漢語",
"部落格", r"部落格",
"사회과학원 어학연구소", r"사회과학원 어학연구소",
"찦차를 타고 온 펲시맨과 쑛다리 똠방각하", r"찦차를 타고 온 펲시맨과 쑛다리 똠방각하",
"社會科學院語學研究所", r"社會科學院語學研究所",
"울란바토르", r"울란바토르",
"𠜎𠜱𠝹𠱓𠱸𠲖𠳏", r"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
# Japanese Emoticons # Japanese Emoticons
"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ", r"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
"(。◕ ∀ ◕。)", r"(。◕ ∀ ◕。)",
"`ィ(´∀`∩", r"`ィ(´∀`∩",
"__ロ(,_,*)", r"__ロ(,_,*)",
"・( ̄∀ ̄)・:*:", r"・( ̄∀ ̄)・:*:",
"゚・✿ヾ╲(。◕‿◕。)╱✿・゚", r"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’", r",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻", r"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
"┬─┬ノ( º _ ºノ)", r"┬─┬ノ( º _ ºノ)",
"( ͡° ͜ʖ ͡°)", r"( ͡° ͜ʖ ͡°)",
# Emoji # Emoji
"😍", r"😍",
"👩🏽", r"👩🏽",
"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍",
"🐵 🙈 🙉 🙊", r"🐵 🙈 🙉 🙊",
"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙", r"❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙",
"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿", r"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧", r"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
"0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟", r"0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟",
# Regional Indicator Symbols # Regional Indicator Symbols
"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸", r"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
"🇺🇸🇷🇺🇸🇦🇫🇦🇲", r"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
"🇺🇸🇷🇺🇸🇦", r"🇺🇸🇷🇺🇸🇦",
# Unicode Numbers # Unicode Numbers
"", r"",
"١٢٣", r"١٢٣",
# Right-To-Left Strings # Right-To-Left Strings
"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.", r"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
"إيو.", r"إيو.",
"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ", r"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
"הָיְתָהtestالصفحات التّحول", r"הָיְתָהtestالصفحات التّحول",
"", r"",
"", r"",
"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،", r"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
# Trick Unicode # Trick Unicode
"test", r"test",
"test", r"test",
"test", r"test",
"testtest", r"testtest",
"test", r"test",
# Zalgo Text # Zalgo Text
"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣", r"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰", r"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕", r"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮", r"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
# Unicode Upsidedown # Unicode Upsidedown
"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥", r"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
"00˙Ɩ$-", r"00˙Ɩ$-",
# Unicode font # Unicode font
" ", r" ",
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠", r"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌", r"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌",
"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈", r"𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈",
"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰", r"𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰",
"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘", r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐", r"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢", r"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
# File paths # File paths
"../../../../../../../../../../../etc/passwd%00", r"../../../../../../../../../../../etc/passwd%00",
"../../../../../../../../../../../etc/hosts", r"../../../../../../../../../../../etc/hosts",
# iOS Vulnerabilities # iOS Vulnerabilities
"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗", r"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
"🏳0🌈", r"🏳0🌈",
] ]

View File

@ -407,7 +407,7 @@ cdef class Tokenizer:
if data.get('infix_finditer'): if data.get('infix_finditer'):
self.infix_finditer = re.compile(data['infix_finditer']).finditer self.infix_finditer = re.compile(data['infix_finditer']).finditer
if data.get('token_match'): if data.get('token_match'):
self.token_match = re.compile(data['token_match']).search self.token_match = re.compile(data['token_match']).match
for string, substrings in data.get('rules', {}).items(): for string, substrings in data.get('rules', {}).items():
self.add_special_case(string, substrings) self.add_special_case(string, substrings)
return self return self

View File

@ -222,7 +222,7 @@ def _bulk_merge(Doc doc, merges):
# whether the row is to be deleted, then use numpy.delete # whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0: if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor, doc.tensor = _resize_tensor(doc.tensor,
[(m[1][0].start, m[1][0].end) for m in merges]) [(m[0].start, m[0].end) for m in merges])
# Memorize span roots and sets dependencies of the newly merged # Memorize span roots and sets dependencies of the newly merged
# tokens to the dependencies of their roots. # tokens to the dependencies of their roots.
span_roots = [] span_roots = []

View File

@ -102,11 +102,12 @@ language and training a language model.
In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs
to know the language's character set. If the language you're adding uses to know the language's character set. If the language you're adding uses
non-latin characters, you might need to add the required character classes to non-latin characters, you might need to define the required character classes in
the global the global
[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py). [`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py).
spaCy uses the [`regex` library](https://pypi.python.org/pypi/regex/) to keep For efficiency, spaCy uses hard-coded unicode ranges to define character classes,
this simple and readable. If the language requires very specific punctuation the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block).
If the language requires very specific punctuation
rules, you should consider overwriting the default regular expressions with your rules, you should consider overwriting the default regular expressions with your
own in the language's `Defaults`. own in the language's `Defaults`.
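As a rough illustration only (the unicode block chosen and every name below are assumptions, not the real contents of `char_classes.py`), a character class built from a hard-coded range might be folded into an infix rule like this:

```python
# Minimal sketch: the Cyrillic block (U+0400–U+04FF) and the variable names
# here are illustrative assumptions, not spaCy's actual definitions.
import re

_illustrative_cyrillic = r"\u0400-\u04FF"  # hard-coded unicode block
ALPHA_EXAMPLE = "a-zA-Z" + _illustrative_cyrillic

# Character-class strings are typically interpolated into tokenizer rules:
infix_re = re.compile(r"(?<=[{a}])[-~](?=[{a}])".format(a=ALPHA_EXAMPLE))
print(infix_re.findall("привет-мир"))  # ['-'] – the hyphen is treated as an infix
```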

View File

@ -215,6 +215,22 @@ if all of your models are up to date, you can run the
means that the `Matcher` in v2.1.x may produce different results compared to means that the `Matcher` in v2.1.x may produce different results compared to
the `Matcher` in v2.0.x. the `Matcher` in v2.0.x.
- The deprecated [`Doc.merge`](/api/doc#merge) and
[`Span.merge`](/api/span#merge) methods still work, but you may notice that
they now run slower when merging many objects in a row. That's because the
merging engine was rewritten to be more reliable and to support more efficient
merging **in bulk**. To take advantage of this, you should rewrite your logic
to use the [`Doc.retokenize`](/api/doc#retokenize) context manager and perform
as many merges as possible together in the `with` block.
```diff
- doc[1:5].merge()
- doc[6:8].merge()
+ with doc.retokenize() as retokenizer:
+ retokenizer.merge(doc[1:5])
+ retokenizer.merge(doc[6:8])
```
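  The same change, written out as a self-contained sketch (the text and span indices are arbitrary; a blank English pipeline is used so no model download is needed):

```python
# Runnable sketch of the bulk-merge pattern shown in the diff above.
import spacy

nlp = spacy.blank("en")  # any pipeline works; blank keeps the example dependency-free
doc = nlp("New York City is in the United States of America")

with doc.retokenize() as retokenizer:
    # Perform both merges inside a single `with` block so they happen in bulk
    retokenizer.merge(doc[0:3])   # "New York City"
    retokenizer.merge(doc[6:10])  # "United States of America"

print([t.text for t in doc])
# ['New York City', 'is', 'in', 'the', 'United States of America']
```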
- For better compatibility with the Universal Dependencies data, the lemmatizer - For better compatibility with the Universal Dependencies data, the lemmatizer
now preserves capitalization, e.g. for proper nouns. See now preserves capitalization, e.g. for proper nouns. See
[this issue](https://github.com/explosion/spaCy/issues/3256) for details. [this issue](https://github.com/explosion/spaCy/issues/3256) for details.
@ -227,6 +243,11 @@ if all of your models are up to date, you can run the
+ sentence_splitter = nlp.create_pipe("sentencizer") + sentence_splitter = nlp.create_pipe("sentencizer")
``` ```
- The keyword argument `n_threads` on the `.pipe` methods is now deprecated, as
the v2.x models cannot release the global interpreter lock. (Future versions
  may introduce an `n_process` argument for parallel inference via
multiprocessing.)
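  In practice, migrating just means dropping the argument; a minimal sketch, assuming a blank pipeline (texts and batch size are arbitrary):

```python
# Sketch: `n_threads` can simply be removed – it is a no-op in v2.1.
import spacy

nlp = spacy.blank("en")
texts = ["First document.", "Second document.", "Third document."]

# Before: docs = list(nlp.pipe(texts, n_threads=4))
# After: rely on batching instead
docs = list(nlp.pipe(texts, batch_size=2))
print(len(docs))  # 3
```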
- The `Doc.print_tree` method is now deprecated. If you need a custom nested - The `Doc.print_tree` method is now deprecated. If you need a custom nested
JSON representation of a `Doc` object, you might want to write your own helper JSON representation of a `Doc` object, you might want to write your own helper
function. For a simple and consistent JSON representation of the `Doc` object function. For a simple and consistent JSON representation of the `Doc` object

View File

@ -1,4 +1,7 @@
redirects = [ redirects = [
# Netlify
# TODO: uncomment once the site is switched over
# {from = "https://spacy.netlify.com/*", to="https://spacy.io/:splat" },
# Old demos # Old demos
{from = "/demos/*", to = "https://explosion.ai/demos/:splat"}, {from = "/demos/*", to = "https://explosion.ai/demos/:splat"},
# Old blog # Old blog

View File

@ -71,11 +71,11 @@ const SEO = ({ description, lang, title, section, sectionTitle, bodyClass }) =>
}, },
{ {
name: 'twitter:creator', name: 'twitter:creator',
content: `@${data.site.siteMetadata.social.twitter}`, content: `@${siteMetadata.social.twitter}`,
}, },
{ {
name: 'twitter:site', name: 'twitter:site',
content: `@${data.site.siteMetadata.social.twitter}`, content: `@${siteMetadata.social.twitter}`,
}, },
{ {
name: 'twitter:title', name: 'twitter:title',
@ -126,8 +126,6 @@ const query = graphql`
title title
description description
slogan slogan
siteUrl
email
social { social {
twitter twitter
} }