Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-27 18:32:57 -05:00
commit c1263a844b
40 changed files with 720 additions and 271 deletions

View File

@ -1,10 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from . import util
@ -12,11 +9,8 @@ from . import util
def load(name, **overrides):
name = resolve_load_name(name, **overrides)
model_path = util.resolve_model_path(name)
meta = util.parse_package_meta(model_path)
if 'lang' not in meta:
raise IOError('No language setting found in model meta.')
cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
return util.load_model(name)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)
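
For orientation, a small usage sketch of the slimmed-down entry points above; the 'en' model name is only an assumed example of an installed shortcut link or package:

    import spacy

    # load() now simply delegates to util.load_model(), which resolves
    # shortcut links, installed packages and data paths in one place
    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')

    # info() wraps the CLI helper and prints installation/model details
    spacy.info('en', markdown=True)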

View File

@ -19,6 +19,8 @@ import numpy
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@ -247,6 +249,7 @@ def doc2feats(cols=None):
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.):
return X, lambda dX, **kwargs: dX

View File

@ -24,8 +24,9 @@ CONVERTERS = {
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(_, input_file, output_dir, n_sents, morphology):
"""Convert files into JSON format for use with train command and other
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)
@ -39,4 +40,4 @@ def convert(_, input_file, output_dir, n_sents, morphology):
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
n_sents=n_sents, morphology=morphology)
n_sents=n_sents, use_morphology=morphology)
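
For reference, a hedged sketch of calling the converter programmatically, mirroring how the other commands in this changeset pass None for the new cmd argument; the file and directory names are made-up examples and spacy.cli is assumed to re-export convert:

    from spacy.cli import convert

    # cmd is unused when the command is called directly, so None is passed
    convert(None, 'train.conllu', '/tmp/output', n_sents=10, morphology=False)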

View File

@ -17,8 +17,9 @@ from .. import about
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""Download compatible model from default download path using pip. Model
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
@ -31,7 +32,7 @@ def download(model, direct=False):
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(model_name, model, force=True)
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and


View File

@ -14,14 +14,20 @@ from .. import util
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
model_path = util.resolve_model_path(model)
meta = util.parse_package_meta(model_path)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
prints(meta_path, title="Can't find model meta.json", exits=1)
meta = read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())
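
A short sketch of the reworked info command in use; 'en' is an assumed shortcut link and spacy.cli is assumed to re-export info:

    from spacy.cli import info

    info(None)                       # general spaCy installation details
    info(None, 'en', markdown=True)  # model meta.json, formatted as Markdown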

View File

@ -14,13 +14,14 @@ from .. import util
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""Create a symlink for models within the spacy/data directory. Accepts
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
if not model_path.exists():
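
To illustrate the updated signature, a hedged example of creating a link directly, matching the link(None, ...) call used by the download command above; the package name is an example:

    from spacy.cli import link

    # create the shortcut link spacy/data/en -> installed package directory,
    # so the model can then be loaded via spacy.load('en')
    link(None, 'en_core_web_sm', 'en', force=True)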

View File

@ -18,8 +18,9 @@ from .. import about
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta, force):
"""Generate Python package for model data, including meta and required
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@ -85,20 +86,32 @@ def generate_meta():
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
return meta
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
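
The pipeline prompt above accepts either a boolean-like string or a comma-separated component list; a minimal, dependency-free sketch of that parsing rule in isolation:

    def parse_pipeline(response):
        # mirrors generate_pipeline(): 'True'/'False' keep or disable the
        # default pipeline, anything else is read as a component list
        replace = {'True': True, 'False': False}
        return replace[response] if response in replace else response.split(', ')

    assert parse_pipeline('True') is True
    assert parse_pipeline('vectorizer, tagger, parser') == ['vectorizer', 'tagger', 'parser']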
def get_template(filepath):

View File

@ -32,9 +32,11 @@ from .. import displacy
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
"""Train a model. Expects data in spaCy's JSON format."""
"""
Train a model. Expects data in spaCy's JSON format.
"""
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
@ -84,11 +86,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
pbar.update(len(docs))
with nlp.use_params(optimizer.averages):
scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
with (output_path / ('model%d.pickle' % i)).open('rb') as file_:
nlp_loaded = dill.load(file_)
scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")

View File

@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
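
The new ICONS class is built on the Unicode "Symbol, other" property. A hedged illustration with the third-party regex package, which supports \p{So} (the stdlib re module does not):

    import regex

    ICONS = r'[\p{So}]'  # same pattern as _other_symbols above

    # emoji and other pictographs carry the So category, so the punctuation
    # rules can now split them off as prefixes, suffixes and infixes
    print(regex.findall(ICONS, u'can you still dunk?🍕🍔😵LOL'))
    # ['🍕', '🍔', '😵']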

View File

@ -35,4 +35,4 @@ class English(Language):
Defaults = EnglishDefaults
__all__ = ['English', 'EnglishDefaults']
__all__ = ['English']

View File

@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

spacy/lang/xx/__init__.py (new file, 26 lines)
View File

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']
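
Usage matches the docs added further down in this commit:

    from spacy.lang.xx import MultiLanguage
    nlp = MultiLanguage()

    # or lazy-loaded via the utility helper
    from spacy.util import get_lang_class
    cls = get_lang_class('xx')
    nlp = cls()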

View File

@ -337,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
cdef class NeuralEntityRecognizer(NeuralParser):
name = 'entity'
@ -344,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
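
The new __reduce__ methods make the neural components copyable and picklable by rebuilding them from (vocab, moves, model); a hedged sketch, assuming ner is an existing NeuralEntityRecognizer instance:

    import pickle

    # round-trip a component; __reduce__ supplies the constructor arguments
    ner_copy = pickle.loads(pickle.dumps(ner))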
cdef class BeamDependencyParser(BeamParser):
TransitionSystem = ArcEager

View File

@ -335,17 +335,18 @@ cdef cppclass StateC:
this._break = this._b_i
void clone(const StateC* src) nogil:
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
this.offset = src.offset
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachment policy:

View File

@ -9,6 +9,7 @@ import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
SHIFT: [''],
REDUCE: [''],
RIGHT: [],
LEFT: [],
BREAK: ['ROOT']})
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC
@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
def __reduce__(self):
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (BiluoPushDown, (self.strings, labels_by_action),
None, None)
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
MISSING: [''],
BEGIN: [],
IN: [],
LAST: [],
UNIT: [],
OUT: ['']
})
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem):
def move_name(self, int move, int label):
if move == OUT:
return 'O'
elif move == 'MISSING':
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]

View File

@ -527,6 +527,14 @@ cdef class Parser:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
@property
def move_names(self):
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
names.append(name)
return names
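
A hedged sketch of the new move_names property; the exact names depend on the transition system and the labels learned during training:

    # assuming `parser` is a trained NeuralDependencyParser
    print(parser.move_names)
    # e.g. ['S', 'D', 'L-nsubj', 'R-dobj', 'B-ROOT', ...] for the arc-eager system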
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,

View File

@ -5,7 +5,7 @@ from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict
from collections import defaultdict, OrderedDict
from ..structs cimport TokenC
from .stateclass cimport StateClass
@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action):
def __init__(self, StringStore string_table, labels_by_action):
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -34,14 +34,14 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()):
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.init_beam_state = _init_state
def __reduce__(self):
labels_by_action = {}
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
@ -77,6 +77,11 @@ cdef class TransitionSystem:
history.append(i)
action.do(state.c, action.label)
break
else:
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
return history
cdef int initialize_state(self, StateC* state) nogil:

View File

@ -1,7 +1,4 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
exceptions = ["hu"]
tokens = tokenizer(text)
if tokens[0].lang_ not in exceptions:
assert len(tokens) == length

View File

@ -78,27 +78,86 @@ def ensure_path(path):
return path
def resolve_model_path(name):
"""Resolve a model name or string to a model path.
def load_model(name):
"""Load a model from a shortcut link, package or data path.
name (unicode): Package name, shortcut link or model path.
RETURNS (Path): Path to model data directory.
RETURNS (Language): `Language` class with the loaded model.
"""
data_path = get_data_path()
if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
if isinstance(name, basestring_):
if (data_path / name).exists(): # in data dir or shortcut link
return (data_path / name)
if is_package(name): # installed as a package
return get_model_package_path(name)
if Path(name).exists(): # path to model
return Path(name)
elif hasattr(name, 'exists'): # Path or Path-like object
return name
if (data_path / name).exists(): # in data dir or shortcut
return load_model_from_path(data_path / name)
if is_package(name): # installed as package
return load_model_from_pkg(name)
if Path(name).exists(): # path to model data directory
return load_data_from_path(Path(name))
elif hasattr(name, 'exists'): # Path or Path-like to model data
return load_data_from_path(name)
raise IOError("Can't find model '%s'" % name)
def load_model_from_init_py(init_file):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = Path(init_file).parent
return load_data_from_path(model_path, package=True)
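
load_model_from_init_py() is meant to be called from a model package's own __init__.py; the docs added below show the same pattern, repeated here as a minimal sketch:

    # __init__.py of a model package, e.g. en_core_web_sm
    from spacy.util import load_model_from_init_py

    def load():
        return load_model_from_init_py(__file__)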
def load_model_from_path(model_path):
"""Import and load a model package from its file path.
path (unicode or Path): Path to package directory.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
spec = importlib.util.spec_from_file_location('model', model_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module.load()
def load_model_from_pkg(name):
"""Import and load a model package.
name (unicode): Name of model package installed via pip.
RETURNS (Language): `Language` class with loaded model.
"""
module = importlib.import_module(name)
return module.load()
def load_data_from_path(model_path, package=False):
"""Initialie a `Language` class with a loaded model from a model data path.
model_path (unicode or Path): Path to model data directory.
package (bool): Does the path point to the parent package directory?
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % location)
meta = read_json(location)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise IOError('No %s setting found in model meta.json' % setting)
if package:
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
model_path = model_path / model_data_path
if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(model_path))
cls = get_lang_class(meta['lang'])
nlp = cls(pipeline=meta.get('pipeline', True))
return nlp.from_disk(model_path)
def is_package(name):
"""Check if string maps to a package installed via pip.
@ -112,36 +171,16 @@ def is_package(name):
return False
def get_model_package_path(package_name):
"""Get path to a model package installed via pip.
def get_package_path(name):
"""Get the path to an installed package.
package_name (unicode): Name of installed package.
RETURNS (Path): Path to model data directory.
name (unicode): Package name.
RETURNS (Path): Path to installed package.
"""
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
# Python's installation and import rules are very complicated.
pkg = importlib.import_module(package_name)
package_path = Path(pkg.__file__).parent.parent
meta = parse_package_meta(package_path / package_name)
model_name = '%s-%s' % (package_name, meta['version'])
return package_path / package_name / model_name
def parse_package_meta(package_path, require=True):
"""Check if a meta.json exists in a package and return its contents.
package_path (Path): Path to model package directory.
require (bool): If True, raise error if no meta.json is found.
RETURNS (dict or None): Model meta.json data or None.
"""
location = package_path / 'meta.json'
if location.is_file():
return read_json(location)
elif require:
raise IOError("Could not read meta.json from %s" % location)
else:
return None
return Path(pkg.__file__).parent
def is_in_jupyter():
@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased --
and yielding them sometime later. Obviously, this is not unbiased
but should be good enough for batching. Larger bufsize means less bias.
From https://gist.github.com/andres-erbsen/1307752
iterable (iterable): Iterator to shuffle.
bufsize (int): Items to hold back.
YIELDS (iterable): The shuffled iterator.
"""
iterable = iter(iterable)
buf = []
@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):
def compounding(start, stop, compound):
'''Yield an infinite series of compounding values. Each time the
"""Yield an infinite series of compounding values. Each time the
generator is called, a value is produced by multiplying the previous
value by the compound rate.
EXAMPLE
EXAMPLE:
>>> sizes = compounding(1., 10., 1.5)
>>> assert next(sizes) == 1.
>>> assert next(sizes) == 1 * 1.5
>>> assert next(sizes) == 1.5 * 1.5
'''
"""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
curr = float(start)
@ -335,7 +376,7 @@ def compounding(start, stop, compound):
def decaying(start, stop, decay):
'''Yield an infinite series of linearly decaying values.'''
"""Yield an infinite series of linearly decaying values."""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
nr_upd = 1.
@ -344,12 +385,6 @@ def decaying(start, stop, decay):
nr_upd += 1
def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items():
if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def read_json(location):
"""Open and load JSON from file.

View File

@ -53,8 +53,6 @@ cdef class Vocab:
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):

View File

@ -1,9 +1,9 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
<style>
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" }
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>

View File

@ -1,8 +1,8 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style>
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
</style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>


View File

@ -1,8 +1,8 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style>
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro" }
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>


View File

@ -0,0 +1,123 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
<style>
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19"></text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">s</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
</svg>


View File

@ -1,9 +1,9 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
<style>
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>


View File

@ -158,7 +158,8 @@
"binder": {
"title": "Binder",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/binder.pyx"
},
"annotation": {

View File

@ -2,7 +2,10 @@
include ../../_includes/_mixins
p spaCy currently supports the following languages and capabilities:
p
| spaCy currently provides models for the following languages and
| capabilities:
+aside-code("Download language models", "bash").
python -m spacy download en
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
+row
+cell French #[code fr]
each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
+h(2, "available") Available models
+row
+cell Spanish #[code es]
each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
include ../usage/_models-list
p
+button("/docs/usage/models", true, "primary") See available models
+h(2, "alpha-support") Alpha tokenization support
@ -52,9 +59,35 @@ p
| #[+a("https://github.com/mocobeta/janome") Janome].
+table([ "Language", "Code", "Source" ])
each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+row
+cell #{language}
+cell #[code=code]
+cell
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+h(2, "multi-language") Multi-language support
+tag-new(2)
p
| As of v2.0, spaCy supports models trained on more than one language. This
| is especially useful for named entity recognition. The language ID used
| for multi-language or language-neutral models is #[code xx]. The
| language class, a generic subclass containing only the base language data,
| can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
p
| To load your model with the neutral, multi-language class, simply set
| #[code "language": "xx"] in your
| #[+a("/docs/usage/saving-loading#models-generating") model package]'s
| meta.json. You can also import the class directly, or call
| #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
| lazy-loading.
+code("Standard import").
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage()
+code("With lazy-loading").
from spacy.util import get_lang_class
nlp = get_lang_class('xx')

View File

@ -11,8 +11,13 @@ p
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
| argument in this order. If a model is loaded from a shortcut link or
| package name, spaCy will assume it's a Python package and import it and
| call the model's own #[code load()] method. If a model is loaded from a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings off the meta.json and initialise the #[code Language]
| class. The data will be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
@ -20,7 +25,7 @@ p
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp = spacy.load('en', disable['parser', 'tagger'])
nlp = spacy.load('en', disable=['parser', 'tagger'])
+table(["Name", "Type", "Description"])
+row

View File

@ -1,12 +1,10 @@
//- 💫 DOCS > API > ANNOTATION SPECS
//- 💫 DOCS > API > UTIL
include ../../_includes/_mixins
p
| spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
+infobox("Important note")
| Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe
@ -74,15 +72,23 @@ p
+cell #[code Language]
+cell Language class.
+h(2, "resolve_model_path") util.resolve_model_path
+h(2, "load_model") util.load_model
+tag function
+tag-new(2)
p Resolve a model name or string to a model path.
p
| Load a model from a shortcut link, package or data path. If called with a
| shortcut link or package name, spaCy will assume the model is a Python
| package and import and call its #[code load()] method. If called with a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings from the meta.json and initialise a #[code Language]
| class. The model data will then be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
model_path = util.resolve_model_path('en')
model_path = util.resolve_model_path('/path/to/en')
nlp = util.load_model('en')
nlp = util.load_model('en_core_web_sm')
nlp = util.load_model('/path/to/data')
+table(["Name", "Type", "Description"])
+row
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+aside-code("Example").
from spacy.util import load_model_from_init_py
def load():
return load_model_from_init_py(__file__)
+table(["Name", "Type", "Description"])
+row
+cell #[code init_file]
+cell unicode
+cell Path to model's __init__.py, i.e. #[code __file__].
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "is_package") util.is_package
+tag function
@ -117,16 +148,18 @@ p
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") util.get_model_package_path
+h(2, "get_package_path") util.get_package_path
+tag function
+tag-new(2)
p
| Get path to a #[+a("/docs/usage/models") model package] installed via pip.
| Currently imports the package to find it and parse its meta data.
| Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example").
util.get_model_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
util.get_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm
+table(["Name", "Type", "Description"])
+row
@ -137,37 +170,8 @@ p
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+h(2, "parse_package_meta") util.parse_package_meta
+tag function
p
| Check if a #[code meta.json] exists in a model package and return its
| contents.
+aside-code("Example").
if util.is_package('en_core_web_sm'):
path = util.get_model_package_path('en_core_web_sm')
meta = util.parse_package_meta(path, require=True)
# {'name': 'core_web_sm', 'lang': 'en', ...}
+table(["Name", "Type", "Description"])
+row
+cell #[code package_path]
+cell #[code Path]
+cell Path to model package directory.
+row
+cell #[code require]
+cell #[code bool]
+cell If #[code True], raise error if no #[code meta.json] is found.
+footrow
+cell returns
+cell dict / #[code None]
+cell Model meta data or #[code None].
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)

View File

@ -5,7 +5,7 @@ p
| #[strong how similar they are]. Predicting similarity is useful for
| building recommendation systems or flagging duplicates. For example, you
| can suggest a user content that's similar to what they're currently
| looking at, or label a support ticket as a duplicate, if it's very
| looking at, or label a support ticket as a duplicate if it's very
| similar to an already existing one.
p

View File

@ -16,3 +16,47 @@ p
+row
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
+cell=cell
p
| First, the raw text is split on whitespace characters, similar to
| #[code text.split(' ')]. Then, the tokenizer processes the text from
| left to right. On each substring, it performs two checks:
+list("numbers")
+item
| #[strong Does the substring match a tokenizer exception rule?] For
| example, "don't" does not contain whitespace, but should be split
| into two tokens, "do" and "n't", while "U.K." should always
| remain one token.
+item
| #[strong Can a prefix, suffix or infix be split off?] For example,
| punctuation like commas, periods, hyphens or quotes.
p
| If there's a match, the rule is applied and the tokenizer continues its
| loop, starting with the newly split substrings. This way, spaCy can split
| #[strong complex, nested tokens] like combinations of abbreviations and
| multiple punctuation marks.
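p
| The loop described above can be sketched in plain Python. This is a
| simplified illustration of the algorithm, not spaCy's actual
| implementation, and the helper names are made up:
+code("Pseudo-code of the tokenization loop").
    def tokenize(text, special_cases, prefix_len, suffix_len):
        tokens = []
        for substring in text.split(' '):
            suffixes = []
            while substring:
                if substring in special_cases:       # exception rule wins
                    tokens.extend(special_cases[substring])
                    substring = ''
                    continue
                pre = prefix_len(substring)          # try to split off a prefix
                if pre:
                    tokens.append(substring[:pre])
                    substring = substring[pre:]
                    continue
                suf = suffix_len(substring)          # try to split off a suffix
                if suf:
                    suffixes.append(substring[-suf:])
                    substring = substring[:-suf]
                    continue
                tokens.append(substring)             # nothing else matched
                substring = ''
            tokens.extend(reversed(suffixes))
        # (infix splitting omitted for brevity)
        return tokens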
+aside
| #[strong Tokenizer exception:] Special-case rule to split a string into
| several tokens or prevent a token from being split when punctuation rules
| are applied.#[br]
| #[strong Prefix:] Character(s) at the beginning, e.g.
| #[code $], #[code (], #[code “], #[code ¿].#[br]
| #[strong Suffix:] Character(s) at the end, e.g.
| #[code km], #[code &#41;], #[code ”], #[code !].#[br]
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code …].#[br]
+image
include ../../../assets/img/docs/tokenization.svg
.u-text-right
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
| why each #[+a("/docs/api/language-models") available language] has its
| own subclass, like #[code English] or #[code German], which loads in lists
| of hard-coded data and exception rules.

View File

@ -89,4 +89,6 @@ p
p
| Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different.
| integer IDs are very different. The same applies for all other strings,
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
| export the vocab if you save a #[code Doc] or #[code nlp] object.
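p
| A short sketch of what this means in practice, reusing the
| #[code to_disk] / #[code from_disk] pattern shown elsewhere in these
| docs; the paths and the loaded #[code en] pipeline are assumed examples:
+code.
    from spacy.tokens.doc import Doc
    from spacy.vocab import Vocab

    doc = nlp(u'I like coffee')          # nlp is a loaded 'en' pipeline
    doc.to_disk('/tmp/coffee.bin')       # the vocab is exported alongside
    new_doc = Doc(Vocab()).from_disk('/tmp/coffee.bin')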

View File

@ -19,19 +19,17 @@ p
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its
| #[code setup] details. This typically includes the ID of a language class,
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
+aside-code("meta.json (excerpt)", "json").
{
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+list("numbers")
@ -146,7 +144,7 @@ p
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[coce Vocab]
+cell #[code Vocab]
+cell
| Shared data between components, including strings, morphology,
| vectors etc.
@ -287,17 +285,15 @@ p
p
| In the model package's meta.json, specify the language class and pipeline
| IDs in #[code setup]:
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "my_sentiment_model",
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"setup": {
"lang": "en",
"pipeline": ["vectorizer", "sentiment"]
}
"pipeline": ["vectorizer", "sentiment"]
}
p
@ -307,7 +303,7 @@ p
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('my_sentiment_model')
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment

View File

@ -129,15 +129,18 @@ p
+code.
import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
new_doc = Doc().from_disk('/moby_dick.bin')
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(2, "rule-matcher") Match text with token rules
@ -148,9 +151,14 @@ p
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# match "Google I/O" or "Google i/o"
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
matcher.add('GoogleIO', None, pattern)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox

View File

@ -11,7 +11,7 @@ p
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
+aside("What about \"real\" regular expressions?")
//-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns
@ -119,7 +119,7 @@ p
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
match_id, start, end = matches[i]
@ -221,7 +221,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
+h(2, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@ -283,7 +283,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
+h(2, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@ -320,3 +320,114 @@ p
| It'll produce more predictable results, is much easier to modify and
| extend, and doesn't require any training data, only a set of
| test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
| They're very short and often contain various emoji and hashtags. By only
| looking at the plain text, you'll lose a lot of valuable semantic
| information.
p
| Let's say you've extracted a large sample of social media posts on a
| specific topic, for example posts mentioning a brand name or product.
| As the first step of your data exploration, you want to filter out posts
| containing certain emoji and use them to assign a general sentiment
| score, based on whether the expressed emotion is positive or negative,
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
| You also want to find, merge and label hashtags like
| #[code #MondayMotivation], to be able to ignore or analyse them later.
+aside("Note on sentiment analysis")
| Ultimately, sentiment analysis is not always #[em that] easy. In
| addition to the emoji, you'll also want to take specific words into
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to collect training data].
| It's also an incredibly fast way to gather first insights into your data:
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].
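p
| As a rough sketch of the intensifier idea from the note above (the word
| list and boost value are made up, and this assumes the #[code Doc] has
| been parsed so that #[code subtree] is populated):
+code.
INTENSIFIERS = (u'very', u'really', u'incredibly') # illustrative, not exhaustive

def boost_sentiment(doc, start, end):
    for token in doc[start:end]:
        # walk the syntactic subtree of each matched token
        for descendant in token.subtree:
            if descendant.lower_ in INTENSIFIERS:
                doc.sentiment += 0.05 # hypothetical extra boost per intensifier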
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
+code.
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
p
| Because the #[code on_match] callback receives the ID of each match, you
| can use the same function to handle the sentiment assignment for both
| the positive and negative pattern. To keep it simple, we'll either add
| or subtract #[code 0.1] points. This way, the score will also reflect
| combinations of emoji, even positive #[em and] negative ones.
p
| With a library like
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji. For example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it
| available as #[code token.norm_].
+code.
from emojipedia import Emojipedia # installation: pip install emojipedia
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == 'HAPPY': # look up the matched string ID
        doc.sentiment += 0.1 # add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == 'SAD':
        doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
    span = doc[start:end]
    emoji = Emojipedia.search(span[0].text) # get data for emoji
    span.merge(norm=emoji.title) # merge span and set NORM to emoji title
p
| To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag.
+code.
# Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]
    span.merge() # merge hashtag
    span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
p
| To process a stream of social media posts, we can use
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
| stream of #[code Doc] objects that we can pass to
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
+code.
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)

View File

@ -74,16 +74,14 @@ p
+aside-code("meta.json", "json").
{
"name": "example_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"description": "Example model for spaCy",
"author": "You",
"email": "you@example.com",
"license": "CC BY-SA 3.0",
"setup": {
"lang": "en",
"pipeline": ["token_vectors", "tagger"]
}
"pipeline": ["token_vectors", "tagger"]
}
+code(false, "bash").
@ -110,9 +108,9 @@ p
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes a #[code setup] key that lets you customise how
| the model should be initialised and loaded. You can define the language
| data to be loaded and the
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute.
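p
| Roughly sketched (the real loading logic may differ in detail), the
| #[code lang] and #[code pipeline] values could be applied like this:
+code.
import spacy

meta = {'lang': 'en', 'pipeline': ['token_vectors', 'tagger']} # from meta.json
cls = spacy.util.get_lang_class(meta['lang']) # e.g. the English subclass
nlp = cls(pipeline=meta['pipeline'])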
@ -183,9 +181,9 @@ p
p
| To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code setup] details
| to initialise a #[code Language] class with a processing pipeline and
| load in the model data.
| look for a meta.json in the directory and use the #[code lang] and
| #[code pipeline] settings to initialise a #[code Language] class with a
| processing pipeline and load in the model data.
+code.
nlp = spacy.load('/path/to/model')

View File

@ -65,7 +65,7 @@ p
| spaCy provides a variety of linguistic annotations to give you insights
| into a text's grammatical structure. This includes the word types,
| i.e. the parts of speech, and how the words are related to each other.
| For example, if you're analysing text, it makes a #[em huge] difference
| For example, if you're analysing text, it makes a huge difference
| whether a noun is the subject of a sentence, or the object, or whether
| "google" is used as a verb, or refers to the website or company in a
| specific context.
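p
| For instance, a quick sketch (the exact tags depend on the model you've
| loaded):
+code.
import spacy

nlp = spacy.load('en')
doc = nlp(u'I googled the new Google campus')
for token in doc:
    print(token.text, token.pos_, token.dep_)
# "googled" should come out as a verb, "Google" as a proper noun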
@ -94,9 +94,10 @@ p
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenizer and its rules work in detail,
| how to #[strong customise] it and how to #[strong add your own tokenizer]
| to a processing pipeline, see the usage guide on
| To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise and replace] the default tokenizer and how to
| #[strong add language-specific data], see the usage guides on
| #[+a("/docs/usage/adding-languages") adding languages] and
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to train and update
| the entity predictions of a model, see the usage guide on
| #[+a("/docs/usage/entity-recognition") named entity recognition].
| #[strong add your own entities] to a document and how to
| #[strong train and update] the entity predictions of a model, see the
| usage guides on
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")

View File

@ -20,19 +20,18 @@ p
nlp = Language(pipeline=['my_factory', mycomponent])
p
| It's now much easier to customise the pipeline with your own components.
| Components are functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you'll want to create a new one
| for each pipeline. You can do that by defining and registering a factory
| which receives the shared #[code Vocab] object and returns a component.
p
| spaCy's default components, the vectorizer, tagger, parser and entity
| recognizer, can be added to your pipeline by using their string IDs.
| This way, you won't have to worry about finding and implementing them:
| to use the default tagger, simply add #[code "tagger"] to the pipeline,
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
| component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them: simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
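p
| As a minimal sketch (the component below is made up for illustration),
| a custom component is just a callable that takes and returns a
| #[code Doc]:
+code.
from spacy.language import Language

def print_length(doc):
    # receive the Doc, inspect or modify it, then return it
    print('Doc has %d tokens' % len(doc))
    return doc

nlp = Language(pipeline=['tagger', print_length])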
+image
include ../../assets/img/docs/pipeline.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@ -96,11 +95,10 @@ p
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. It's now also possible to overwrite the functions that
| compute lexical attributes like #[code like_num], and supply
| language-specific syntax iterators, e.g. to determine noun chunks. spaCy
| now also supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma.
| and simplified. spaCy now also supports simple lookup-based lemmatization.
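p
| For example, the data behind a lookup lemmatizer is simply a dictionary
| mapping a string to its lemma (the entries below are made up for
| illustration):
+code.
LOOKUP = {
    u'feet': u'foot',
    u'geese': u'goose',
    u'mice': u'mouse'
}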
+image
include ../../assets/img/docs/language_data.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
@ -111,13 +109,10 @@ p
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
assert len(matcher) == 1
assert 'HelloWorld' in matcher
assert 'HEARTS' in matcher
p
| Patterns can now be added to the matcher by calling
@ -157,28 +152,8 @@ p
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+cell #[code Language.create_make_doc]
+cell #[+api("language#attributes") #[code Language.tokenizer]]
+row
+cell
@ -212,6 +187,28 @@ p
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell -
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell #[code Matcher.load]
+cell -
@ -232,7 +229,7 @@ p
+row
+cell #[code Doc.read_bytes]
+cell
+cell #[+api("binder") #[code Binder]]
+row
+cell #[code Token.is_ancestor_of]