Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-28 11:46:57 +02:00
commit 39293ab2ee
91 changed files with 4121 additions and 2416 deletions

View File

@ -1,10 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from . import util
@ -12,11 +9,8 @@ from . import util
def load(name, **overrides):
name = resolve_load_name(name, **overrides)
model_path = util.resolve_model_path(name)
meta = util.parse_package_meta(model_path)
if 'lang' not in meta:
raise IOError('No language setting found in model meta.')
cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
return util.load_model(name)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)
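
With this change, spacy.load() becomes a thin wrapper around util.load_model(), and spacy.info() forwards to the CLI implementation. A minimal usage sketch, assuming the 'en' shortcut link or package is installed:

import spacy

nlp = spacy.load('en')              # resolved via util.load_model()
doc = nlp(u'This is a sentence.')
spacy.info('en', markdown=False)    # prints the model meta via cli_info()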

View File

@ -19,6 +19,8 @@ import numpy
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@ -86,10 +88,10 @@ class PrecomputableAffine(Model):
d_b=Gradient("b")
)
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = pieces
self.nP = nP
self.nI = nI
self.nF = nF
@ -247,6 +249,7 @@ def doc2feats(cols=None):
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.):
return X, lambda dX, **kwargs: dX
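
_init_for_precomputed only touches weights that are still all zeros, flattening the 3d tensor to 2d so the Xavier initializer sees a plausible fan-in/fan-out, then restoring the original shape. A rough numpy-only sketch of the same idea; the shapes and the stand-in xavier_uniform_init below are illustrative, not thinc's implementation:

import numpy

def xavier_uniform_init(W):
    # stand-in for ops.xavier_uniform_init (assumption, not thinc's code)
    scale = numpy.sqrt(6.0 / (W.shape[0] + W.shape[1]))
    W[:] = numpy.random.uniform(-scale, scale, W.shape)

W = numpy.zeros((10, 8, 128))       # illustrative (nF, nO, nI)-style weights
if (W ** 2).sum() == 0.:            # leave already-trained weights alone
    reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
    xavier_uniform_init(reshaped)
    W[:] = reshaped.reshape(W.shape)
print(W.std())                      # non-zero after initialization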

View File

@ -7,7 +7,6 @@ from pathlib import Path
from .converters import conllu2json, iob2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
@ -25,8 +24,9 @@ CONVERTERS = {
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(input_file, output_dir, n_sents, morphology):
"""Convert files into JSON format for use with train command and other
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)
@ -39,4 +39,5 @@ def convert(input_file, output_dir, n_sents, morphology):
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path, *args)
CONVERTERS[file_ext](input_path, output_path,
n_sents=n_sents, use_morphology=morphology)
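
The converters are now called with explicit keyword arguments instead of an unpacked *args tuple, so every converter has to accept n_sents and use_morphology (or swallow them via **kwargs). A stand-alone sketch of the dispatch pattern; the file name and the dummy converter are hypothetical:

from pathlib import Path

def iob2json(input_path, output_path, n_sents=10, use_morphology=False, **_):
    print('would convert', input_path, '->', output_path)

CONVERTERS = {'.iob': iob2json}     # extension -> converter function

input_path = Path('train.iob')      # hypothetical input file
file_ext = input_path.suffix
if file_ext in CONVERTERS:
    CONVERTERS[file_ext](input_path, Path('out'),
                         n_sents=10, use_morphology=False)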

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k):
@ -29,9 +30,10 @@ def read_iob(file_):
continue
tokens = [t.rsplit('|', 2) for t in line.split()]
words, pos, iob = zip(*tokens)
biluo = iob_to_biluo(iob)
sentences.append([
{'orth': w, 'tag': p, 'ner': ent}
for (w, p, ent) in zip(words, pos, iob)
for (w, p, ent) in zip(words, pos, biluo)
])
sentences = [{'tokens': sent} for sent in sentences]
paragraphs = [{'sentences': [sent]} for sent in sentences]
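
The IOB reader now converts tags to BILUO before writing the training JSON, so entity-final tokens become L- and single-token entities become U-. For illustration, assuming spaCy at this commit where gold.iob_to_biluo is public:

from spacy.gold import iob_to_biluo

print(iob_to_biluo(['B-PER', 'I-PER', 'O', 'B-LOC']))
# expected: ['B-PER', 'L-PER', 'O', 'U-LOC']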

View File

@ -17,8 +17,9 @@ from .. import about
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""Download compatible model from default download path using pip. Model
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
@ -31,7 +32,7 @@ def download(model, direct=False):
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(model_name, model, force=True)
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and

View File

@ -14,14 +14,20 @@ from .. import util
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
model_path = util.resolve_model_path(model)
meta = util.parse_package_meta(model_path)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
prints(meta_path, title="Can't find model meta.json", exits=1)
meta = read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())

View File

@ -14,13 +14,14 @@ from .. import util
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""Create a symlink for models within the spacy/data directory. Accepts
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
if not model_path.exists():

View File

@ -18,8 +18,9 @@ from .. import about
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta, force):
"""Generate Python package for model data, including meta and required
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
@ -42,7 +43,7 @@ def package(input_dir, output_dir, meta, force):
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@ -85,20 +86,32 @@ def generate_meta():
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
return meta
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
def get_template(filepath):
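
generate_pipeline() accepts the literal strings 'True' or 'False', or a comma-separated list of component names. The parsing rule reduces to the small helper sketched below (stand-alone, without the interactive prompt):

def parse_pipeline(user_input):
    # mirrors the replace-dict logic in generate_pipeline()
    replace = {'True': True, 'False': False}
    return replace[user_input] if user_input in replace else user_input.split(', ')

print(parse_pipeline('True'))                     # True -> use the default pipeline
print(parse_pipeline('vectorizer, tagger, ner'))  # ['vectorizer', 'tagger', 'ner']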

View File

@ -14,7 +14,7 @@ from timeit import default_timer as timer
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import displacy
@ -32,9 +32,11 @@ from .. import displacy
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=False, no_tagger=False, no_parser=False, no_entities=False):
"""Train a model. Expects data in spaCy's JSON format."""
"""
Train a model. Expects data in spaCy's JSON format.
"""
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
@ -53,45 +55,48 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
dropout = util.env_opt('dropout', 0.0)
dropout_decay = util.env_opt('dropout_decay', 0.0)
orig_dropout = dropout
n_train_docs = corpus.count_train()
optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu)
n_train_docs = corpus.count_train()
batch_size = float(util.env_opt('min_batch_size', 4))
max_batch_size = util.env_opt('max_batch_size', 64)
batch_accel = util.env_opt('batch_accel', 1.001)
print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
for i in range(n_iter):
with tqdm.tqdm(total=n_train_docs) as pbar:
train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True)
idx = 0
while idx < n_train_docs:
batch = list(cytoolz.take(int(batch_size), train_docs))
if not batch:
break
docs, golds = zip(*batch)
nlp.update(docs, golds, drop=dropout, sgd=optimizer)
pbar.update(len(docs))
idx += len(docs)
batch_size *= batch_accel
batch_size = min(batch_size, max_batch_size)
dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx)
with nlp.use_params(optimizer.averages):
start = timer()
scorer = nlp.evaluate(corpus.dev_docs(nlp))
end = timer()
n_words = scorer.tokens.tp + scorer.tokens.fn
assert n_words != 0
wps = n_words / (end-start)
print_progress(i, {}, scorer.scores, wps=wps)
with (output_path / 'model.bin').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
pbar.update(len(docs))
with nlp.use_params(optimizer.averages):
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
with (output_path / ('model%d.pickle' % i)).open('rb') as file_:
nlp_loaded = dill.load(file_)
scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False))
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
def _render_parses(i, to_render):
@ -109,13 +114,13 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0
scores.update(losses)
scores['dep_loss'] = losses.get('parser', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores[wps] = wps
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{tag_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
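
The training command now draws dropout and batch size from generator schedules: dropout can decay from dropout_from to dropout_to, while the batch size compounds from 1 towards 64 and is advanced once per minibatch. A quick, runnable look at the values these schedules produce with the defaults shown above (assuming spaCy at this commit):

from spacy import util

dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                              util.env_opt('dropout_to', 0.2),
                              util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                               util.env_opt('batch_to', 64),
                               util.env_opt('batch_compound', 1.001))
print([next(dropout_rates) for _ in range(3)])          # [0.2, 0.2, 0.2]
print([round(next(batch_sizes), 3) for _ in range(3)])  # [1.0, 1.001, 1.002]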

View File

@ -6,6 +6,7 @@ import io
import re
import ujson
import random
import cytoolz
from .syntax import nonproj
from .util import ensure_path
@ -141,6 +142,19 @@ def _min_edit_path(cand_words, gold_words):
return prev_costs[n_gold], previous_row[-1]
def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
'''
items = iter(items)
while True:
batch_size = next(size) if hasattr(size, '__next__') else size
batch = list(cytoolz.take(int(batch_size), items))
if len(batch) == 0:
break
yield list(batch)
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
@ -184,15 +198,15 @@ class GoldCorpus(object):
n += 1
return n
def train_docs(self, nlp, shuffle=0, gold_preproc=False,
projectivize=False):
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples)
if shuffle:
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
@ -201,7 +215,7 @@ class GoldCorpus(object):
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc):
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
@ -212,7 +226,8 @@ class GoldCorpus(object):
gold_preproc)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
yield doc, gold
if not max_length or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
@ -291,7 +306,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
yield [paragraph.get('raw', None), sents]
def _iob_to_biluo(tags):
def iob_to_biluo(tags):
out = []
curr_label = None
tags = list(tags)
@ -396,7 +411,10 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.heads[i] = self.gold_to_cand[heads[gold_i]]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
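
minibatch() can take a generator as its size argument, which is how the training loop grows its batches with compounding(). A self-contained example of the variable-size behaviour:

from spacy.gold import minibatch
from spacy.util import compounding

items = list(range(10))
sizes = compounding(1., 4., 2.)     # yields 1, 2, 4, 4, ...
print(list(minibatch(items, size=sizes)))
# expected: [[0], [1, 2], [3, 4, 5, 6], [7, 8, 9]]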

View File

@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
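
The new ICONS class is built from the Unicode 'Symbol, other' category (\p{So}), which is what catches emoji and similar pictographs in the prefix, suffix and infix rules. Python's built-in re module doesn't understand \p{So}; the third-party regex package does, so a quick check looks like this:

import regex    # third-party package with Unicode property support

icons = regex.compile(r'[\p{So}]')
print(icons.findall(u'can you still dunk?🍕🍔😵LOL'))
# expected: ['🍕', '🍔', '😵']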

View File

@ -35,4 +35,4 @@ class English(Language):
Defaults = EnglishDefaults
__all__ = ['English', 'EnglishDefaults']
__all__ = ['English']

View File

@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

spacy/lang/xx/__init__.py (new file, 26 lines)
View File

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']
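
A minimal sketch of using the new multi-language class directly, assuming the Language constructor at this commit accepts pipeline=[] to skip the statistical components:

from spacy.lang.xx import MultiLanguage

nlp = MultiLanguage(pipeline=[])    # tokenizer-only pipeline for this sketch
doc = nlp(u'spaCy treats this text as language "xx".')
print(nlp.lang)                     # 'xx'
print([t.text for t in doc])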

View File

@ -6,7 +6,8 @@ import dill
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam
from thinc.neural.optimizers import Adam, SGD
import random
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -172,13 +173,13 @@ class Language(object):
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, **disabled):
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
EXAMPLE:
@ -189,12 +190,12 @@ class Language(object):
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
proc(doc)
return doc
def update(self, docs, golds, drop=0., sgd=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@ -211,12 +212,21 @@ class Language(object):
"""
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
for proc in self.pipeline[1:]:
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds, sgd=sgd, drop=drop)
bp_tokvecses(d_tokvecses, sgd=sgd)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
# If we don't do this, the memory leak gets pretty
# bad, because we may be holding part of a batch.
@ -260,13 +270,20 @@ class Language(object):
if cfg.get('use_gpu'):
Model.ops = CupyOps()
Model.Ops = CupyOps
print("Use GPU")
for proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
contexts.append(context)
optimizer = Adam(Model.ops, 0.001)
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
return optimizer
def evaluate(self, docs_golds):
@ -306,7 +323,7 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
@ -314,7 +331,7 @@ class Language(object):
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
EXAMPLE:
@ -326,7 +343,7 @@ class Language(object):
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -336,12 +353,14 @@ class Language(object):
for doc in docs:
yield doc
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
def to_disk(self, path, disable=[]):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved.
disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
@ -353,7 +372,7 @@ class Language(object):
raise IOError("Output path must be a directory")
props = {}
for name, value in self.__dict__.items():
if name in exclude:
if name in disable:
continue
if hasattr(value, 'to_disk'):
value.to_disk(path / name)
@ -362,13 +381,14 @@ class Language(object):
with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_)
def from_disk(self, path, **exclude):
def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and
returns it.
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
@ -377,35 +397,36 @@ class Language(object):
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)):
if name not in disable and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
self.from_bytes(bytes_data, disable)
return self
def to_bytes(self, **exclude):
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
disable (list): Names of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
for key in disable:
if key in props:
props.pop(key)
return dill.dumps(props, -1)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
if key not in disable:
setattr(self, key, value)
return self
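
The **disabled keyword-argument API is replaced by an explicit disable list, and update() now fills a per-component losses dict while collecting gradients so each component's weights are updated in one pass. A small sketch; the tokenizer-only English pipeline is just for illustration, and the update() call is shown in comment form because it needs trained components and data:

from spacy.lang.en import English

nlp = English(pipeline=[])          # tokenizer-only pipeline for this sketch
doc = nlp(u'Disabled components are simply skipped.', disable=['tagger', 'parser'])
print([t.text for t in doc])

# During training (with real components, docs and golds):
#     losses = {}
#     nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
#     print(losses.get('parser', 0.0), losses.get('tagger', 0.0))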

View File

@ -43,7 +43,7 @@ class TokenVectorEncoder(object):
name = 'tok2vec'
@classmethod
def Model(cls, width=128, embed_size=5000, **cfg):
def Model(cls, width=128, embed_size=7500, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
@ -119,7 +119,7 @@ class TokenVectorEncoder(object):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, docs, golds, state=None, drop=0., sgd=None):
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
@ -199,7 +199,7 @@ class NeuralTagger(object):
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
@ -228,6 +228,7 @@ class NeuralTagger(object):
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
@ -248,7 +249,8 @@ class NeuralTagger(object):
vocab.morphology.lemmatizer)
token_vector_width = pipeline[0].model.nO
self.model = with_flatten(
Softmax(self.vocab.morphology.n_tags, token_vector_width))
chain(Maxout(token_vector_width, token_vector_width),
Softmax(self.vocab.morphology.n_tags, token_vector_width)))
def use_params(self, params):
with self.model.use_params(params):
@ -274,7 +276,8 @@ class NeuralLabeller(NeuralTagger):
self.labels[dep] = len(self.labels)
token_vector_width = pipeline[0].model.nO
self.model = with_flatten(
Softmax(len(self.labels), token_vector_width))
chain(Maxout(token_vector_width, token_vector_width),
Softmax(len(self.labels), token_vector_width)))
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
@ -290,6 +293,7 @@ class NeuralLabeller(NeuralTagger):
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
@ -333,6 +337,9 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
cdef class NeuralEntityRecognizer(NeuralParser):
name = 'entity'
@ -340,6 +347,10 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
cdef class BeamDependencyParser(BeamParser):
TransitionSystem = ArcEager

View File

@ -335,16 +335,18 @@ cdef cppclass StateC:
this._break = this._b_i
void clone(const StateC* src) nogil:
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
this.offset = src.offset
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachment policy:

View File

@ -9,6 +9,7 @@ import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
@ -310,12 +311,13 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
SHIFT: [''],
REDUCE: [''],
RIGHT: [],
LEFT: [],
BREAK: ['ROOT']})
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
@ -348,8 +350,15 @@ cdef class ArcEager(TransitionSystem):
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if all([h is None for h in gold.heads]):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None: # Missing values

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC
@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
def __reduce__(self):
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (BiluoPushDown, (self.strings, labels_by_action),
None, None)
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
MISSING: [''],
BEGIN: [],
IN: [],
LAST: [],
UNIT: [],
OUT: ['']
})
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@ -90,13 +103,20 @@ cdef class BiluoPushDown(TransitionSystem):
def move_name(self, int move, int label):
if move == OUT:
return 'O'
elif move == 'MISSING':
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if all([tag == '-' for tag in gold.ner]):
if not self.has_gold(gold):
return None
for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])

View File

@ -249,11 +249,13 @@ cdef class Parser:
with Model.use_device('cpu'):
if depth == 0:
upper = chain()
upper.is_noop = True
else:
upper = chain(
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class))
zero_init(Affine(nr_class, drop_factor=0.0))
)
upper.is_noop = False
# TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@ -364,7 +366,7 @@ cdef class Parser:
cdef np.ndarray scores
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
cdef int has_hidden = hasattr(vec2scores, 'W')
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty():
if not has_hidden:
for i in cython.parallel.prange(
@ -414,7 +416,9 @@ cdef class Parser:
free(scores)
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
@ -422,27 +426,33 @@ cdef class Parser:
golds = [golds]
cuda_stream = get_cuda_stream()
golds = [self.moves.preprocess_gold(g) for g in golds]
states = self.moves.init_batch(docs)
states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
drop)
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
return None
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
cdef float loss = 0.
while len(todo) >= 3:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
vector *= mask
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores, sgd=sgd)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
if drop != 0:
d_vector *= mask
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
@ -456,15 +466,51 @@ cdef class Parser:
backprops.append((token_ids, d_vector, bp_vector))
self.transition_batch(states, scores)
todo = [st for st in todo if not st[0].is_final()]
if len(backprops) >= 50:
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
backprops = []
if backprops:
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
if losses is not None:
losses[self.name] += (d_scores**2).sum()
n_steps += 1
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
long_doc[:N], and another representing long_doc[N:]."""
cdef:
StateClass state
Transition action
whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
max_moves = 0
states = []
golds = []
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
gold = self.moves.preprocess_gold(gold)
if gold is None:
continue
oracle_actions = self.moves.get_oracle_sequence(doc, gold)
start = 0
while start < len(doc):
state = state.copy()
n_moves = 0
while state.B(0) < start and not state.is_final():
action = self.moves.c[oracle_actions.pop(0)]
action.do(state.c, action.label)
n_moves += 1
has_gold = self.moves.has_gold(gold, start=start,
end=start+max_length)
if not state.is_final() and has_gold:
states.append(state)
golds.append(gold)
max_moves = max(max_moves, n_moves)
start += min(max_length, len(doc)-start)
max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete.
if cuda_stream is not None:
@ -481,6 +527,14 @@ cdef class Parser:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
@property
def move_names(self):
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
names.append(name)
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,
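
_init_gold_batch makes the batch 'square': documents longer than max_length are cut into several states, each covering at most max_length tokens, so a single long document can't dominate an update. The start offsets follow the same arithmetic as the loop above; a tiny pure-Python illustration:

def segment_starts(doc_len, max_length):
    # mirrors the `start += min(max_length, len(doc) - start)` loop above
    starts = []
    start = 0
    while start < doc_len:
        starts.append(start)
        start += min(max_length, doc_len - start)
    return starts

print(segment_starts(120, 50))      # [0, 50, 100] -> three states for one long doc
print(segment_starts(30, 50))       # [0]          -> short documents stay whole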

View File

@ -41,6 +41,11 @@ cdef class StateClass:
def is_final(self):
return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
new_state.c.clone(self.c)
return new_state
def print_state(self, words):
words = list(words) + ['_']
top = words[self.S(0)] + '_%d' % self.S_(0).head

View File

@ -5,7 +5,7 @@ from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict
from collections import defaultdict, OrderedDict
from ..structs cimport TokenC
from .stateclass cimport StateClass
@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action):
def __init__(self, StringStore string_table, labels_by_action):
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -34,14 +34,14 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()):
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.init_beam_state = _init_state
def __reduce__(self):
labels_by_action = {}
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
@ -61,6 +61,29 @@ cdef class TransitionSystem:
offset += len(doc)
return states
def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool()
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
cdef StateClass state = StateClass(doc, offset=0)
self.initialize_state(state.c)
history = []
while not state.is_final():
self.set_costs(is_valid, costs, state, gold)
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= 0:
action = self.c[i]
history.append(i)
action.do(state.c, action.label)
break
else:
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
return history
cdef int initialize_state(self, StateC* state) nogil:
pass
@ -92,11 +115,21 @@ cdef class TransitionSystem:
StateClass stcls, GoldParse gold) except -1:
cdef int i
self.set_valid(is_valid, stcls.c)
cdef int n_gold = 0
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold <= 0:
print(gold.words)
print(gold.ner)
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions.\n"
"%s" % (self.n_moves))
def add_action(self, int action, label):
if not isinstance(label, int):

View File

@ -1,7 +1,4 @@
# coding: utf-8
"""Test that tokenizer exceptions and emoticons are handled correctly."""
from __future__ import unicode_literals
import pytest
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
exceptions = ["hu"]
tokens = tokenizer(text)
if tokens[0].lang_ not in exceptions:
assert len(tokens) == length

View File

@ -598,6 +598,24 @@ cdef class Doc:
self.is_tagged = bool(TAG in attrs or POS in attrs)
return self
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Doc): The modified `Doc` object.
"""
raise NotImplementedError()
def to_bytes(self):
"""Serialize, i.e. export the document contents to a binary string.

View File

@ -78,27 +78,86 @@ def ensure_path(path):
return path
def resolve_model_path(name):
"""Resolve a model name or string to a model path.
def load_model(name):
"""Load a model from a shortcut link, package or data path.
name (unicode): Package name, shortcut link or model path.
RETURNS (Path): Path to model data directory.
RETURNS (Language): `Language` class with the loaded model.
"""
data_path = get_data_path()
if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
if isinstance(name, basestring_):
if (data_path / name).exists(): # in data dir or shortcut link
return (data_path / name)
if is_package(name): # installed as a package
return get_model_package_path(name)
if Path(name).exists(): # path to model
return Path(name)
elif hasattr(name, 'exists'): # Path or Path-like object
return name
if (data_path / name).exists(): # in data dir or shortcut
return load_model_from_path(data_path / name)
if is_package(name): # installed as package
return load_model_from_pkg(name)
if Path(name).exists(): # path to model data directory
return load_data_from_path(Path(name))
elif hasattr(name, 'exists'): # Path or Path-like to model data
return load_data_from_path(name)
raise IOError("Can't find model '%s'" % name)
def load_model_from_init_py(init_file):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = Path(init_file).parent
return load_data_from_path(model_path, package=True)
def load_model_from_path(model_path):
"""Import and load a model package from its file path.
path (unicode or Path): Path to package directory.
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
spec = importlib.util.spec_from_file_location('model', model_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module.load()
def load_model_from_pkg(name):
"""Import and load a model package.
name (unicode): Name of model package installed via pip.
RETURNS (Language): `Language` class with loaded model.
"""
module = importlib.import_module(name)
return module.load()
def load_data_from_path(model_path, package=False):
"""Initialie a `Language` class with a loaded model from a model data path.
model_path (unicode or Path): Path to model data directory.
package (bool): Does the path point to the parent package directory?
RETURNS (Language): `Language` class with loaded model.
"""
model_path = ensure_path(model_path)
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % location)
meta = read_json(location)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise IOError('No %s setting found in model meta.json' % setting)
if package:
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
model_path = model_path / model_data_path
if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(model_path))
cls = get_lang_class(meta['lang'])
nlp = cls(pipeline=meta.get('pipeline', True))
return nlp.from_disk(model_path)
def is_package(name):
"""Check if string maps to a package installed via pip.
@ -112,36 +171,16 @@ def is_package(name):
return False
def get_model_package_path(package_name):
"""Get path to a model package installed via pip.
def get_package_path(name):
"""Get the path to an installed package.
package_name (unicode): Name of installed package.
RETURNS (Path): Path to model data directory.
name (unicode): Package name.
RETURNS (Path): Path to installed package.
"""
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
# Python's installation and import rules are very complicated.
pkg = importlib.import_module(package_name)
package_path = Path(pkg.__file__).parent.parent
meta = parse_package_meta(package_path / package_name)
model_name = '%s-%s' % (package_name, meta['version'])
return package_path / package_name / model_name
def parse_package_meta(package_path, require=True):
"""Check if a meta.json exists in a package and return its contents.
package_path (Path): Path to model package directory.
require (bool): If True, raise error if no meta.json is found.
RETURNS (dict or None): Model meta.json data or None.
"""
location = package_path / 'meta.json'
if location.is_file():
return read_json(location)
elif require:
raise IOError("Could not read meta.json from %s" % location)
else:
return None
return Path(pkg.__file__).parent
def is_in_jupyter():
@ -174,12 +213,16 @@ def get_async(stream, numpy_array):
array.set(numpy_array, stream=stream)
return array
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased --
and yielding them sometime later. Obviously, this is not unbiased
but should be good enough for batching. Larger bufsize means less bias.
From https://gist.github.com/andres-erbsen/1307752
iterable (iterable): Iterator to shuffle.
bufsize (int): Items to hold back.
YIELDS (iterable): The shuffled iterator.
"""
iterable = iter(iterable)
buf = []
@ -313,10 +356,33 @@ def normalize_slice(length, start, stop, step=None):
return start, stop
def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items():
if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def compounding(start, stop, compound):
"""Yield an infinite series of compounding values. Each time the
generator is called, a value is produced by multiplying the previous
value by the compound rate.
EXAMPLE:
>>> sizes = compounding(1., 10., 1.5)
>>> assert next(sizes) == 1.
>>> assert next(sizes) == 1 * 1.5
>>> assert next(sizes) == 1.5 * 1.5
"""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
curr = float(start)
while True:
yield clip(curr)
curr *= compound
def decaying(start, stop, decay):
"""Yield an infinite series of linearly decaying values."""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
nr_upd = 1.
while True:
yield clip(start * 1./(1. + decay * nr_upd))
nr_upd += 1
def read_json(location):
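
compounding() documents its behaviour with an inline example; decaying() can be illustrated the same way. With a non-zero decay the values follow start * 1 / (1 + decay * nr_upd), clipped at stop:

from spacy.util import decaying

dropout = decaying(0.4, 0.1, 0.01)
print([round(next(dropout), 3) for _ in range(4)])
# expected: [0.396, 0.392, 0.388, 0.385]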

View File

@ -44,8 +44,6 @@ cdef class Vocab:
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):

View File

@ -37,15 +37,17 @@ mixin svg(file, name, width, height)
size - [integer] icon width and height (default: 20)
mixin icon(name, size)
+svg("icons", name, size || 20).o-icon&attributes(attributes)
- var size = size || 20
+svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes)
//- Pro/Con/Neutral icon
icon - [string] "pro", "con" or "neutral" (default: "neutral")
size - [integer] icon size (optional)
mixin procon(icon)
mixin procon(icon, size)
- colors = { pro: "green", con: "red", neutral: "yellow" }
+icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
+icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes)
//- Headlines Helper Mixin
@ -184,3 +186,14 @@ mixin landing-header()
mixin landing-badge(url, graphic, alt, size)
+a(url)(aria-label=alt title=alt).c-landing__badge
+svg("graphics", graphic, size || 225)
//- Under construction (temporary)
Marks sections that still need to be completed for the v2.0 release.
mixin under-construction()
+infobox("🚧 Under construction")
| This section is still being written and will be updated for the v2.0
| release. Is there anything that you think should definitely mentioned or
| explained here? Any examples you'd like to see? #[strong Let us know]
| on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!

View File

@ -103,9 +103,11 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
icon - [string] icon to display next to code block, mostly used for old/new
height - [integer] optional height to clip code block to
mixin code(label, language, icon)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
mixin code(label, language, icon, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
@ -176,7 +178,7 @@ mixin label()
//- Tag
mixin tag()
span.u-text-tag.u-text-tag--spaced(aria-hidden="true")
span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes)
block
@ -190,6 +192,17 @@ mixin tag-model(...capabs)
+help(intro + ext + ".").u-color-theme
//- "New" tag to label features new in a specific version
By using a separate mixin with a version ID, it becomes easy to quickly
enable/disable tags without having to modify the markup in the docs.
version - [string or integer] version number, without "v" prefix
mixin tag-new(version)
- var version = (typeof version == 'number') ? version.toFixed(1) : version
+tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.")
| v#{version}
//- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number
@ -350,7 +363,22 @@ mixin pos-row(tag, pos, morph, desc)
| #[code=m]
+cell.u-text-small=desc
mixin dep-row(label, desc)
+row
+cell #[code=label]
+cell=desc
//- Table rows for linguistic annotations
annots [array] - array of cell content
style [array] - array of 1 (display as code) or 0 (display as text)
mixin annotation-row(annots, style)
+row
for cell, i in annots
if style && style[i]
- cell = (typeof(cell) != 'boolean') ? cell : cell ? 'True' : 'False'
+cell #[code=cell]
else
+cell=cell

View File

@ -6,9 +6,17 @@ include _sidebar
main.o-main.o-main--sidebar.o-main--aside
article.o-content
+h(1)=title
if tag
+tag=tag
+grid.o-no-block
+grid-col(source ? "two-thirds" : "full")
+h(1)=title
if tag
+tag=tag
if source
+grid-col("third").u-text-right
.o-inline-list
+button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)]
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")

View File

@ -1,128 +1,128 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="736">
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
<style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
.text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" }
.svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__architecture__text-code { fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<text class="text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z" pointer-events="none"/>
<ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 432v242.8" stroke-dasharray="2 2"/>
<path fill="#82b366" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M345 680.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(324 535.5)" width="37" height="18"/>
<text class="text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z" pointer-events="none"/>
<text class="svg__architecture__text-small" dy="1em" style="fill: #82b366" transform="translate(324 535.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M457 434l100.5 80"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M562.3 517.6l-8.8-1.8 4-2 1-4.3z"/>
<rect fill="#f6f6f6" transform="translate(424.5 462.5)" width="158" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<text class="text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(424.5 462.5)" width="158" height="18">nlp.vocab.morphology</text>
<ellipse cx="404" cy="399" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" dy="1em" transform="translate(377.5 386.5)" width="51" height="22">Vocab</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 253v87.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 346.8l-4-8 4 2 4-2z"/>
<rect fill="#f6f6f6" transform="translate(364.5 285.5)" width="79" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<text class="text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(364.5 285.5)" width="79" height="18">nlp.vocab</text>
<ellipse cx="743" cy="399" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" transform="translate(694.5 386.5)" dy="1em" width="95" height="22">StringStore</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 399h181.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M665.8 399l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(498.5 388.5)" width="137" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(498.5 388.5)" width="137" height="18">nlp.vocab.strings</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M108 244l235.6 115.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M349 362h-9l3.6-2.6V355z"/>
<rect fill="#f6f6f6" transform="translate(141.5 284.5)" width="151" height="18" />
<text class="text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z" pointer-events="none"/>
<text class="text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(141.5 284.5)" width="151" height="18">nlp.tokenizer.vocab</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M38.7 159.3H104l33 43.6-32.8 43.5H38.7L6 203z"/>
<text class="svg__architecture__text-large" transform="translate(30.5 190.5)" dy="1em" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 203v-1H145.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M139.2 202l8-4-2 4 2 4z"/>
<rect fill="#f6f6f6" transform="translate(188.5 191.5)" width="115" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(188.5 191.5)" width="115" height="18">nlp.make_doc()</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M478 203h83v-4h105.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M672.8 199l-8 4 2-4-2-4z"/>
<rect fill="#f6f6f6" transform="translate(512.5 191.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(512.5 191.5)" width="101" height="18">nlp.pipeline</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M709 242.8L464.4 359.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M459 362l5.5-7v4.4l3.5 2.8z"/>
<rect fill="#f6f6f6" transform="translate(505.5 297.5)" width="166" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z" pointer-events="none"/>
<text transform="translate(221.5 77.5)" class="text-small" dy="0.85em" width="10" height="14">ja</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z" pointer-events="none"/>
<text class="text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z" pointer-events="none"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(505.5 297.5)" width="166" height="18">nlp.pipeline[i].vocab</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M275.3 34.6L288.6 1h54L329 34.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(301.5 9.5)" width="12" height="14">pt</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M60.8 34.6L74.3 1h54l-13.6 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(86.5 9.5)" width="14" height="14">en</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M114.4 34.6L128 1h53.8l-13.5 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(140.5 9.5)" width="14" height="14">de</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M168 34.6L181.5 1h54l-13.6 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(196.5 9.5)" width="8" height="14">fr</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M221.6 34.6L235 1h54l-13.5 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(248.5 9.5)" width="12" height="14">es</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M47 68.3l13.6-33.6h53.8L101 68.3z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(75.5 43.5)" width="8" height="14">it</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M100.7 68.3l13.5-33.6H168l-13.4 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(127.5 43.5)" width="12" height="14">nl</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M154.3 68.3l13.5-33.6h53.8l-13.4 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(180.5 43.5)" width="12" height="14">sv</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M208 68.3l13.4-33.6h53.8L262 68.3z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(236.5 43.5)" width="8" height="14">fi</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M261.5 68.3L275 34.7h54l-13.6 33.6z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(286.5 43.5)" width="16" height="14">nb</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M33.4 102L47 68.2h53.7L87.3 102z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(58.5 77.5)" width="16" height="14">hu</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M87 102l13.5-33.7h53.8L141 102z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(112.5 77.5)" width="14" height="14">he</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M140.6 102L154 68.2h54L194.4 102z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(165.5 77.5)" width="16" height="14">bn</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M194.2 102l13.5-33.7h53.8L248 102z"/>
<text transform="translate(221.5 77.5)" class="svg__architecture__text-small" dy="0.85em" width="10" height="14">ja</text>
<path fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" d="M247.8 102l13.5-33.7H315L301.8 102z"/>
<text class="svg__architecture__text-small" dy="0.85em" transform="translate(273.5 77.5)" width="14" height="14">zh</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M329 51h75v93.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M404 150.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M213 480l109.3-76.3"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 400.3L323 408l-.8-4.3-4-2z"/>
<rect fill="#f6f6f6" transform="translate(226.5 431.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(226.5 431.5)" width="79" height="18">doc.vocab</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M39.6 555.5l.4 121.3" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M40 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(23.5 604.5)" width="37" height="18"/>
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z" pointer-events="none"/>
<text class="text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z" pointer-events="none"/>
<text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(23.5 604.5)" width="37" height="18">MAKES</text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M1 479.5h283v74.8H1z"/>
<text class="svg__architecture__text-large" dy="1em" transform="translate(125.5 504.5)" width="32" height="22">Doc</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 246v117h1v108.8" stroke-dasharray="2 2"/>
<path fill="#c00" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M72 477.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(54.5 355.5)" width="37" height="18"/>
<text class="text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z" pointer-events="none"/>
<text class="svg__architecture__text-small" style="fill: #cc0000" dy="1em" dx="-0.5em" transform="translate(54.5 355.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104 685l.4-121.2"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M104.5 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(62.5 632.5)" width="79" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(62.5 632.5)" width="79" height="18">token.doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M7.2 685h129.6v50H7.2z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(49.5 700.5)" width="43" height="18">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M148 685h129.7v50H148z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(193.5 700.5)" width="37" height="18">Span</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 686V456.6"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M405 450.6l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(356.5 584.5)" width="101" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2" pointer-events="none"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z" pointer-events="none"/>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(356.5 584.5)" width="101" height="18">lexeme.vocab</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M296.7 685h155.8v50H296.7z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(346.5 700.5)" width="55" height="18">Lexeme</text>
<path fill="none" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M180.5 559.3l.5 117.5" stroke-dasharray="2 2"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M181 682.8l-4-8 4 2 4-2z"/>
<rect fill="#fff" transform="translate(164.5 606.5)" width="37" height="18" />
<text class="text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564" pointer-events="none"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z" pointer-events="none"/>
<text class="svg__architecture__text-small" style="fill: #9673a6" dy="1em" dx="-0.5em" transform="translate(164.5 606.5)" width="37" height="18">MAKES</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 685V564"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M245.3 557.8l4 8-4-2-4 2z"/>
<rect fill="#f6f6f6" transform="translate(211.5 633.5)" width="72" height="18"/>
<text class="text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z" pointer-events="none"/>
<text class="text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z" pointer-events="none"/>
<text class="text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8" pointer-events="none"/>
<text class="text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
<text class="svg__architecture__text-code" dy="1em" dx="0.5em" transform="translate(211.5 633.5)" width="72" height="18">span.doc</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 112H872l32.8 43.5L872 199h-65.4L774 155.6z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(794.5 135.5)" width="88" height="38">Dependency <tspan dy="1.25em" dx="-4.1em">Parser</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.6 199H872l32.8 43.8-32.8 43.6h-65.4L774 242.8z"/>
<text class="svg__architecture__text-medium" dy="1em" dx="1.1em" transform="translate(799.5 222.5)" width="78" height="38">Entity <tspan dy="1.25em" dx="-3.75em">Recognizer</tspan></text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708 155.5h65.6l32.7 43.6-32.7 43.8H708L675.5 199z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(715.5 189.5)" width="48" height="18">Tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M806.8 24.5h65.5L905 68 872.3 112h-65.5L774 68z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(809.5 58.5)" width="58" height="18">Matcher</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M708.6 68H774l32.8 43.5L774 155h-65.4L676 111.6z"/>
<text class="svg__architecture__text-medium" dy="1em" transform="translate(698.5 101.5)" width="84" height="18">Lemmatizer</text>
<ellipse cx="617" cy="555" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="svg__architecture__text-large" dy="1em" transform="translate(565.5 542.5)" width="101" height="22">Morphology</text>
</svg>
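The object relationships labelled in the architecture diagram above (nlp.vocab, doc.vocab, token.doc, span.doc, lexeme.vocab and so on) all resolve to shared objects. A minimal sketch, assuming an installed English model, that checks a few of them:

import spacy

nlp = spacy.load('en')                          # Language instance
doc = nlp(u'I love coffee')                     # the Language makes a Doc
assert doc.vocab is nlp.vocab                   # the Doc shares the Language's Vocab
assert doc[0].doc is doc                        # each Token points back to its Doc
assert doc[0:2].doc is doc                      # so does each Span
assert doc.vocab.strings is nlp.vocab.strings   # one shared StringStore

Because every container goes through the same Vocab and StringStore, each string is stored only once and annotations stay consistent across Doc, Token, Span and Lexeme.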

View File

@ -1,13 +1,13 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
<style>
.text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
.text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
.svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
</style>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M642.7 361.3H708l33 43.6-33 43.5H643L610 405z"/>
<text class="text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
<text class="svg__langdata__text-large" transform="translate(634.5 410)" width="80" height="22">Tokenizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H621v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M621 240.2l4 8-4-2-4 2z"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M855 253v-20.8"/>
@ -17,7 +17,7 @@
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M780 303H504v-56.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M504 240.2l4 8-4-2-4 2z"/>
<ellipse cx="855" cy="303" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<text class="svg__langdata__text-large" transform="translate(815 308)" width="119" height="46">Base data</text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100l.4 39.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389.5 145.8l-4-8 4 2 4-2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17h232v22.8"/>
@ -33,50 +33,50 @@
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M389 100v17H46v22.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M46 145.8l-4-8 4 2 4-2z"/>
<ellipse cx="389" cy="50" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
<text class="text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
<text class="svg__langdata__text-large" transform="translate(350 42)" width="81" height="46">Language <tspan dy="1.45em" dx="-3.25em">data</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M435 193h15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M456.8 193l-8 4 2-4-2-4z"/>
<ellipse cx="390" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
<text class="svg__langdata__text-small" transform="translate(375 187.5)" width="39" height="30">stop <tspan dx="-2.5em" dy="1.25em">words</tspan></text>
<path fill="none" stroke="#9673a6" stroke-width="3" stroke-miterlimit="10" d="M472 225l-1.5 133.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#9673a6" stroke="#9673a6" stroke-width="2" stroke-miterlimit="10" d="M470.4 364.8l-4-8 4 2 4-2z"/>
<ellipse cx="504" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
<text class="svg__langdata__text-small" transform="translate(481 187.5)" width="85" height="30">lexical <tspan dx="-3.75em" dy="1.25em">attributes</tspan></text>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M653 225l5.6 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M659 358.8l-4.5-8 4 2 4-2.2z"/>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M576 193h-18.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M551.2 193l8-4-2 4 2 4z"/>
<ellipse cx="621" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
<text class="svg__langdata__text-small" transform="translate(588 187.5)" width="85" height="30">tokenizer <tspan dx="-4.6em" dy="1.25em">exceptions</tspan></text>
<path fill="none" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M690 193h-15.8"/>
<path fill="#09a3d5" stroke="#09a3d5" stroke-width="2" stroke-miterlimit="10" d="M668.2 193l8-4-2 4 2 4z"/>
<path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M703 225l-10.3 127.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M692.2 358.8l-3.4-8.3 4 2.3 4-1.7z"/>
<ellipse cx="735" cy="193" fill="#f5f5f5" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
<text class="svg__langdata__text-small" transform="translate(705 182)" width="53" height="46">prefixes, <tspan dy="1.25em" dx="-3.9em">suffixes,</tspan> <tspan dy="1.25em" dx="-3.6em">infixes</tspan>
</text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M280 238v114.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M280 358.8l-4-8 4 2 4-2z"/>
<ellipse cx="280" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
<text class="svg__langdata__text-small" transform="translate(254 187.5)" width="71" height="30">lemma <tspan dy="1.25em" dx="-2.7em">data</tspan></text>
<path fill="none" stroke="#d79b00" stroke-width="3" stroke-miterlimit="10" d="M346 404h53.8" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#d79b00" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M405.8 404l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M247.7 361.3H313l33 43.6-33 43.5h-65.3L215 405z"/>
<text class="text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
<text class="svg__langdata__text-large" transform="translate(232 410)" width="100" height="22">Lemmatizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M823 193h-34.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M782.2 193l8-4-2 4 2 4z"/>
<ellipse cx="855" cy="193" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="31.5" ry="31.5"/>
<text class="text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
<text class="svg__langdata__text-tiny" transform="translate(838 189)" width="50" height="30">char <tspan dy="1.1em" dx="-2.75em">classes</tspan></text>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M408 367h124v74H408z"/>
<text class="text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<text class="svg__langdata__text-large" transform="translate(443.5 410)" width="51" height="22">Token</text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M131 225l-21 122.2" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M109 353l-2.5-8.5 3.6 2.7 4.4-1.3z"/>
<ellipse cx="163" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
<text class="svg__langdata__text-small" transform="translate(139 187.5)" width="45" height="30">morph <tspan dy="1.25em" dx="-2.8em">rules</tspan></text>
<path fill="none" stroke="#666" stroke-width="3" stroke-miterlimit="10" d="M78 225l15.4 122" stroke-dasharray="1 6" stroke-linecap="round"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M94.2 353l-5-7.5 4.2 1.5 3.7-2.5z"/>
<ellipse cx="46" cy="193" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="45" ry="45"/>
<text class="text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
<text class="svg__langdata__text-small" transform="translate(33 187.5)" width="27" height="30">tag <tspan dy="1.25em" dx="-1.8em">map</tspan></text>
<ellipse cx="101" cy="405" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="74.5" ry="49.5"/>
<text class="text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
<text class="svg__langdata__text-large" transform="translate(49.5 410)" width="100" height="22">Morphology</text>
</svg>

View File

@ -0,0 +1,30 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
<style>
.svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
<path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
<text class="svg__pipeline__text" dy="0.75em" width="28" height="19" transform="translate(846.5 75.5)">Doc</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M121.2 84.7h29.4"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M156.6 84.7l-8 4 2-4-2-4z"/>
<path fill="#f5f5f5" stroke="#999" stroke-width="2" d="M1 55h120v60H1z"/>
<text class="svg__pipeline__text" dy="0.85em" width="34" height="22" transform="translate(43.5 73.5)">Text</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M760 84.7h33"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M799 84.7l-8 4 2-4-2-4z"/>
<rect width="75" height="39" x="422" y="1" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="5.8" ry="5.8"/>
<text class="svg__pipeline__text-code" dy="0.8em" dx="0.1em" width="29" height="17" transform="translate(444.5 11.5)">nlp</text>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(455.5 80.5)">tagger</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M519 58h103.3L639 88l-16.8 30H519l16.8-30z"/>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="40" height="14" transform="translate(558.5 80.5)">parser</text>
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M622 58h103.3L742 88l-16.8 30H622l16.8-30z"/>
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="20" height="14" transform="translate(671.5 80.5)">ner</text>
</svg>
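The pipeline diagram above shows the tokenizer producing a Doc that is then passed through the remaining components (vectorizer, tagger, parser, ner). A rough sketch of that flow, assuming nlp.pipeline is a plain list of callable components that annotate the Doc in place, as pictured here:

import spacy

nlp = spacy.load('en')
doc = nlp.make_doc(u'This is a sentence.')   # run the tokenizer only
for proc in nlp.pipeline:                    # e.g. tagger, parser, entity recognizer
    proc(doc)                                # each component adds annotations to the same Doc
# roughly what calling nlp(u'This is a sentence.') does in one step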

View File

@ -0,0 +1,123 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
<style>
.svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19"></text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Lets</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
<path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
<path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">s</text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19"></text>
<path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">s</text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19"></text>
<path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
<text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
<rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
<rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
<rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
<rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
<text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
</svg>
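The diagram above steps the tokenizer through "Let's go to N.Y.!": split on whitespace, then repeatedly strip prefixes and suffixes and consult the exception table. A small sketch, assuming the English tokenizer, of the end result:

import spacy

nlp = spacy.load('en')
doc = nlp(u"Let's go to N.Y.!")
print([t.text for t in doc])
# expected output, roughly: ['Let', "'s", 'go', 'to', 'N.Y.', '!']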

View File

@ -0,0 +1,77 @@
<svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
<style>
.svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
.svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
.svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
.svg__vocab__text-code { fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
</style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text>
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text>
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text>
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
<rect fill="#E1D5E7" width="50" height="12" transform="translate(202.5 53.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="50" height="12" transform="translate(202.5 53.5)">nsubj</text>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M363 60h72.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M441.8 60l-8 4 2-4-2-4z"/>
<rect fill="#E1D5E7" width="43" height="12" transform="translate(375.5 54.5)"/>
<text class="svg__vocab__text-code" dx="0.5em" dy="1em" width="43" height="12" transform="translate(375.5 54.5)">dobj</text>
<rect width="50" height="88" x="1" y="246" fill="#666" stroke="#666" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="1em" width="53" height="36" transform="rotate(-90 162 155)">String</text>
<text class="svg__vocab__text-large" dy="2em" width="53" height="36" transform="rotate(-90 162 155)">Store</text>
<rect width="50" height="88" x="1" y="135" fill="#82b366" stroke="#82b366" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="47" height="17" transform="rotate(-90 109.5 93)">Vocab</text>
<rect width="50" height="110" x="1" y="1" fill="#9673a6" stroke="#9673a6" stroke-width="2" rx="7.5" ry="7.5"/>
<text class="svg__vocab__text-large" dx="-0.25em" dy="0.9em" width="31" height="17" transform="rotate(-90 44 27.5)">Doc</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 27h100v66H263z"/>
<text class="svg__vocab__text" dy="1em" width="31" height="33" transform="translate(296.5 42.5)">love</text>
<text class="svg__vocab__text-code" dy="2.8em" width="31" height="33" transform="translate(296.5 42.5)">VERB</text>
<rect width="50" height="20" x="288" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" transform="translate(294.5 19.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 27h100v66H76z"/>
<text class="svg__vocab__text" dx="0.8em" dy="1em" width="29" height="33" transform="translate(110.5 42.5)">I</text>
<text class="svg__vocab__text-code" dy="2.8em" width="29" height="33" transform="translate(110.5 42.5)">PRON</text>
<rect width="50" height="20" x="105" y="17" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(111.5 20.5)">Token</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 27h100v66H444z"/>
<text class="svg__vocab__text" dy="1em" width="45" height="33" transform="translate(470.5 42.5)">coffee</text>
<text class="svg__vocab__text-code" dx="0.6em" dy="2.8em" width="45" height="33" transform="translate(470.5 42.5)">NOUN</text>
<rect width="50" height="20" x="469" y="16" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="36" height="12" transform="translate(475.5 19.5)">Token</text>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 141.8v-38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 149.8l-2.7-8h5.4zM126 95.2l2.7 8h-5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M126 206.2l2.7 8h-5.4zM126 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 95.2l2.7 8h-5.4zM313 149.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M313 206.2l2.7 8h-5.4zM313 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 214.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 206.2l2.7 8h-5.4zM494 260.8l-2.7-8h5.4z"/>
<path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 103.2v38.6"/>
<path fill="#666" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M494 95.2l2.7 8h-5.4zM494 149.8l-2.7-8h5.4z"/>
</svg>
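The diagram above shows how Doc, Vocab and StringStore fit together: each Token points to a shared Lexeme, and all strings are encoded to integer IDs in a single StringStore. A minimal sketch of those lookups, assuming an installed English model (the actual IDs will differ from the ones pictured):

import spacy

nlp = spacy.load('en')
doc = nlp(u'I love coffee')
coffee_id = nlp.vocab.strings[u'coffee']            # string -> integer ID
assert nlp.vocab.strings[coffee_id] == u'coffee'    # ...and back again
assert doc[2].orth == coffee_id                     # the Token stores the ID, not the text
assert nlp.vocab[u'coffee'].orth == coffee_id       # the shared Lexeme uses the same ID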

View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
+h(3, "dependency-parsing-english") English dependency labels
p

View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
+table([ "Type", "Description" ])
+row
+cell #[code PERSON]

View File

@ -1,10 +1,5 @@
//- 💫 DOCS > API > ANNOTATION > POS TAGS
+infobox("Tip")
| In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the
| description for the string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
+h(3, "pos-tagging-english") English part-of-speech tag scheme
p

View File

@ -24,11 +24,11 @@
"Vocab": "vocab",
"StringStore": "stringstore",
"GoldParse": "goldparse",
"GoldCorpus": "goldcorpus"
"GoldCorpus": "goldcorpus",
"Binder": "binder"
},
"Other": {
"Annotation Specs": "annotation",
"Feature Scheme": "features"
"Annotation Specs": "annotation"
}
},
@ -48,62 +48,74 @@
"spacy": {
"title": "spaCy top-level functions",
"source": "spacy/__init__.py",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"source": "spacy/displacy",
"next": "util"
},
"util": {
"title": "Utility Functions",
"source": "spacy/util.py",
"next": "cli"
},
"cli": {
"title": "Command Line Interface"
"title": "Command Line Interface",
"source": "spacy/cli"
},
"language": {
"title": "Language",
"tag": "class"
"tag": "class",
"source": "spacy/language.py"
},
"doc": {
"title": "Doc",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/doc.pyx"
},
"token": {
"title": "Token",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/token.pyx"
},
"span": {
"title": "Span",
"tag": "class"
"tag": "class",
"source": "spacy/tokens/span.pyx"
},
"lexeme": {
"title": "Lexeme",
"tag": "class"
"tag": "class",
"source": "spacy/lexeme.pyx"
},
"vocab": {
"title": "Vocab",
"tag": "class"
"tag": "class",
"source": "spacy/vocab.pyx"
},
"stringstore": {
"title": "StringStore",
"tag": "class"
"tag": "class",
"source": "spacy/strings.pyx"
},
"matcher": {
"title": "Matcher",
"tag": "class"
"tag": "class",
"source": "spacy/matcher.pyx"
},
"dependenyparser": {
@ -123,7 +135,8 @@
"tokenizer": {
"title": "Tokenizer",
"tag": "class"
"tag": "class",
"source": "spacy/tokenizer.pyx"
},
"tagger": {
@ -133,19 +146,23 @@
"goldparse": {
"title": "GoldParse",
"tag": "class"
"tag": "class",
"source": "spacy/gold.pyx"
},
"goldcorpus": {
"title": "GoldCorpus",
"tag": "class"
"tag": "class",
"source": "spacy/gold.pyx"
},
"binder": {
"title": "Binder",
"tag": "class",
"source": "spacy/tokens/binder.pyx"
},
"annotation": {
"title": "Annotation Specifications"
},
"features": {
"title": "Linear Model Feature Scheme"
}
}

View File

@ -14,11 +14,12 @@ p
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.en import English
nlp = English(parser=False)
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
print([t.orth_ for t in tokens])
# ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is
@ -38,6 +39,11 @@ p
+h(2, "pos-tagging") Part-of-speech Tagging
+aside("Tip: Understanding tags")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".
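p
|  A quick way to try this is from the Python interpreter. The snippet below
|  is only a sketch and assumes spaCy is installed; the returned strings come
|  from spaCy's built-in glossary.
+code.
    import spacy
    print(spacy.explain('RB'))     # 'adverb'
    print(spacy.explain('nsubj'))  # 'nominal subject'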
include _annotation/_pos-tags
+h(2, "lemmatization") Lemmatization
@ -50,25 +56,35 @@ p A "lemma" is the uninflected form of a word. In English, this means:
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
+aside("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
| all personal pronouns.
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
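p
|  As a brief sketch of what this looks like in practice (assuming an English
|  model is already loaded as #[code nlp]; the exact lemmas depend on the
|  model):
+code.
    doc = nlp(u'I was reading the paper')
    print([token.lemma_ for token in doc])
    # ['-PRON-', 'be', 'read', 'the', 'paper']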
+h(2, "dependency-parsing") Syntactic Dependency Parsing
+aside("Tip: Understanding labels")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a label. For example,
| #[code spacy.explain("prt")] will return "particle".
include _annotation/_dep-labels
+h(2, "named-entities") Named Entity Recognition
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > API > BINDER
include ../../_includes/_mixins
+under-construction

View File

@ -166,7 +166,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+table(["Argument", "Type", "Description"])
+row
@ -192,18 +192,13 @@ p
+row
+cell #[code --n-iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 15]).
+cell Number of iterations (default: #[code 20]).
+row
+cell #[code --n_sents], #[code -ns]
+cell #[code --n-sents], #[code -ns]
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --use-gpu], #[code -G]
+cell flag
@ -220,7 +215,7 @@ p
+cell Don't train parser.
+row
+cell #[code --no-ner], #[code -N]
+cell #[code --no-entities], #[code -N]
+cell flag
+cell Don't train NER.
@ -229,6 +224,106 @@ p
+cell flag
+cell Show help message and available arguments.
+h(3, "train-hyperparams") Environment variables for hyperparameters
p
| spaCy lets you set hyperparameters for training via environment variables.
| This is useful, because it keeps the command simple and allows you to
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias]
| for your custom #[code train] command while still being able to easily
| tweak the hyperparameters. For example:
+code(false, "bash").
parser_hidden_depth=2 parser_maxout_pieces=1 train-parser
+under-construction
+table(["Name", "Description", "Default"])
+row
+cell #[code dropout_from]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_to]
+cell
+cell #[code 0.2]
+row
+cell #[code dropout_decay]
+cell
+cell #[code 0.0]
+row
+cell #[code batch_from]
+cell
+cell #[code 1]
+row
+cell #[code batch_to]
+cell
+cell #[code 64]
+row
+cell #[code batch_compound]
+cell
+cell #[code 1.001]
+row
+cell #[code token_vector_width]
+cell
+cell #[code 128]
+row
+cell #[code embed_size]
+cell
+cell #[code 7500]
+row
+cell #[code parser_maxout_pieces]
+cell
+cell #[code 2]
+row
+cell #[code parser_hidden_depth]
+cell
+cell #[code 1]
+row
+cell #[code hidden_width]
+cell
+cell #[code 128]
+row
+cell #[code learn_rate]
+cell
+cell #[code 0.001]
+row
+cell #[code optimizer_B1]
+cell
+cell #[code 0.9]
+row
+cell #[code optimizer_B2]
+cell
+cell #[code 0.999]
+row
+cell #[code optimizer_eps]
+cell
+cell #[code 1e-08]
+row
+cell #[code L2_penalty]
+cell
+cell #[code 1e-06]
+row
+cell #[code grad_norm_clip]
+cell
+cell #[code 1.0]
+h(2, "package") Package
p

View File

@ -10,6 +10,7 @@ p
+h(2, "serve") displacy.serve
+tag method
+tag-new(2)
p
| Serve a dependency parse tree or named entity visualization to view it
@ -71,6 +72,7 @@ p
+h(2, "render") displacy.render
+tag method
+tag-new(2)
p Render a dependency parse tree or named entity visualization.

View File

@ -253,6 +253,47 @@ p
+cell #[code Doc]
+cell Itself.
+h(2, "to_disk") Doc.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
+aside-code("Example").
doc.to_disk('/path/to/doc')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Doc.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab()).from_disk('/path/to/doc')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Doc]
+cell The modified #[code Doc] object.
+h(2, "to_bytes") Doc.to_bytes
+tag method

View File

@ -1,138 +0,0 @@
//- 💫 DOCS > API > LINEAR MODEL FEATURES
include ../../_includes/_mixins
p
| There are two popular strategies for putting together machine learning
| models for NLP: sparse linear models, and neural networks. To solve NLP
| problems with linear models, feature templates need to be assembled that
| combine multiple atomic predictors. This page documents the atomic
| predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]],
| #[+api("tagger") #[code Tagger]] and
| #[+api("entityrecognizer") #[code EntityRecognizer]].
p
| To understand the scheme, recall that spaCy's #[code Parser] and
| #[code EntityRecognizer] are implemented as push-down automata. They
| maintain a "stack" that holds the current entity, and a "buffer"
| consisting of the words to be processed.
p
| Each state consists of the words on the stack (if any), which constitute
| the current entity being constructed. We also have the current word, and
| the two subsequent words. Finally, we also have the entities previously
| built.
p
| This gives us a number of tokens to ask questions about, to make the
| features. About each of these tokens, we can ask about a number of
| different properties. Each feature identifier asks about a specific
| property of a specific token of the context.
+h(2, "tokens") Context tokens
+table([ "ID", "Description" ])
+row
+cell #[code S0]
+cell
| The first word on the stack, i.e. the token most recently added
| to the current entity.
+row
+cell #[code S1]
+cell The second word on the stack, i.e. the second most recently added.
+row
+cell #[code S2]
+cell The third word on the stack, i.e. the third most recently added.
+row
+cell #[code N0]
+cell The first word of the buffer, i.e. the current word being tagged.
+row
+cell #[code N1]
+cell The second word of the buffer.
+row
+cell #[code N2]
+cell The third word of the buffer.
+row
+cell #[code P1]
+cell The word immediately before #[code N0].
+row
+cell #[code P2]
+cell The second word before #[code N0].
+row
+cell #[code E0]
+cell The first word of the previously constructed entity.
+row
+cell #[code E1]
+cell The first word of the second previously constructed entity.
p About each of these tokens, we can ask:
+table([ "ID", "Attribute", "Description" ])
+row
+cell #[code N0w]
+cell #[code token.orth]
+cell The word form.
+row
+cell #[code N0W]
+cell #[code token.lemma]
+cell The word's lemma.
+row
+cell #[code N0p]
+cell #[code token.tag]
+cell The word's (full) POS tag.
+row
+cell #[code N0c]
+cell #[code token.cluster]
+cell The word's (full) Brown cluster.
+row
+cell #[code N0c4]
+cell -
+cell First four digit prefix of the word's Brown cluster.
+row
+cell #[code N0c6]
+cell -
+cell First six digit prefix of the word's Brown cluster.
+row
+cell #[code N0L]
+cell -
+cell The word's dependency label. Not used as a feature in the NER.
+row
+cell #[code N0_prefix]
+cell #[code token.prefix]
+cell The first three characters of the word.
+row
+cell #[code N0_suffix]
+cell #[code token.suffix]
+cell The last three characters of the word.
+row
+cell #[code N0_shape]
+cell #[code token.shape]
+cell The word's shape, i.e. is it alphabetic, numeric, etc.
+row
+cell #[code N0_ne_iob]
+cell #[code token.ent_iob]
+cell The Inside/Outside/Begin code of the word's NER tag.
+row
+cell #[code N0_ne_type]
+cell #[code token.ent_type]
+cell The word's NER type.

View File

@ -8,6 +8,7 @@ p
+h(2, "init") GoldCorpus.__init__
+tag method
+tag-new(2)
p Create a #[code GoldCorpus].

View File

@ -2,7 +2,10 @@
include ../../_includes/_mixins
p spaCy currently supports the following languages and capabilities:
p
| spaCy currently provides models for the following languages and
| capabilities:
+aside-code("Download language models", "bash").
python -m spacy download en
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
+row
+cell French #[code fr]
each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
+h(2, "available") Available models
+row
+cell Spanish #[code es]
each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
+cell.u-text-center #[+procon(icon)]
include ../usage/_models-list
p
+button("/docs/usage/models", true, "primary") See available models
+h(2, "alpha-support") Alpha tokenization support
@ -52,9 +59,35 @@ p
| #[+a("https://github.com/mocobeta/janome") Janome].
+table([ "Language", "Code", "Source" ])
each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+row
+cell #{language}
+cell #[code=code]
+cell
+src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
+h(2, "multi-language") Multi-language support
+tag-new(2)
p
| As of v2.0, spaCy supports models trained on more than one language. This
| is especially useful for named entity recognition. The language ID used
| for multi-language or language-neutral models is #[code xx]. The
| language class, a generic subclass containing only the base language data,
| can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
p
| To load your model with the neutral, multi-language class, simply set
| #[code "language": "xx"] in your
| #[+a("/docs/usage/saving-loading#models-generating") model package]'s
| meta.json. You can also import the class directly, or call
| #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
| lazy-loading.
+code("Standard import").
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage()
+code("With lazy-loading").
from spacy.util import get_lang_class
nlp = get_lang_class('xx')

View File

@ -73,15 +73,26 @@ p
+cell The text to be processed.
+row
+cell #[code **disabled]
+cell -
+cell Elements of the pipeline that should not be run.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Doc]
+cell A container for accessing the annotations.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old doc = nlp(u"I don't want parsed", parse=False)
+h(2, "pipe") Language.pipe
+tag method
@ -112,6 +123,13 @@ p
+cell int
+cell The number of texts to buffer.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell yields
+cell #[code Doc]
@ -227,8 +245,11 @@ p
+h(2, "to_disk") Language.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
p
| Save the current state to a directory. If a model is loaded, this will
| #[strong include the model].
+aside-code("Example").
nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
p
| Loads state from a directory. Modifies the object in place and returns
| it. If the saved #[code Language] object contains a model, the
| #[strong model will be loaded].
+aside-code("Example").
from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The modified #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy v2.0, the #[code save_to_directory] method has been
| renamed to #[code to_disk], to improve consistency across classes.
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_disk(disable=['tagger', 'ner'])
+code-old nlp = spacy.load('en', tagger=False, entity=False)
+h(2, "to_bytes") Language.to_bytes
+tag method
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being serialized.
+footrow
+cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+code-old nlp = English().from_bytes('en', tagger=False, entity=False)
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@ -327,6 +382,11 @@ p Load state from a binary string.
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell The tokenizer.
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]

View File

@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation.
+row
+cell #[code is_alpha]
+cell bool
+cell Equivalent to #[code word.orth_.isalpha()].
+cell
| Does the lexeme consist of alphabetic characters? Equivalent to
| #[code lexeme.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+cell
| Does the lexeme consist of ASCII characters? Equivalent to
| #[code all(ord(c) < 128 for c in lexeme.text)].
+row
+cell #[code is_digit]
+cell bool
+cell Equivalent to #[code word.orth_.isdigit()].
+cell
| Does the lexeme consist of digits? Equivalent to
| #[code lexeme.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell Equivalent to #[code word.orth_.islower()].
+cell
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+row
+cell #[code is_title]
+cell bool
+cell Equivalent to #[code word.orth_.istitle()].
+cell
| Is the lexeme in titlecase? Equivalent to
| #[code lexeme.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Equivalent to #[code word.orth_.ispunct()].
+cell Is the lexeme punctuation?
+row
+cell #[code is_space]
+cell bool
+cell Equivalent to #[code word.orth_.isspace()].
+cell
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+row
+cell #[code like_url]
+cell bool
+cell Does the word resemble a URL?
+cell Does the lexeme resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the word resemble an email address?
+cell Does the lexeme resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the word out-of-vocabulary?
+cell Is the lexeme out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the word part of a "stop list"?
+cell Is the lexeme part of a "stop list"?
+row
+cell #[code lang]

View File

@ -5,13 +5,14 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules.
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
| is now called #[+api("matcher#get") #[code matcher.get]].
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
| and #[code Matcher.has_entity] (now redundant) have been removed.
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
| is now called #[+api("matcher#get") #[code matcher.get]].
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
| and #[code Matcher.has_entity] (now redundant) have been removed.
+h(2, "init") Matcher.__init__
+tag method
@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+h(2, "pipe") Matcher.pipe
+tag method
@ -118,6 +119,7 @@ p Match a stream of documents, yielding them in turn.
+h(2, "len") Matcher.__len__
+tag method
+tag-new(2)
p
| Get the number of rules added to the matcher. Note that this only returns
@ -138,6 +140,7 @@ p
+h(2, "contains") Matcher.__contains__
+tag method
+tag-new(2)
p Check whether the matcher contains rules for a match ID.
@ -159,6 +162,7 @@ p Check whether the matcher contains rules for a match ID.
+h(2, "add") Matcher.add
+tag method
+tag-new(2)
p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
@ -198,8 +202,23 @@ p
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID.
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+h(2, "remove") Matcher.remove
+tag method
+tag-new(2)
p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
@ -219,6 +238,7 @@ p
+h(2, "get") Matcher.get
+tag method
+tag-new(2)
p
| Retrieve the pattern stored for a key. Returns the rule as an

View File

@ -1,14 +0,0 @@
//- 💫 DOCS > API > PHILOSOPHY
include ../../_includes/_mixins
p Every product needs to know why it exists. Here's what we're trying to do with spaCy and why it's different from other NLP libraries.
+h(2) 1. No job too big.
p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us — so if we want to read the web, we have to sweat performance.
+h(2) 2. Take a stand.
p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component.
+h(2) 3. Stay current.
p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff.

View File

@ -11,8 +11,13 @@ p
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
| argument in this order. If a model is loaded from a shortcut link or
| package name, spaCy will assume it's a Python package and import it and
| call the model's own #[code load()] method. If a model is loaded from a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings off the meta.json and initialise the #[code Language]
| class. The data will be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
@ -20,12 +25,7 @@ p
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
nlp = spacy.load('en', disable=['parser', 'tagger'])
+table(["Name", "Type", "Description"])
+row
@ -33,11 +33,29 @@ p
+cell unicode or #[code Path]
+cell Model to load, i.e. shortcut link, package name or path.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+code-new nlp = spacy.load('/model')
+code-old nlp = spacy.load('en', path='/model')
+h(2, "info") spacy.info
+tag function
@ -93,3 +111,37 @@ p
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
+h(2, "set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").
def my_factory(vocab):
    def my_component(doc):
        return doc
    return my_component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory'])
+table(["Name", "Type", "Description"])
+row
+cell #[code factory_id]
+cell unicode
+cell
| Unique name of factory. If added to a new pipeline, spaCy will
| look up the factory for this ID and use it to create the
| component.
+row
+cell #[code factory]
+cell callable
+cell
| Callable that takes a #[code Vocab] object and returns a pipeline
| component.
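p
|  As a rough sketch, a factory can close over data that's loaded once when
|  the pipeline is created. The component name and word list below are
|  hypothetical:
+code.
    import spacy
    from spacy.language import Language

    def stopword_flagger(vocab):
        extra_stops = set([u'foo', u'bar'])  # shared data, loaded once
        def component(doc):
            # store the count on the Doc's user_data dict
            doc.user_data['n_extra_stops'] = sum(t.text in extra_stops for t in doc)
            return doc
        return component

    spacy.set_factory('stopword_flagger', stopword_flagger)
    nlp = Language(pipeline=['stopword_flagger'])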

View File

@ -104,6 +104,7 @@ p
+h(2, "to_disk") StringStore.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
@ -118,8 +119,9 @@ p Save the current state to a directory.
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+h(2, "from_disk") StringStore.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
@ -137,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
@ -157,9 +159,9 @@ p Serialize the current state to a binary string.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.

View File

@ -338,8 +338,10 @@ p The L2 norm of the token's vector representation.
+cell #[code ent_iob]
+cell int
+cell
| IOB code of named entity tag.
| #[code 1="I", 2="O", 3="B"]. #[code 0] means no tag is assigned.
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_iob_]
@ -368,116 +370,131 @@ p The L2 norm of the token's vector representation.
+cell #[code lemma]
+cell int
+cell
| Base form of the word, with no inflectional suffixes.
| Base form of the token, with no inflectional suffixes.
+row
+cell #[code lemma_]
+cell unicode
+cell Base form of the word, with no inflectional suffixes.
+cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code lower]
+cell int
+cell Lower-case form of the word.
+cell Lower-case form of the token.
+row
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the word.
+cell Lower-case form of the token.
+row
+cell #[code shape]
+cell int
+cell Transform of the word's string, to show orthographic features.
+cell
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row
+cell #[code shape_]
+cell unicode
+cell A transform of the word's string, to show orthographic features.
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
+row
+cell #[code prefix]
+cell int
+cell Integer ID of a length-N substring from the start of the
| word. Defaults to #[code N=1].
| token. Defaults to #[code N=1].
+row
+cell #[code prefix_]
+cell unicode
+cell
| A length-N substring from the start of the word. Defaults to
| A length-N substring from the start of the token. Defaults to
| #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell
| Length-N substring from the end of the word. Defaults to #[code N=3].
| Length-N substring from the end of the token. Defaults to #[code N=3].
+row
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
+row
+cell #[code is_alpha]
+cell bool
+cell Equivalent to #[code word.orth_.isalpha()].
+cell
| Does the token consist of alphabetic characters? Equivalent to
| #[code token.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]].
+cell
| Does the token consist of ASCII characters? Equivalent to
| #[code all(ord(c) < 128 for c in token.text)].
+row
+cell #[code is_digit]
+cell bool
+cell Equivalent to #[code word.orth_.isdigit()].
+cell
| Does the token consist of digits? Equivalent to
| #[code token.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell Equivalent to #[code word.orth_.islower()].
+cell
| Is the token in lowercase? Equivalent to
| #[code token.text.islower()].
+row
+cell #[code is_title]
+cell bool
+cell Equivalent to #[code word.orth_.istitle()].
+cell
| Is the token in titlecase? Equivalent to
| #[code token.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Equivalent to #[code word.orth_.ispunct()].
+cell Is the token punctuation?
+row
+cell #[code is_space]
+cell bool
+cell Equivalent to #[code word.orth_.isspace()].
+cell
| Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()].
+row
+cell #[code like_url]
+cell bool
+cell Does the word resemble a URL?
+cell Does the token resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the word resemble an email address?
+cell Does the token resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the word out-of-vocabulary?
+cell Is the token out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the word part of a "stop list"?
+cell Is the token part of a "stop list"?
+row
+cell #[code pos]

View File

@ -198,91 +198,6 @@ p
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])

View File

@ -1,12 +1,10 @@
//- 💫 DOCS > API > ANNOTATION SPECS
//- 💫 DOCS > API > UTIL
include ../../_includes/_mixins
p
| spaCy comes with a small collection of utility functions located in
| #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
+infobox("Important note")
| Because utility functions are mostly intended for
| #[strong internal use within spaCy], their behaviour may change with
| future releases. The functions documented on this page should be safe
@ -74,14 +72,23 @@ p
+cell #[code Language]
+cell Language class.
+h(2, "resolve_model_path") util.resolve_model_path
+h(2, "load_model") util.load_model
+tag function
+tag-new(2)
p Resolve a model name or string to a model path.
p
| Load a model from a shortcut link, package or data path. If called with a
| shortcut link or package name, spaCy will assume the model is a Python
| package and import and call its #[code load()] method. If called with a
| path, spaCy will assume it's a data directory, read the language and
| pipeline settings from the meta.json and initialise a #[code Language]
| class. The model data will then be loaded in via
| #[+api("language#from_disk") #[code Language.from_disk()]].
+aside-code("Example").
model_path = util.resolve_model_path('en')
model_path = util.resolve_model_path('/path/to/en')
nlp = util.load_model('en')
nlp = util.load_model('en_core_web_sm')
nlp = util.load_model('/path/to/data')
+table(["Name", "Type", "Description"])
+row
@ -91,8 +98,33 @@ p Resolve a model name or string to a model path.
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "load_model_from_init_py") util.load_model_from_init_py
+tag function
+tag-new(2)
p
| A helper function to use in the #[code load()] method of a model package's
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
+aside-code("Example").
from spacy.util import load_model_from_init_py
def load():
    return load_model_from_init_py(__file__)
+table(["Name", "Type", "Description"])
+row
+cell #[code init_file]
+cell unicode
+cell Path to model's __init__.py, i.e. #[code __file__].
+footrow
+cell returns
+cell #[code Language]
+cell #[code Language] class with the loaded model.
+h(2, "is_package") util.is_package
+tag function
@ -116,16 +148,18 @@ p
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") util.get_model_package_path
+h(2, "get_package_path") util.get_package_path
+tag function
+tag-new(2)
p
| Get path to a #[+a("/docs/usage/models") model package] installed via pip.
| Currently imports the package to find it and parse its meta data.
| Get path to an installed package. Mainly used to resolve the location of
| #[+a("/docs/usage/models") model packages]. Currently imports the package
| to find its path.
+aside-code("Example").
util.get_model_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
util.get_package_path('en_core_web_sm')
# /usr/lib/python3.6/site-packages/en_core_web_sm
+table(["Name", "Type", "Description"])
+row
@ -136,39 +170,11 @@ p
+footrow
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+h(2, "parse_package_meta") util.parse_package_meta
+tag function
p
| Check if a #[code meta.json] exists in a model package and return its
| contents.
+aside-code("Example").
if util.is_package('en_core_web_sm'):
    path = util.get_model_package_path('en_core_web_sm')
    meta = util.parse_package_meta(path, require=True)
# {'name': 'core_web_sm', 'lang': 'en', ...}
+table(["Name", "Type", "Description"])
+row
+cell #[code package_path]
+cell #[code Path]
+cell Path to model package directory.
+row
+cell #[code require]
+cell #[code bool]
+cell If #[code True], raise error if no #[code meta.json] is found.
+footrow
+cell returns
+cell dict / #[code None]
+cell Model meta data or #[code None].
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
+tag-new(2)
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
@ -221,11 +227,12 @@ p
+h(2, "prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the #[+a("/docs/usage/cli") CLI].
| be used for interactive components like the #[+api("cli") cli].
+aside-code("Example").
data_path = Path('/some/path')

View File

@ -159,6 +159,7 @@ p
+h(2, "to_disk") Vocab.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
@ -175,6 +176,7 @@ p Save the current state to a directory.
+h(2, "from_disk") Vocab.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.

View File

@ -3,28 +3,25 @@
"Get started": {
"Installation": "./",
"Models": "models",
"spaCy 101": "spacy-101",
"Lightning tour": "lightning-tour",
"Visualizers": "visualizers",
"Troubleshooting": "troubleshooting",
"What's new in v2.0": "v2"
},
"Workflows": {
"spaCy 101": "spacy-101",
"Loading the pipeline": "language-processing-pipeline",
"Processing text": "processing-text",
"spaCy's data model": "data-model",
"Guides": {
"POS tagging": "pos-tagging",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",
"Custom pipelines": "customizing-pipeline",
"Rule-based matching": "rule-based-matching",
"Word vectors": "word-vectors-similarities",
"Deep learning": "deep-learning",
"Custom tokenization": "customizing-tokenizer",
"Rule-based matching": "rule-based-matching",
"Adding languages": "adding-languages",
"Processing pipelines": "language-processing-pipeline",
"Deep learning": "deep-learning",
"Production use": "production-use",
"Training": "training",
"Training NER": "training-ner",
"Saving & loading": "saving-loading"
"Saving & loading": "saving-loading",
"Visualizers": "visualizers"
},
"Examples": {
"Tutorials": "tutorials",
@ -38,55 +35,33 @@
"quickstart": true
},
"v2": {
"title": "What's new in v2.0"
},
"models": {
"title": "Models",
"next": "lightning-tour",
"next": "spacy-101",
"quickstart": true
},
"spacy-101": {
"title": "spaCy 101",
"next": "lightning-tour"
},
"lightning-tour": {
"title": "Lightning tour",
"next": "spacy-101"
"next": "v2"
},
"visualizers": {
"title": "Visualizers"
},
"troubleshooting": {
"title": "Troubleshooting",
"next": "resources"
"v2": {
"title": "What's new in v2.0"
},
"resources": {
"title": "Resources"
},
"spacy-101": {
"title": "spaCy 101"
},
"language-processing-pipeline": {
"title": "Loading a language processing pipeline",
"next": "processing-text"
},
"customizing-pipeline": {
"title": "Customizing the pipeline",
"next": "customizing-tokenizer"
},
"processing-text": {
"title": "Processing text",
"next": "data-model"
},
"data-model": {
"title": "Understanding spaCy's data model"
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},
"dependency-parse": {
@ -95,25 +70,43 @@
},
"entity-recognition": {
"title": "Entity recognition",
"title": "Named Entity Recognition",
"next": "training-ner"
},
"word-vectors-similarities": {
"title": "Using word vectors and semantic similarities",
"next": "customizing-tokenizer"
},
"customizing-tokenizer": {
"title": "Customising the tokenizer",
"next": "rule-based-matching"
},
"rule-based-matching": {
"title": "Rule-based matching"
"title": "Rule-based matching",
"next": "adding-languages"
},
"word-vectors-similarities": {
"title": "Using word vectors and semantic similarities"
"adding-languages": {
"title": "Adding languages",
"next": "training"
},
"language-processing-pipeline": {
"title": "Language processing pipelines",
"next": "deep-learning"
},
"deep-learning": {
"title": "Hooking a deep learning model into spaCy"
"title": "Hooking a deep learning model into spaCy",
"next": "production use"
},
"customizing-tokenizer": {
"title": "Customizing the tokenizer",
"next": "adding-languages"
"production-use": {
"title": "Production use",
"next": "training"
},
"training": {
@ -127,17 +120,7 @@
},
"saving-loading": {
"title": "Saving and loading models"
},
"pos-tagging": {
"title": "Part-of-speech tagging",
"next": "dependency-parse"
},
"adding-languages": {
"title": "Adding languages",
"next": "training"
"title": "Saving, loading and data serialization"
},
"showcase": {

View File

@ -0,0 +1,38 @@
//- 💫 DOCS > USAGE > SPACY 101 > NAMED ENTITIES
p
| A named entity is a "real-world object" that's assigned a name: for
| example, a person, a country, a product or a book title. spaCy can
| #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types]
| of named entities in a document, by asking the model for a
| #[strong prediction]. Because models are statistical and strongly depend
| on the examples they were trained on, this doesn't always work
| #[em perfectly] and might need some tuning later, depending on your use
| case.
p
| Named entities are available as the #[code ents] property of a #[code Doc]:
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
+aside
| #[strong Text]: The original entity text.#[br]
| #[strong Start]: Index of start of entity in the #[code Doc].#[br]
| #[strong End]: Index of end of entity in the #[code Doc].#[br]
| #[strong Label]: Entity label, i.e. type.
+table(["Text", "Start", "End", "Label", "Description"])
- var style = [0, 1, 1, 1, 0]
+annotation-row(["Apple", 0, 5, "ORG", "Companies, agencies, institutions."], style)
+annotation-row(["U.K.", 27, 31, "GPE", "Geopolitical entity, i.e. countries, cities, states."], style)
+annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style)
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its named entities look like:
+codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160)

View File

@ -0,0 +1,60 @@
//- 💫 DOCS > USAGE > SPACY 101 > PIPELINES
p
| When you call #[code nlp] on a text, spaCy first tokenizes the text to
| produce a #[code Doc] object. The #[code Doc] is then processed in several
| different steps; this is also referred to as the
| #[strong processing pipeline]. The pipeline used by the
| #[+a("/docs/usage/models") default models] consists of a
| vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
| component returns the processed #[code Doc], which is then passed on to
| the next component.
+image
include ../../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
+aside
| #[strong Name:] ID of the pipeline component.#[br]
| #[strong Component:] spaCy's implementation of the component.#[br]
| #[strong Creates:] Objects, attributes and properties modified and set by
| the component.
+table(["Name", "Component", "Creates"])
+row
+cell tokenizer
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell #[code Doc]
+row("divider")
+cell vectorizer
+cell #[code Vectorizer]
+cell #[code Doc.tensor]
+row
+cell tagger
+cell #[+api("tagger") #[code Tagger]]
+cell #[code Doc[i].tag]
+row
+cell parser
+cell #[+api("dependencyparser") #[code DependencyParser]]
+cell
| #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents],
| #[code Doc.noun_chunks]
+row
+cell ner
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
p
| The processing pipeline always #[strong depends on the statistical model]
| and its capabilities. For example, a pipeline can only include an entity
| recognizer component if the model includes data to make predictions of
| entity labels. This is why each model will specify the pipeline to use
| in its meta data, as a simple list containing the component names:
+code(false, "json").
"pipeline": ["vectorizer", "tagger", "parser", "ner"]

View File

@ -0,0 +1,62 @@
//- 💫 DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING
p
| After tokenization, spaCy can also #[strong parse] and #[strong tag] a
| given #[code Doc]. This is where the statistical model comes in, which
| enables spaCy to #[strong make a prediction] of which tag or label most
| likely applies in this context. A model consists of binary data and is
| produced by showing a system enough examples for it to make predictions
| that generalise across the language; for example, a word following "the"
| in English is most likely a noun.
p
| Linguistic annotations are available as
| #[+api("token#attributes") #[code Token] attributes]. Like many NLP
| libraries, spaCy #[strong encodes all strings to integers] to reduce
| memory usage and improve efficiency. So to get the readable string
| representation of an attribute, we need to add an underscore #[code _]
| to its name:
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
+aside
| #[strong Text:] The original word text.#[br]
| #[strong Lemma:] The base form of the word.#[br]
| #[strong POS:] The simple part-of-speech tag.#[br]
| #[strong Tag:] The detailed part-of-speech tag.#[br]
| #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
| #[strong Shape:] The word shape: capitalisation, punctuation, digits.#[br]
| #[strong is alpha:] Is the token an alpha character?#[br]
| #[strong is stop:] Is the token part of a stop list, i.e. the most common
| words of the language?#[br]
+table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"])
- var style = [0, 0, 1, 1, 1, 1, 1, 1]
+annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style)
+annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style)
+annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style)
+annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style)
+annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style)
+annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style)
+annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style)
+annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style)
+annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style)
+annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style)
+annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style)
+aside("Tip: Understanding tags and labels")
| Most of the tags and labels look pretty abstract, and they vary between
| languages. #[code spacy.explain()] will show you a short description;
| for example, #[code spacy.explain("VBZ")] returns "verb, 3rd person
| singular present".
p
| Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
| here's what our example sentence and its dependencies look like:
+codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460)

View File

@ -0,0 +1,68 @@
//- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION
p
| If you've been modifying the pipeline, vocabulary vectors and entities, or made
| updates to the model, you'll eventually want
| to #[strong save your progress]: for example, everything that's in your #[code nlp]
| object. This means you'll have to translate its contents and structure
| into a format that can be saved, like a file or a byte string. This
| process is called serialization. spaCy comes with
| #[strong built-in serialization methods] and supports the
| #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol].
+aside("What's pickle?")
| Pickle is Python's built-in object persistence system. It lets you
| transfer arbitrary Python objects between processes. This is usually used
| to save an object to and load it from disk, but it's also used for distributed
| computing, e.g. with
| #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark]
| or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an
| object, you're agreeing to execute whatever code it contains. It's like
| calling #[code eval()] on a string, so don't unpickle objects from
| untrusted sources.
p
| All container classes, i.e. #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and
| #[+api("stringstore") #[code StringStore]] have the following methods
| available:
+table(["Method", "Returns", "Example"])
- style = [1, 0, 1]
+annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style)
+annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
p
| For example, if you've processed a very large document, you can use
| #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
| local machine. This will save the document and its tokens, as well as
| the vocabulary associated with the #[code Doc].
+aside("Why saving the vocab?")
| Saving the vocabulary with the #[code Doc] is important, because the
| #[code Vocab] holds the context-independent information about the words,
| tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
| wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
| those IDs: for example, the word text or the dependency labels. You
| might be saving #[code 446] for "whale", but in a different vocabulary,
| this ID could map to "VERB". Similarly, if your document was processed by
| a German model, its vocab will include the specific
| #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+code.
moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
doc = nlp(moby_dick) # process it
doc.to_disk('/moby_dick.bin') # save the processed Doc
p
| If you need it again later, you can load it back into an empty #[code Doc]
| with an empty #[code Vocab] by calling
| #[+api("doc#from_disk") #[code from_disk()]]:
+code.
from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab
doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc

View File

@ -0,0 +1,44 @@
//- 💫 DOCS > USAGE > SPACY 101 > SIMILARITY
p
| spaCy is able to compare two objects, and make a prediction of
| #[strong how similar they are]. Predicting similarity is useful for
| building recommendation systems or flagging duplicates. For example, you
| can suggest content similar to what a user is currently looking at, or
| label a support ticket as a duplicate if it's very similar to an
| already existing one.
p
| Each #[code Doc], #[code Span] and #[code Token] comes with a
| #[+api("token#similarity") #[code .similarity()]] method that lets you
| compare it with another object, and determine the similarity. Of course,
| similarity is always subjective – whether "dog" and "cat" are similar
| really depends on how you're looking at it. spaCy's similarity model
| usually assumes a pretty general-purpose definition of similarity.
+code.
tokens = nlp(u'dog cat banana')
for token1 in tokens:
for token2 in tokens:
print(token1.similarity(token2))
+aside
| #[strong #[+procon("neutral", 16)] similarity:] identical#[br]
| #[strong #[+procon("pro", 16)] similarity:] similar (higher is more similar) #[br]
| #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar)
+table(["", "dog", "cat", "banana"])
each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]}
+row
+cell.u-text-label.u-color-theme=label
for cell in cells
+cell #[code=cell.toFixed(2)]
| #[+procon(cell < 0.5 ? "con" : cell != 1 ? "pro" : "neutral")]
p
| In this case, the model's predictions are pretty on point. A dog is very
| similar to a cat, whereas a banana is not very similar to either of them.
| Identical tokens are obviously 100% similar to each other (just not always
| exactly #[code 1.0], because of vector math and floating point
| imprecisions).
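p
| The same #[code .similarity()] method is available on #[code Doc] and
| #[code Span] objects, too. As a small sketch – the example sentences here
| are made up and the exact scores will depend on the model – you can
| compare two documents directly:
+code.
doc1 = nlp(u'I like fries and burgers.')
doc2 = nlp(u'Fast food tastes very good.')
print(doc1.similarity(doc2))            # one score for the two documents
print(doc1[2:4].similarity(doc2[0:2]))  # spans work the same way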

View File

@ -0,0 +1,62 @@
//- 💫 DOCS > USAGE > SPACY 101 > TOKENIZATION
p
| During processing, spaCy first #[strong tokenizes] the text, i.e.
| segments it into words, punctuation and so on. This is done by applying
| rules specific to each language. For example, punctuation at the end of a
| sentence should be split off – whereas "U.K." should remain one token.
| Each #[code Doc] consists of individual tokens, and we can simply iterate
| over them:
+code.
for token in doc:
print(token.text)
+table([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).u-text-center
+row
for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
+cell=cell
p
| First, the raw text is split on whitespace characters, similar to
| #[code text.split(' ')]. Then, the tokenizer processes the text from
| left to right. On each substring, it performs two checks:
+list("numbers")
+item
| #[strong Does the substring match a tokenizer exception rule?] For
| example, "don't" does not contain whitespace, but should be split
| into two tokens, "do" and "n't", while "U.K." should always
| remain one token.
+item
| #[strong Can a prefix, suffix or infix be split off?] For example,
| punctuation like commas, periods, hyphens or quotes.
p
| If there's a match, the rule is applied and the tokenizer continues its
| loop, starting with the newly split substrings. This way, spaCy can split
| #[strong complex, nested tokens] like combinations of abbreviations and
| multiple punctuation marks.
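p
| As a small illustration of these rules in action – the sentence below is
| just an example – the exception for "don't" and the suffix rule for the
| exclamation mark both apply, while "U.K." is kept intact:
+code.
doc = nlp(u"They don't live in the U.K.!")
print([token.text for token in doc])
# expected output along these lines:
# ['They', 'do', "n't", 'live', 'in', 'the', 'U.K.', '!']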
+aside
| #[strong Tokenizer exception:] Special-case rule to split a string into
| several tokens or prevent a token from being split when punctuation rules
| are applied.#[br]
| #[strong Prefix:] Character(s) at the beginning, e.g.
| #[code $], #[code (], #[code “], #[code ¿].#[br]
| #[strong Suffix:] Character(s) at the end, e.g.
| #[code km], #[code &#41;], #[code ”], #[code !].#[br]
| #[strong Infix:] Character(s) in between, e.g.
| #[code -], #[code --], #[code /], #[code …].#[br]
+image
include ../../../assets/img/docs/tokenization.svg
.u-text-right
+button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
p
| While punctuation rules are usually pretty general, tokenizer exceptions
| strongly depend on the specifics of the individual language. This is
| why each #[+a("/docs/api/language-models") available language] has its
| own subclass, like #[code English] or #[code German], which loads in lists
| of hard-coded data and exception rules.

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > SPACY 101 > TRAINING
+under-construction

View File

@ -0,0 +1,94 @@
//- 💫 DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE
p
| Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also
| encodes all strings to #[strong integer IDs] – in this case, for example,
| "coffee" has the ID #[code 3572]. Entity labels like "ORG" and
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
| only "speaks" in integer IDs.
+aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
| its attributes, tags and dependencies.#[br]
| #[strong Lexeme]: A "word type" with no context. Includes the word shape
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
| example #[code 3572] &rarr; "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
.u-text-right
+button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
p
| If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time
| would take up way too much space. So instead, spaCy assigns it an ID
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a
| #[strong lookup table that works in both directions] – you can look up a
| string to get its ID, or an ID to get its string:
+code.
doc = nlp(u'I like coffee')
assert doc.vocab.strings[u'coffee'] == 3572
assert doc.vocab.strings[3572] == u'coffee'
p
| Now that all strings are encoded, the entries in the vocabulary
| #[strong don&apos;t need to include the word text] themselves. Instead,
| they can look it up in the #[code StringStore] via its integer ID. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters
| won't ever change.
+code.
for word in doc:
lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
+aside
| #[strong Text]: The original text of the lexeme.#[br]
| #[strong Orth]: The integer ID of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br]
| #[strong is title]: Is the lexeme in titlecase?#[br]
| #[strong Lang]: The language of the parent vocabulary.
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
p
| The specific entries in the vocabulary and their IDs don't really matter
| #[strong as long as they match]. That's why you always need to make sure
| all objects you create have access to the same vocabulary. If they don't,
| the IDs won't match and spaCy will either produce very confusing results,
| or fail altogether.
+code.
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
p
| Even though both #[code Doc] objects contain the same words, the internal
| integer IDs are very different. The same applies for all other strings,
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
| export the vocab if you save a #[code Doc] or #[code nlp] object.
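p
| As a minimal sketch of the same example, reusing the first document's
| vocabulary avoids the mismatch, because both objects then resolve their
| strings through the same #[code StringStore]:
+code.
from spacy.tokens import Doc
doc = nlp(u'I like coffee')                                    # original Doc
shared_doc = Doc(doc.vocab, words=[u'I', u'like', u'coffee'])  # reuse the existing vocab
assert doc.vocab.strings[u'coffee'] == shared_doc.vocab.strings[u'coffee']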

View File

@ -0,0 +1,152 @@
//- 💫 DOCS > USAGE > SPACY 101 > WORD VECTORS
p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/docs/usage/models") default models] come with
| #[strong 300-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
3.28450017e-02, -4.19569999e-01, 7.20689967e-02,
-3.74760002e-01, 5.74599989e-02, -1.24009997e-02,
5.29489994e-01, -5.23800015e-01, -1.97710007e-01,
-3.41470003e-01, 5.33169985e-01, -2.53309999e-02,
1.73800007e-01, 1.67720005e-01, 8.39839995e-01,
5.51070012e-02, 1.05470002e-01, 3.78719985e-01,
2.42750004e-01, 1.47449998e-02, 5.59509993e-01,
1.25210002e-01, -6.75960004e-01, 3.58420014e-01,
-4.00279984e-02, 9.59490016e-02, -5.06900012e-01,
-8.53179991e-02, 1.79800004e-01, 3.38669986e-01,
1.32300004e-01, 3.10209990e-01, 2.18779996e-01,
1.68530002e-01, 1.98740005e-01, -5.73849976e-01,
-1.06490001e-01, 2.66689986e-01, 1.28380001e-01,
-1.28030002e-01, -1.32839993e-01, 1.26570001e-01,
8.67229998e-01, 9.67210010e-02, 4.83060002e-01,
2.12709993e-01, -5.49900010e-02, -8.24249983e-02,
2.24079996e-01, 2.39749998e-01, -6.22599982e-02,
6.21940017e-01, -5.98999977e-01, 4.32009995e-01,
2.81430006e-01, 3.38420011e-02, -4.88150001e-01,
-2.13589996e-01, 2.74010003e-01, 2.40950003e-01,
4.59500015e-01, -1.86049998e-01, -1.04970002e+00,
-9.73049998e-02, -1.89080000e-01, -7.09290028e-01,
4.01950002e-01, -1.87680006e-01, 5.16870022e-01,
1.25200003e-01, 8.41499984e-01, 1.20970003e-01,
8.82389992e-02, -2.91959997e-02, 1.21510006e-03,
5.68250008e-02, -2.74210006e-01, 2.55640000e-01,
6.97930008e-02, -2.22580001e-01, -3.60060006e-01,
-2.24020004e-01, -5.36990017e-02, 1.20220006e+00,
5.45350015e-01, -5.79980016e-01, 1.09049998e-01,
4.21669990e-01, 2.06619993e-01, 1.29360005e-01,
-4.14570011e-02, -6.67770028e-01, 4.04670000e-01,
-1.52179999e-02, -2.76400000e-01, -1.56110004e-01,
-7.91980028e-02, 4.00369987e-02, -1.29439995e-01,
-2.40900001e-04, -2.67850012e-01, -3.81150007e-01,
-9.72450018e-01, 3.17259997e-01, -4.39509988e-01,
4.19340014e-01, 1.83530003e-01, -1.52600005e-01,
-1.08080000e-01, -1.03579998e+00, 7.62170032e-02,
1.65189996e-01, 2.65259994e-04, 1.66160002e-01,
-1.52810007e-01, 1.81229994e-01, 7.02740014e-01,
5.79559989e-03, 5.16639985e-02, -5.97449988e-02,
-2.75510013e-01, -3.90489995e-01, 6.11319989e-02,
5.54300010e-01, -8.79969969e-02, -4.16810006e-01,
3.28260005e-01, -5.25489986e-01, -4.42880005e-01,
8.21829960e-03, 2.44859993e-01, -2.29819998e-01,
-3.49810004e-01, 2.68940002e-01, 3.91660005e-01,
-4.19039994e-01, 1.61909997e-01, -2.62630010e+00,
6.41340017e-01, 3.97430003e-01, -1.28680006e-01,
-3.19460005e-01, -2.56330013e-01, -1.22199997e-01,
3.22750002e-01, -7.99330026e-02, -1.53479993e-01,
3.15050006e-01, 3.05909991e-01, 2.60120004e-01,
1.85530007e-01, -2.40429997e-01, 4.28860001e-02,
4.06219989e-01, -2.42559999e-01, 6.38700008e-01,
6.99829996e-01, -1.40430003e-01, 2.52090007e-01,
4.89840001e-01, -6.10670000e-02, -3.67659986e-01,
-5.50890028e-01, -3.82649988e-01, -2.08430007e-01,
2.28320003e-01, 5.12179971e-01, 2.78679997e-01,
4.76520002e-01, 4.79510017e-02, -3.40079993e-01,
-3.28729987e-01, -4.19669986e-01, -7.54989982e-02,
-3.89539987e-01, -2.96219997e-02, -3.40700001e-01,
2.21699998e-01, -6.28560036e-02, -5.19029975e-01,
-3.77739996e-01, -4.34770016e-03, -5.83010018e-01,
-8.75459984e-02, -2.39289999e-01, -2.47109994e-01,
-2.58870006e-01, -2.98940003e-01, 1.37150005e-01,
2.98919994e-02, 3.65439989e-02, -4.96650010e-01,
-1.81600004e-01, 5.29389977e-01, 2.19919994e-01,
-4.45140004e-01, 3.77979994e-01, -5.70620000e-01,
-4.69460003e-02, 8.18059966e-02, 1.92789994e-02,
3.32459986e-01, -1.46200001e-01, 1.71560004e-01,
3.99809986e-01, 3.62170011e-01, 1.28160000e-01,
3.16439986e-01, 3.75690013e-01, -7.46899992e-02,
-4.84800003e-02, -3.14009994e-01, -1.92860007e-01,
-3.12940001e-01, -1.75529998e-02, -1.75139993e-01,
-2.75870003e-02, -1.00000000e+00, 1.83870003e-01,
8.14339995e-01, -1.89129993e-01, 5.09989977e-01,
-9.19600017e-03, -1.92950002e-03, 2.81890005e-01,
2.72470005e-02, 4.34089988e-01, -5.49669981e-01,
-9.74259973e-02, -2.45399997e-01, -1.72030002e-01,
-8.86500031e-02, -3.02980006e-01, -1.35910004e-01,
-2.77649999e-01, 3.12860007e-03, 2.05559999e-01,
-1.57720000e-01, -5.23079991e-01, -6.47010028e-01,
-3.70139986e-01, 6.93930015e-02, 1.14009999e-01,
2.75940001e-01, -1.38750002e-01, -2.72680014e-01,
6.68910027e-01, -5.64539991e-02, 2.40170002e-01,
-2.67300010e-01, 2.98599988e-01, 1.00830004e-01,
5.55920005e-01, 3.28489989e-01, 7.68579990e-02,
1.55279994e-01, 2.56359994e-01, -1.07720003e-01,
-1.23590000e-01, 1.18270002e-01, -9.90289971e-02,
-3.43279988e-01, 1.15019999e-01, -3.78080010e-01,
-3.90120000e-02, -3.45930010e-01, -1.94040000e-01,
-3.35799992e-01, -6.23340011e-02, 2.89189994e-01,
2.80319989e-01, -5.37410021e-01, 6.27939999e-01,
5.69549985e-02, 6.21469975e-01, -2.52819985e-01,
4.16700006e-01, -1.01079997e-02, -2.54339993e-01,
4.00029987e-01, 4.24320012e-01, 2.26720005e-01,
1.75530002e-01, 2.30489999e-01, 2.83230007e-01,
1.38820007e-01, 3.12180002e-03, 1.70570001e-01,
3.66849989e-01, 2.52470002e-03, -6.40089989e-01,
-2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
-1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)
p
| The #[code .vector] attribute will return an object's vector.
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] will default to an average
| of their token vectors. You can also check if a token has a vector
| assigned, and get the L2 norm, which can be used to normalise
| vectors.
+code.
tokens = nlp(u'dog cat banana sasquatch')
for token in tokens:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
+aside
| #[strong Text]: The original token text.#[br]
| #[strong has vector]: Does the token have a vector representation?#[br]
| #[strong Vector norm]: The L2 norm of the token's vector (the square root
| of the sum of the values squared)#[br]
| #[strong is OOV]: Is the word out-of-vocabulary?
+table(["Text", "Has vector", "Vector norm", "OOV"])
- var style = [0, 1, 1, 1]
+annotation-row(["dog", true, 7.033672992262838, false], style)
+annotation-row(["cat", true, 6.68081871208896, false], style)
+annotation-row(["banana", true, 6.700014292148571, false], style)
+annotation-row(["sasquatch", false, 0, true], style)
p
| The words "dog", "cat" and "banana" are all pretty common in English, so
| they're part of the model's vocabulary, and come with a vector. The word
| "sasquatch" on the other hand is a lot less common and out-of-vocabulary
| so its vector representation consists of 300 dimensions of #[code 0],
| which means it's practically nonexistent.
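p
| Under the hood, similarity scores are computed from these vectors. As a
| rough sketch – using numpy directly, purely to illustrate what the
| vectors contain – you can compute the cosine similarity of two tokens
| yourself:
+code.
import numpy
doc = nlp(u'dog banana')
dog, banana = doc[0], doc[1]
# cosine similarity: dot product divided by the product of the L2 norms
cosine = numpy.dot(dog.vector, banana.vector) / (dog.vector_norm * banana.vector_norm)
print(cosine)   # should roughly match dog.similarity(banana)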
p
| If your application will benefit from a large vocabulary with more
| vectors, you should consider using one of the
| #[+a("/docs/usage/models#available") larger models] instead of the default,
| smaller ones, which usually come with a clipped vocabulary.

View File

@ -104,6 +104,8 @@ p
+image
include ../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+table(["File name", "Variables", "Description"])
+row
@ -436,6 +438,8 @@ p
+h(3, "morph-rules") Morph rules
+under-construction
+h(2, "testing") Testing the new language tokenizer
p
@ -533,8 +537,8 @@ p
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
| script from the spaCy developer resources. Note that your corpus should
| not be preprocessed (i.e. you need punctuation for example). The
| #[+a("/docs/usage/cli#model") #[code model] command] expects a
| tab-separated word frequencies file with three columns:
| #[+api("cli#model") #[code model]] command expects a tab-separated word
| frequencies file with three columns:
+list("numbers")
+item The number of times the word occurred in your language sample.
@ -626,37 +630,20 @@ p
| trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
| The #[code vectors.bin] file should consist of one word and vector per line.
+h(2, "model-directory") Setting up a model directory
p
| Once you've collected the word frequencies, Brown clusters and word
| vectors files, you can use the
| #[+a("/docs/usage/cli#model") #[code model] command] to create a data
| directory:
+code(false, "bash").
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
+aside-code("your_data_directory", "yaml").
//-+aside-code("your_data_directory", "yaml").
├── vocab/
| ├── lexemes.bin # via nlp.vocab.dump(path)
| ├── strings.json # via nlp.vocab.strings.dump(file_)
| └── oov_prob # optional
├── pos/ # optional
| ├── model # via nlp.tagger.model.dump(path)
| └── config.json # via Langage.train
├── deps/ # optional
| ├── model # via nlp.parser.model.dump(path)
| └── config.json # via Langage.train
└── ner/ # optional
├── model # via nlp.entity.model.dump(path)
└── config.json # via Langage.train
p
| This creates a spaCy data directory with a vocabulary model, ready to be
| loaded. By default, the command expects to be able to find your language
| class using #[code spacy.util.get_lang_class(lang_id)].
| ├── lexemes.bin
| ├── strings.json
| └── oov_prob
├── pos/
| ├── model
| └── config.json
├── deps/
| ├── model
| └── config.json
└── ner/
├── model
└── config.json
+h(2, "train-tagger-parser") Training the tagger and parser
@ -666,13 +653,12 @@ p
| If your corpus uses the
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
| i.e. files with the extension #[code .conllu], you can use the
| #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to
| spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training.
| #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format] for training.
p
| Once you have your UD corpus transformed into JSON, you can train your
| model using spaCy's
| #[+a("/docs/usage/cli#train") #[code train] command]:
| model using spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

View File

@ -1,38 +0,0 @@
//- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE
include ../../_includes/_mixins
p
| spaCy provides several linguistic annotation functions by default. Each
| function takes a Doc object, and modifies it in-place. The default
| pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0
| introduced the ability to customise this pipeline with arbitrary
| functions.
+code.
def arbitrary_fixup_rules(doc):
for token in doc:
if token.text == u'bill' and token.tag_ == u'NNP':
token.tag_ = u'NN'
def custom_pipeline(nlp):
return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)
nlp = spacy.load('en', create_pipeline=custom_pipeline)
p
| The easiest way to customise the pipeline is to pass a
| #[code create_pipeline] callback to the #[code spacy.load()] function.
p
| The callback you pass to #[code create_pipeline] should take a single
| argument, and return a sequence of callables. Each callable in the
| sequence should accept a #[code Doc] object and modify it in place.
p
| Instead of passing a callback, you can also write to the
| #[code .pipeline] attribute directly.
+code.
nlp = spacy.load('en')
nlp.pipeline = [nlp.tagger]

View File

@ -11,18 +11,50 @@ p
| #[code spaces] booleans, which allow you to maintain alignment of the
| tokens into the original string.
+aside("See Also")
| If you haven't read up on spaCy's #[+a("data-model") data model] yet,
| you should probably have a look. The main point to keep in mind is that
| spaCy's #[code Doc] doesn't copy or refer to the original string. The
| string is reconstructed from the tokens when required.
+h(2, "101") Tokenizer 101
include _spacy-101/_tokenization
+h(3, "101-data") Tokenizer data
p
| #[strong Global] and #[strong language-specific] tokenizer data is
| supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang].
| The tokenizer exceptions define special cases like "don't" in English,
| which needs to be split into two tokens: #[code {ORTH: "do"}] and
| #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes
| mostly define punctuation rules – for example, when to split off periods
| (at the end of a sentence), and when to leave a token containing periods
| intact (abbreviations like "U.S.").
+image
include ../../assets/img/docs/language_data.svg
.u-text-right
+button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
+infobox
| For more details on the language-specific data, see the
| usage workflow on #[+a("/docs/usage/adding-languages") adding languages].
+h(2, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncrasies that require custom
| tokenization rules. This could be very specific expressions, or
| abbreviations only used in this specific field.
+aside("Language data vs. custom tokenization")
| Tokenization rules that are specific to one language, but can be
| #[strong generalised across that language] should ideally live in the
| language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we
| always appreciate pull requests! Anything that's specific to a domain or
| text type – like financial trading abbreviations, or Bavarian youth slang –
| should be added as a special case rule to your tokenizer instance. If
| you're dealing with a lot of customisations, it might make sense to create
| an entirely custom subclass.
p
| Here's how to add a special case rule to an existing
| #[+api("tokenizer") #[code Tokenizer]] instance:
+code.
@ -30,15 +62,12 @@ p
from spacy.symbols import ORTH, LEMMA, POS
nlp = spacy.load('en')
assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
nlp.tokenizer.add_special_case(u'gimme',
[
{
ORTH: u'gim',
LEMMA: u'give',
POS: u'VERB'},
{
ORTH: u'me'}])
doc = nlp(u'gimme that') # phrase to tokenize
assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization
# add special case rule
special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}]
nlp.tokenizer.add_special_case(u'gimme', special_case)
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
@ -55,9 +84,8 @@ p
| The special case rules have precedence over the punctuation splitting:
+code.
nlp.tokenizer.add_special_case(u'...gimme...?',
[{
ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]
nlp.tokenizer.add_special_case(u'...gimme...?', special_case)
assert len(nlp(u'...gimme...?')) == 1
p
@ -137,8 +165,8 @@ p
+h(2, "native-tokenizers") Customizing spaCy's Tokenizer class
p
| Let's imagine you wanted to create a tokenizer for a new language. There
| are four things you would need to define:
| Let's imagine you wanted to create a tokenizer for a new language or
| specific domain. There are four things you would need to define:
+list("numbers")
+item
@ -170,14 +198,14 @@ p
import re
from spacy.tokenizer import Tokenizer
prefix_re = re.compile(r'''[\[\(&quot;']''')
suffix_re = re.compile(r'''[\]\)&quot;']''')
def create_tokenizer(nlp):
return Tokenizer(nlp.vocab,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search)
prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
nlp = spacy.load('en', tokenizer=create_make_doc)
def create_tokenizer(nlp):
return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
suffix_search=suffix_re.search)
nlp = spacy.load('en', tokenizer=create_tokenizer)
p
| If you need to subclass the tokenizer instead, the relevant methods to
@ -187,29 +215,68 @@ p
+h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
p
| You can pass a custom tokenizer using the #[code make_doc] keyword, when
| you're creating the pipeline:
| The tokenizer is the first component of the processing pipeline and the
| only one that can't be replaced by writing to #[code nlp.pipeline]. This
| is because it has a different signature from all the other components:
| it takes a text and returns a #[code Doc], whereas all other components
| expect to already receive a tokenized #[code Doc].
+code.
import spacy
+image
include ../../assets/img/docs/pipeline.svg
.u-text-right
+button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
nlp = spacy.load('en', make_doc=my_tokenizer)
p
| However, this approach often leaves us with a chicken-and-egg problem.
| To construct the tokenizer, we usually want attributes of the #[code nlp]
| pipeline. Specifically, we want the tokenizer to hold a reference to the
| pipeline's vocabulary object. Let's say we have the following class as
| our tokenizer:
| To overwrite the existing tokenizer, you need to replace
| #[code nlp.tokenizer] with a custom function that takes a text, and
| returns a #[code Doc].
+code.
nlp = spacy.load('en')
nlp.tokenizer = my_tokenizer
+table(["Argument", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell The raw text to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell The tokenized document.
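p
| For illustration only, the #[code my_tokenizer] in the snippet above could
| be a plain function with exactly this signature – here a hypothetical
| tokenizer that treats each semicolon-separated field as one token:
+code.
from spacy.tokens import Doc

def my_tokenizer(text):
    # hypothetical example: one token per semicolon-separated field
    words = [field.strip() for field in text.split(';')]
    return Doc(nlp.vocab, words=words)

nlp.tokenizer = my_tokenizer
doc = nlp(u'first field; second field; third field')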
+infobox("Important note: using a custom tokenizer")
.o-block
| In spaCy v1.x, you had to add a custom tokenizer by passing it to the
| #[code make_doc] keyword argument, or by passing a tokenizer "factory"
| to #[code create_make_doc]. This was unnecessarily complicated. Since
| spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your
| tokenizer needs the vocab, you can write a function and use
| #[code nlp.vocab].
+code-new.
nlp.tokenizer = my_tokenizer
nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
+code-old.
nlp = spacy.load('en', make_doc=my_tokenizer)
nlp = spacy.load('en', create_make_doc=my_tokenizer_factory)
+h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer
p
| To construct the tokenizer, we usually want attributes of the #[code nlp]
| pipeline. Specifically, we want the tokenizer to hold a reference to the
| vocabulary object. Let's say we have the following class as
| our tokenizer:
+code.
import spacy
from spacy.tokens import Doc
class WhitespaceTokenizer(object):
def __init__(self, nlp):
self.vocab = nlp.vocab
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = text.split(' ')
@ -218,28 +285,12 @@ p
return Doc(self.vocab, words=words, spaces=spaces)
p
| As you can see, we need a #[code vocab] instance to construct this — but
| we won't get the #[code vocab] instance until we get back the #[code nlp]
| object from #[code spacy.load()]. The simplest solution is to build the
| object in two steps:
| As you can see, we need a #[code Vocab] instance to construct this — but
| we won't have it until we get back the loaded #[code nlp] object. The
| simplest solution is to build the tokenizer in two steps. This also means
| that you can reuse the "tokenizer factory" and initialise it with
| different instances of #[code Vocab].
+code.
nlp = spacy.load('en')
nlp.make_doc = WhitespaceTokenizer(nlp)
p
| You can instead pass the class to the #[code create_make_doc] keyword,
| which is invoked as callback once the #[code nlp] object is ready:
+code.
nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer)
p
| Finally, you can of course create your own subclasses, and create a bound
| #[code make_doc] method. The disadvantage of this approach is that spaCy
| uses inheritance to give each language-specific pipeline its own class.
| If you're working with multiple languages, a naive solution will
| therefore require one custom class per language you're working with.
| This might be at least annoying. You may be able to do something more
| generic by doing some clever magic with metaclasses or mixins, if that's
| the sort of thing you're into.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
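p
| A quick usage sketch – with this tokenizer in place, the text is split on
| single spaces only, so punctuation stays attached to the neighbouring
| words:
+code.
doc = nlp(u"What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])
# ["What's", 'happened', 'to', 'me?', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.']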

View File

@ -1,264 +0,0 @@
//- 💫 DOCS > USAGE > SPACY'S DATA MODEL
include ../../_includes/_mixins
p After reading this page, you should be able to:
+list
+item Understand how spaCy's Doc, Span, Token and Lexeme object work
+item Start using spaCy's Cython API
+item Use spaCy more efficiently
+h(2, "architecture") Architecture
+image
include ../../assets/img/docs/architecture.svg
+h(2, "design-considerations") Design considerations
+h(3, "no-job-too-big") No job too big
p
| When writing spaCy, one of my mottos was #[em no job too big]. I wanted
| to make sure that if Google or Facebook were founded tomorrow, spaCy
| would be the obvious choice for them. I wanted spaCy to be the obvious
| choice for web-scale NLP. This meant sweating about performance, because
| for web-scale tasks, Moore's law can't save you.
p
| Most computational work gets less expensive over time. If you wrote a
| program to solve fluid dynamics in 2008, and you ran it again in 2014,
| you would expect it to be cheaper. For NLP, it often doesn't work out
| that way. The problem is that we're writing programs where the task is
| something like "Process all articles in the English Wikipedia". Sure,
| compute prices dropped from $0.80 per hour to $0.20 per hour on AWS in
| 2008-2014. But the size of Wikipedia grew from 3GB to 11GB. Maybe the
| job is a #[em little] cheaper in 2014 — but not by much.
+h(3, "annotation-layers") Multiple layers of annotation
p
| When I tell a certain sort of person that I'm a computational linguist,
| this comic is often the first thing that comes to their mind:
+image("http://i.imgur.com/n3DTzqx.png", 450)
+image-caption &copy; #[+a("http://xkcd.com") xkcd]
p
| I've thought a lot about what this comic is really trying to say. It's
| probably not talking about #[em data models] — but in that sense at
| least, it really rings true.
p
| You'll often need to model a document as a sequence of sentences. Other
| times you'll need to model it as a sequence of words. Sometimes you'll
| care about paragraphs, other times you won't. Sometimes you'll care
| about extracting quotes, which can cross paragraph boundaries. A quote
| can also occur within a sentence. When we consider sentence structure,
| things get even more complicated and contradictory. We have syntactic
| trees, sequences of entities, sequences of phrases, sub-word units,
| multi-word units...
p
| Different applications are going to need to query different,
| overlapping, and often contradictory views of the document. They're
| often going to need to query them jointly. You need to be able to get
| the syntactic head of a named entity, or the sentiment of a paragraph.
+h(2, "solutions") Solutions
+h(3) Fat types, thin tokens
+h(3) Static model, dynamic views
p
| Different applications are going to need to query different,
| overlapping, and often contradictory views of the document. For this
| reason, I think it's a bad idea to have too much of the document
| structure reflected in the data model. If you structure the data
| according to the needs of one layer of annotation, you're going to need
| to copy the data and transform it in order to use a different layer of
| annotation. You'll soon have lots of copies, and no single source of
| truth.
+h(3) Never go full stand-off
+h(3) Implementation
+h(3) Cython 101
+h(3) #[code cdef class Doc]
p
| Let's start at the top. Here's the memory layout of the
| #[+api("doc") #[code Doc]] class, minus irrelevant details:
+code.
from cymem.cymem cimport Pool
from ..vocab cimport Vocab
from ..structs cimport TokenC
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* c
cdef int length
cdef int max_length
p
| So, our #[code Doc] class is a wrapper around a TokenC* array — that's
| where the actual document content is stored. Here's the #[code TokenC]
| struct, in its entirety:
+h(3) #[code cdef struct TokenC]
+code.
cdef struct TokenC:
const LexemeC* lex
uint64_t morph
univ_pos_t pos
bint spacy
int tag
int idx
int lemma
int sense
int head
int dep
bint sent_start
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id
p
| The token owns all of its linguistic annotations, and holds a const
| pointer to a #[code LexemeC] struct. The #[code LexemeC] struct owns all
| of the #[em vocabulary] data about the word — all the dictionary
| definition stuff that we want to be shared by all instances of the type.
| Here's the #[code LexemeC] struct, in its entirety:
+h(3) #[code cdef struct LexemeC]
+code.
cdef struct LexemeC:
int32_t id
int32_t orth # Allows the string to be retrieved
int32_t length # Length of the string
uint64_t flags # These are the most useful parts.
int32_t cluster # Distributional similarity cluster
float prob # Probability
float sentiment # Slot for sentiment
int32_t lang
int32_t lower # These string views made sense
int32_t norm # when NLP meant linear models.
int32_t shape # Now they're less relevant, and
int32_t prefix # will probably be revised.
int32_t suffix
float* vector # &lt;-- This was a design mistake, and will change.
+h(2, "dynamic-views") Dynamic views
+h(3) Text
p
| You might have noticed that in all of the structs above, there's not a
| string to be found. The strings are all stored separately, in the
| #[+api("stringstore") #[code StringStore]] class. The lexemes don't know
| the strings — they only know their integer IDs. The document string is
| never stored anywhere, either. Instead, it's reconstructed by iterating
| over the tokens, which look up the #[code orth] attribute of their
| underlying lexeme. Once we have the orth ID, we can fetch the string
| from the vocabulary. Finally, each token knows whether a single
| whitespace character (#[code ' ']) should be used to separate it from
| the subsequent tokens. This allows us to preserve whitespace.
+code.
cdef print_text(Vocab vocab, const TokenC* tokens, int length):
for i in range(length):
word_string = vocab.strings[tokens.lex.orth]
if tokens.lex.spacy:
word_string += ' '
print(word_string)
p
| This is why you get whitespace tokens in spaCy — we need those tokens,
| so that we can reconstruct the document string. I also think you should
| have those tokens anyway. Most NLP libraries strip them, making it very
| difficult to recover the paragraph information once you're at the token
| level. You'll never have that sort of problem with spaCy — because
| there's a single source of truth.
+h(3) #[code cdef class Token]
p When you do...
+code.
doc[i]
p
| ...you get back an instance of class #[code spacy.tokens.token.Token].
| This instance owns no data. Instead, it holds the information
| #[code (doc, i)], and uses these to retrieve all information via the
| parent container.
+h(3) #[code cdef class Span]
p When you do...
+code.
doc[i : j]
p
| ...you get back an instance of class #[code spacy.tokens.span.Span].
| #[code Span] instances are also returned by the #[code .sents],
| #[code .ents] and #[code .noun_chunks] iterators of the #[code Doc]
| object. A #[code Span] is a slice of tokens, with an optional label
| attached. Its data model is:
+code.
cdef class Span:
cdef readonly Doc doc
cdef int start
cdef int end
cdef int start_char
cdef int end_char
cdef int label
p
| Once again, the #[code Span] owns almost no data. Instead, it refers
| back to the parent #[code Doc] container.
p
| The #[code start] and #[code end] attributes refer to token positions,
| while #[code start_char] and #[code end_char] record the character
| positions of the span. By recording the character offsets, we can still
| use the #[code Span] object if the tokenization of the document changes.
+h(3) #[code cdef class Lexeme]
p When you do...
+code.
vocab[u'the']
p
| ...you get back an instance of class #[code spacy.lexeme.Lexeme]. The
| #[code Lexeme]'s data model is:
+code.
cdef class Lexeme:
cdef LexemeC* c
cdef readonly Vocab vocab

View File

@ -17,6 +17,8 @@ p
| #[+a("http://deeplearning.net/software/theano/") Theano] is also
| supported.
+under-construction
+code("Runtime usage").
def count_entity_sentiment(nlp, texts):
'''Compute the net document sentiment for each entity in the texts.'''
@ -153,7 +155,9 @@ p
| adding another LSTM layer, using attention mechanism, using character
| features, etc.
+h(2, "attribute-hooks") Attribute hooks (experimental)
+h(2, "attribute-hooks") Attribute hooks
+under-construction
p
| Earlier, we saw how to store data in the new generic #[code user_data]

View File

@ -6,57 +6,85 @@ p
| spaCy features a fast and accurate syntactic dependency parser, and has
| a rich API for navigating the tree. The parser also powers the sentence
| boundary detection, and lets you iterate over base noun phrases, or
| "chunks".
| "chunks". You can check whether a #[+api("doc") #[code Doc]] object has
| been parsed with the #[code doc.is_parsed] attribute, which returns a
| boolean value. If this attribute is #[code False], the default sentence
| iterator will raise an exception.
+aside-code("Example").
import spacy
+h(2, "noun-chunks") Noun chunks
+tag-model("dependency parse")
p
| Noun chunks are "base noun phrases" – flat phrases that have a noun as
| their head. You can think of noun chunks as a noun plus the words describing
| the noun – for example, "the lavish green grass" or "the world's largest
| tech fund". To get the noun chunks in a document, simply iterate over
| #[+api("doc#noun_chunks") #[code Doc.noun_chunks]].
+code("Example").
nlp = spacy.load('en')
doc = nlp(u'I like green eggs and ham.')
for np in doc.noun_chunks:
print(np.text, np.root.text, np.root.dep_, np.root.head.text)
# I I nsubj like
# green eggs eggs dobj like
# ham ham conj eggs
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
for chunk in doc.noun_chunks:
print(chunk.text, chunk.root.text, chunk.root.dep_,
chunk.root.head.text)
p
| You can check whether a #[+api("doc") #[code Doc]] object has been
| parsed with the #[code doc.is_parsed] attribute, which returns a boolean
| value. If this attribute is #[code False], the default sentence iterator
| will raise an exception.
+aside
| #[strong Text:] The original noun chunk text.#[br]
| #[strong Root text:] The original text of the word connecting the noun
| chunk to the rest of the parse.#[br]
| #[strong Root dep:] Dependency relation connecting the root to its head.#[br]
| #[strong Root head text:] The text of the root token's head.#[br]
+h(2, "displacy") The displaCy visualizer
p
| The best way to understand spaCy's dependency parser is interactively,
| through the #[+a(DEMOS_URL + "/displacy", true) displaCy visualizer]. If
| you want to know how to write rules that hook into some type of syntactic
| construction, just plug the sentence into the visualizer and see how
| spaCy annotates it.
+table(["Text", "root.text", "root.dep_", "root.head.text"])
- var style = [0, 0, 1, 0]
+annotation-row(["Autonomous cars", "cars", "nsubj", "shift"], style)
+annotation-row(["insurance liability", "liability", "dobj", "shift"], style)
+annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style)
+h(2, "navigating") Navigating the parse tree
p
| spaCy uses the terms #[em head] and #[em child] to describe the words
| connected by a single arc in the dependency tree. The term #[em dep] is
| used for the arc label, which describes the type of syntactic relation
| that connects the child to the head. As with other attributes, the value
| of #[code token.dep] is an integer. You can get the string value with
| #[code token.dep_].
| spaCy uses the terms #[strong head] and #[strong child] to describe the words
| #[strong connected by a single arc] in the dependency tree. The term
| #[strong dep] is used for the arc label, which describes the type of
| syntactic relation that connects the child to the head. As with other
| attributes, the value of #[code .dep] is an integer. You can get
| the string value with #[code .dep_].
+aside-code("Example").
from spacy.symbols import det
the, dog = nlp(u'the dog')
assert the.dep == det
assert the.dep_ == 'det'
+code("Example").
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
for token in doc:
print(token.text, token.dep_, token.head.text, token.head.pos_,
[child for child in token.children])
+aside
| #[strong Text]: The original token text.#[br]
| #[strong Dep]: The syntactic relation connecting child to head.#[br]
| #[strong Head text]: The original text of the token head.#[br]
| #[strong Head POS]: The part-of-speech tag of the token head.#[br]
| #[strong Children]: The immediate syntactic dependents of the token.
+table(["Text", "Dep", "Head text", "Head POS", "Children"])
- var style = [0, 1, 0, 1, 0]
+annotation-row(["Autonomous", "amod", "cars", "NOUN", ""], style)
+annotation-row(["cars", "nsubj", "shift", "VERB", "Autonomous"], style)
+annotation-row(["shift", "ROOT", "shift", "VERB", "cars, liability"], style)
+annotation-row(["insurance", "compound", "liability", "NOUN", ""], style)
+annotation-row(["liability", "dobj", "shift", "VERB", "insurance, toward"], style)
+annotation-row(["toward", "prep", "liability", "NOUN", "manufacturers"], style)
+annotation-row(["manufacturers", "pobj", "toward", "ADP", ""], style)
+codepen("dcf8d293367ca185b935ed2ca11ebedd", 370)
p
| Because the syntactic relations form a tree, every word has exactly one
| head. You can therefore iterate over the arcs in the tree by iterating
| over the words in the sentence. This is usually the best way to match an
| arc of interest — from below:
| Because the syntactic relations form a tree, every word has
| #[strong exactly one head]. You can therefore iterate over the arcs in
| the tree by iterating over the words in the sentence. This is usually
| the best way to match an arc of interest — from below:
+code.
from spacy.symbols import nsubj, VERB
# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
@ -82,6 +110,8 @@ p
| attribute, which provides a sequence of #[+api("token") #[code Token]]
| objects.
+h(3, "navigating-around") Iterating around the local tree
p
| A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights]
@ -90,75 +120,118 @@ p
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
| that give the number of left and right children.
+aside-code("Examples").
apples = nlp(u'bright red apples on the tree')[2]
print([w.text for w in apples.lefts])
# ['bright', 'red']
print([w.text for w in apples.rights])
# ['on']
assert apples.n_lefts == 2
assert apples.n_rights == 1
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.')
root = [w for w in doc if w.head is w][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
assert subject.is_ancestor_of(descendant)
from spacy.symbols import nsubj
doc = nlp(u'Credit and mortgage account holders must submit their requests.')
holders = doc[4]
span = doc[holders.left_edge.i : holders.right_edge.i + 1]
span.merge()
for word in doc:
print(word.text, word.pos_, word.dep_, word.head.text)
# Credit and mortgage account holders nsubj NOUN submit
# must VERB aux submit
# submit VERB ROOT submit
# their DET det requests
# requests NOUN dobj submit
+code.
doc = nlp(u'bright red apples on the tree')
assert [token.text for token in doc[2].lefts] == [u'bright', u'red']
assert [token.text for token in doc[2].rights] == [u'on']
assert doc[2].n_lefts == 2
assert doc[2].n_rights == 1
p
| You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
| For the default English model, the parse tree is #[em projective], which
| means that there are no crossing brackets. The tokens returned by
| #[code .subtree] are therefore guaranteed to be contiguous. This is not
| true for the German model, which has many
| #[+a("https://explosion.ai/blog/german-model#word-order", true) non-projective dependencies].
| You can walk up the tree with the #[code .ancestors] attribute, and
| check dominance with the #[code .is_ancestor()] method.
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
| method.
+aside("Projective vs. non-projective")
| For the #[+a("/docs/usage/models#available") default English model], the
| parse tree is #[strong projective], which means that there are no crossing
| brackets. The tokens returned by #[code .subtree] are therefore guaranteed
| to be contiguous. This is not true for the German model, which has many
| #[+a(COMPANY_URL + "/blog/german-model#word-order", true) non-projective dependencies].
+code.
doc = nlp(u'Credit and mortgage account holders must submit their requests')
root = [token for token in doc if token.head is token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
assert subject.is_ancestor(descendant)
print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights,
[ancestor.text for ancestor in descendant.ancestors])
+table(["Text", "Dep", "n_lefts", "n_rights", "ancestors"])
- var style = [0, 1, 1, 1, 0]
+annotation-row(["Credit", "nmod", 0, 2, "holders, submit"], style)
+annotation-row(["and", "cc", 0, 0, "Credit, holders, submit"], style)
+annotation-row(["mortgage", "compound", 0, 0, "account, Credit, holders, submit"], style)
+annotation-row(["account", "conj", 1, 0, "Credit, holders, submit"], style)
+annotation-row(["holders", "nsubj", 1, 0, "submit"], style)
p
| Finally, I often find the #[code .left_edge] and #[code right_edge]
| attributes especially useful. They give you the first and last token
| Finally, the #[code .left_edge] and #[code .right_edge] attributes
| can be especially useful, because they give you the first and last token
| of the subtree. This is the easiest way to create a #[code Span] object
| for a syntactic phrase — a useful operation.
| for a syntactic phrase. Note that #[code .right_edge] gives a token
| #[strong within] the subtree — so if you use it as the end-point of a
| range, don't forget to #[code +1]!
+code.
doc = nlp(u'Credit and mortgage account holders must submit their requests')
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
span.merge()
for token in doc:
print(token.text, token.pos_, token.dep_, token.head.text)
+table(["Text", "POS", "Dep", "Head text"])
- var style = [0, 1, 1, 0]
+annotation-row(["Credit and mortgage account holders", "NOUN", "nsubj", "submit"], style)
+annotation-row(["must", "VERB", "aux", "submit"], style)
+annotation-row(["submit", "VERB", "ROOT", "submit"], style)
+annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style)
+h(2, "displacy") Visualizing dependencies
p
| Note that #[code .right_edge] gives a token #[em within] the subtree —
| so if you use it as the end-point of a range, don't forget to #[code +1]!
| The best way to understand spaCy's dependency parser is interactively.
| To make this easier, spaCy v2.0+ comes with a visualization module. Simply
| pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup. If you want to know how to write rules that
| hook into some type of syntactic construction, just plug the sentence into
| the visualizer and see how spaCy annotates it.
+code.
from spacy import displacy
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
displacy.serve(doc, style='dep')
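p
| If you'd rather generate the markup yourself – for example, to save it to
| a file – a short sketch using #[+api("displacy#render") #[code displacy.render]]
| (the file name is arbitrary):
+code.
import io
from spacy import displacy
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
html = displacy.render(doc, style='dep', page=True)   # raw markup as a string
with io.open('parse.html', 'w', encoding='utf-8') as file_:
    file_.write(html)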
+infobox
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You
| can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo].
+h(2, "disabling") Disabling the parser
p
| The parser is loaded and enabled by default. If you don't need any of
| the syntactic information, you should disable the parser. Disabling the
| parser will make spaCy load and run much faster. Here's how to prevent
| the parser from being loaded:
| In the #[+a("/docs/usage/models#available") default models], the parser
| is loaded and enabled as part of the
| #[+a("/docs/usage/language-processing-pipeline") standard processing pipeline].
| If you don't need any of the syntactic information, you should disable
| the parser. Disabling the parser will make spaCy load and run much faster.
| If you want to load the parser, but need to disable it for specific
| documents, you can also control its use on the #[code nlp] object.
+code.
import spacy
nlp = spacy.load('en', disable=['parser'])
nlp = English().from_disk('/model', disable=['parser'])
doc = nlp(u"I don't want parsed", disable=['parser'])
nlp = spacy.load('en', parser=False)
p
| If you need to load the parser, but need to disable it for specific
| documents, you can control its use with the #[code parse] keyword
| argument:
+code.
nlp = spacy.load('en')
doc1 = nlp(u'Text I do want parsed.')
doc2 = nlp(u"Text I don't want parsed", parse=False)
+infobox("Important note: disabling pipeline components")
.o-block
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser] keyword argument
| has been replaced with #[code disable], which takes a list of
| #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['parser'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', parser=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -9,14 +9,12 @@ p
| locations, organizations and products. You can add arbitrary classes to
| the entity recognition system, and update the model with new examples.
+aside-code("Example").
import spacy
nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')
for ent in doc.ents:
print(ent.label_, ent.text)
# GPE London
# GPE United Kingdom
+h(2, "101") Named Entity Recognition 101
+tag-model("named entities")
include _spacy-101/_named-entities
+h(2, "accessing") Accessing entity annotations
p
| The standard way to access entity annotations is the
@ -26,56 +24,89 @@ p
| #[code ent.label] and #[code ent.label_]. The #[code Span] object acts
| as a sequence of tokens, so you can iterate over the entity or index into
| it. You can also get the text form of the whole entity, as though it were
| a single token. See the #[+api("span") API reference] for more details.
| a single token.
p
| You can access token entity annotations using the #[code token.ent_iob]
| and #[code token.ent_type] attributes. The #[code token.ent_iob]
| attribute indicates whether an entity starts, continues or ends on the
| tag (In, Begin, Out).
| You can also access token entity annotations using the
| #[+api("token#attributes") #[code token.ent_iob]] and
| #[+api("token#attributes") #[code token.ent_type]] attributes.
| #[code token.ent_iob] indicates whether an entity starts, continues or
| ends on the tag. If no entity type is set on a token, it will return an
| empty string.
+aside("IOB Scheme")
| #[code I] Token is inside an entity.#[br]
| #[code O] Token is outside an entity.#[br]
| #[code B] Token is the beginning of an entity.#[br]
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
# (u'London', 2, u'GPE')
print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
# (u'is', 3, u'')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
assert ent_san == [u'San', u'B', u'GPE']
assert ent_francisco == [u'Francisco', u'I', u'GPE']
+table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
- var style = [0, 1, 1, 1, 1, 0]
+annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
+annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)
+annotation-row(["considers", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["banning", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["sidewalk", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["delivery", 2, "O", 0, '""', "outside an entity"], style)
+annotation-row(["robots", 2, "O", 0, '""', "outside an entity"], style)
+h(2, "setting") Setting entity annotations
p
| To ensure that the sequence of token annotations remains consistent, you
| have to set entity annotations at the document level — you can't write
| directly to the #[code token.ent_iob] or #[code token.ent_type]
| attributes. The easiest way to set entities is to assign to the
| #[code doc.ents] attribute.
| have to set entity annotations #[strong at the document level]. However,
| you can't write directly to the #[code token.ent_iob] or
| #[code token.ent_type] attributes, so the easiest way to set entities is
| to assign to the #[+api("doc#ents") #[code doc.ents]] attribute
| and create the new entity as a #[+api("span") #[code Span]].
+code("Example").
doc = nlp(u'London is a big city in the United Kingdom.')
doc.ents = []
assert doc[0].ent_type_ == ''
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])]
assert doc[0].ent_type_ == 'GPE'
doc.ents = []
doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)]
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
# the model didn't recognise any entities :(
ORG = doc.vocab.strings[u'ORG'] # get integer ID of entity label
netflix_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
doc.ents = [netflix_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'Netflix', 0, 7, u'ORG')]
p
| The value you assign should be a sequence, the values of which
| can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)]
| tuples, where #[code start] and #[code end] are token offsets that
| describe the slice of the document that should be annotated.
| Keep in mind that you need to create a #[code Span] with the start and
| end index of the #[strong token], not the start and end index of the
| entity in the document. In this case, "Netflix" is token #[code (0, 1)]
| but at the document level, the entity will have the start and end
| indices #[code (0, 7)].
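p
| If you're starting from character offsets instead, a hypothetical helper
| like the one below can map them to a token-based #[code Span]. It's only
| a sketch and assumes the offsets line up exactly with token boundaries:
+code.
from spacy.tokens import Span

def span_from_char_offsets(doc, start_char, end_char, label):
    # find the tokens whose character offsets match the entity boundaries
    start = end = None
    for token in doc:
        if token.idx == start_char:
            start = token.i
        if token.idx + len(token.text) == end_char:
            end = token.i + 1
    if start is None or end is None:
        return None   # offsets don't align with token boundaries
    return Span(doc, start, end, label=label)

doc = nlp(u'Netflix is hiring a new VP of global policy')
netflix_ent = span_from_char_offsets(doc, 0, 7, doc.vocab.strings[u'ORG'])
doc.ents = [netflix_ent]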
+h(3, "setting-from-array") Setting entity annotations from array
p
| You can also assign entity annotations using the #[code doc.from_array()]
| method. To do this, you should include both the #[code ENT_TYPE] and the
| #[code ENT_IOB] attributes in the array you're importing from.
| You can also assign entity annotations using the
| #[+api("doc#from_array") #[code doc.from_array()]] method. To do this,
| you should include both the #[code ENT_TYPE] and the #[code ENT_IOB]
| attributes in the array you're importing from.
+code("Example").
from spacy.attrs import ENT_IOB, ENT_TYPE
+code.
import numpy
from spacy.attrs import ENT_IOB, ENT_TYPE
doc = nlp.make_doc(u'London is a big city in the United Kingdom.')
assert list(doc.ents) == []
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 2 # B
@ -83,12 +114,14 @@ p
doc.from_array(header, attr_array)
assert list(doc.ents)[0].text == u'London'
+h(3, "setting-cython") Setting entity annotations in Cython
p
| Finally, you can always write to the underlying struct, if you compile
| a Cython function. This is easy to do, and allows you to write efficient
| native code.
| a #[+a("http://cython.org/") Cython] function. This is easy to do, and
| allows you to write efficient native code.
+code("Example").
+code.
# cython: infer_types=True
from spacy.tokens.doc cimport Doc
@ -104,67 +137,30 @@ p
| you'll have responsibility for ensuring that the data is left in a
| consistent state.
+h(2, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization
| yourself. To help you do that, spaCy v2.0+ comes with a visualization
| module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup.
p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)
+h(2, "entity-types") Built-in entity types
include ../api/_annotation/_named-entities
+aside("Tip: Understanding entity types")
| You can also use #[code spacy.explain()] to get the description for the
| string representation of an entity label. For example,
| #[code spacy.explain("LANGUAGE")] will return "any named language".
+aside("Install")
| The #[+api("load") #[code spacy.load()]] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with
| #[code python -m spacy download en], this will include the entity
| recognition model.
include ../api/_annotation/_named-entities
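p
| As a quick sanity check of the label descriptions, you can compare the
| output of #[code spacy.explain()] against the glossary entries, for
| example:

+code.
import spacy
assert spacy.explain(u'LANGUAGE') == u'any named language'
print(spacy.explain(u'GPE'))   # prints the description of the GPE label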
+h(2, "updating") Training and updating
p
| To provide training examples to the entity recogniser, you'll first need
| to create an instance of the #[code GoldParse] class. You can specify
| your annotations in a stand-off format or as token tags.
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
| You can specify your annotations in a stand-off format or as token tags.
+code.
import spacy
import random
import spacy
from spacy.gold import GoldParse
from spacy.language import EntityRecognizer
from spacy.pipeline import EntityRecognizer
train_data = [
('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
@ -237,3 +233,34 @@ p
| loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle]
| imitation learning strategy. The transition system is equivalent to the
| BILOU tagging scheme.
+h(2, "displacy") Visualizing named entities
p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization
| yourself. To help you do that, spaCy v2.0+ comes with a visualization
| module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
| displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup.
p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)

View File

@ -175,6 +175,136 @@ p
+cell Python 3.5+
+cell Visual Studio 2015
+h(2, "troubleshooting") Troubleshooting guide
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy. Check the
| #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.lang.fr import French].
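p
| For example, to use only the French tokenizer, you can instantiate the
| language class directly. This is a minimal sketch and doesn't require a
| statistical model:

+code.
from spacy.lang.fr import French
nlp = French()
doc = nlp(u"C'est une phrase.")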
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory. Run the
| #[code download] or #[code link] command as administrator, or use a
| #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to
| the latest version of pip. To see which version you have installed,
| run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
Import Error: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment. Make sure you have spaCy installed. If you're using a
| #[code virtualenv], make sure it's activated and check that spaCy is
| installed in that environment; otherwise, you're trying to load a system
| installation. You can also run #[code which python] to find out where
| your Python executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/docs/usage/models#available") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
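p
| For example, if the model package is installed in your environment, you
| can import and load it directly. The package name below is just an
| example:

+code.
import en_core_web_sm
nlp = en_core_web_sm.load()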
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model either because you haven't set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist. Set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.lang.bn import Bengali].
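p
| Alternatively, if you have the model data in a local directory, you can
| pass that path to #[+api("spacy#load") #[code spacy.load()]] directly.
| The path below is just an example:

+code.
import spacy
nlp = spacy.load('/path/to/en_core_web_sm')   # load from a local directory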
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Run the command with #[code python -m],
| for example #[code python -m spacy download en]. For more info on this,
| see #[+api("cli#download") download].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy]. So, when
| using spaCy, never call anything else #[code spacy].
+h(2, "tests") Run tests
p

View File

@ -2,127 +2,352 @@
include ../../_includes/_mixins
p
| The standard entry point into spaCy is the #[code spacy.load()]
| function, which constructs a language processing pipeline. The standard
| variable name for the language processing pipeline is #[code nlp], for
| Natural Language Processing. The #[code nlp] variable is usually an
| instance of class #[code spacy.language.Language]. For English, the
| #[code spacy.en.English] class is the default.
+h(2, "101") Pipelines 101
include _spacy-101/_pipelines
+h(2, "pipelines") How pipelines work
p
| You'll use the nlp instance to produce #[+api("doc") #[code Doc]]
| objects. You'll then use the #[code Doc] object to access linguistic
| annotations to help you with whatever text processing task you're
| trying to do.
| spaCy makes it very easy to create your own pipelines consisting of
| reusable components. This includes spaCy's default vectorizer, tagger,
| parser and entity recognizer, but also your own custom processing
| functions. A pipeline component can be added to an already existing
| #[code nlp] object, specified when initialising a #[code Language] class,
| or defined within a
| #[+a("/docs/usage/saving-loading#models-generating") model package].
p
| When you load a model, spaCy first consults the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
+aside-code("meta.json (excerpt)", "json").
{
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
"pipeline": ["token_vectors", "tagger"]
}
+list("numbers")
+item
| Look up #[strong pipeline IDs] in the available
| #[strong pipeline factories].
+item
| Initialise the #[strong pipeline components] by calling their
| factories with the #[code Vocab] as an argument. This gives each
| factory and component access to the pipeline's shared data, like
| strings, morphology and annotation scheme.
+item
| Load the #[strong language class and data] for the given ID via
| #[+api("util.get_lang_class") #[code get_lang_class]].
+item
| Pass the path to the #[strong model data] to the #[code Language]
| class and return it.
p
| So when you call this...
+code.
import spacy # See "Installing spaCy"
nlp = spacy.load('en') # You are here.
doc = nlp(u'Hello, spacy!') # See "Using the pipeline"
print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token"
+aside("Why do we have to preload?")
| Loading the models takes ~200x longer than
| processing a document. We therefore want to amortize the start-up cost
| across multiple invocations. It's often best to wrap the pipeline as a
| singleton. The library avoids doing that for you, because it's a
| difficult design to back out of.
p The #[code load] function takes the following positional arguments:
+table([ "Name", "Description" ])
+row
+cell #[code lang_id]
+cell
| An ID that is resolved to a class or factory function by
| #[code spacy.util.get_lang_class()]. Common values are
| #[code 'en'] for the English pipeline, or #[code 'de'] for the
| German pipeline. You can register your own factory function or
| class with #[code spacy.util.set_lang_class()].
nlp = spacy.load('en')
p
| All keyword arguments are passed forward to the pipeline factory. No
| keyword arguments are required. The built-in factories (e.g.
| #[code spacy.en.English], #[code spacy.de.German]), which are subclasses
| of #[+api("language") #[code Language]], respond to the following
| keyword arguments:
| ... the model tells spaCy to use the pipeline
| #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look
| up each string in its internal factories registry and initialise the
| individual components. It'll then load #[code spacy.lang.en.English],
| pass it the path to the model's data directory, and return it for you
| to use as the #[code nlp] object.
+table([ "Name", "Description"])
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
| then #[strong call each component] on the #[code Doc], in order.
| Components all return the modified document, which is then processed by
| the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
doc = proc(doc)
+h(2, "creating") Creating pipeline components and factories
p
| spaCy lets you customise the pipeline with your own components. Components
| are functions that receive a #[code Doc] object, modify and return it.
| If your component is stateful, you'll want to create a new one for each
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
+h(3, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
| #[strong performs the actual processing], for example using the current
| weights to make a prediction and set some annotation on the document. By
| adding a component to the pipeline, you'll get access to the #[code Doc]
| at any point #[strong during] processing, instead of only being able to
| modify it afterwards.
+aside-code("Example").
def my_component(doc):
# do something to the doc here
return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code path]
+cell
| Where to load the data from. If None, the default data path is
| fetched via #[code spacy.util.get_data_path()]. You can
| configure this default using #[code spacy.util.set_data_path()].
| The data path is expected to be either a string, or an object
| responding to the #[code pathlib.Path] interface. If the path is
| a string, it will be immediately transformed into a
| #[code pathlib.Path] object. spaCy promises to never manipulate
| or open file-system paths as strings. All access to the
| file-system is done via the #[code pathlib.Path] interface.
| spaCy also promises to never check the type of path objects.
| This allows you to customize the loading behaviours in arbitrary
| ways, by creating your own object that implements the
| #[code pathlib.Path] interface.
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row
+cell #[code pipeline]
+cell
| A sequence of functions that take the Doc object and modify it
| in-place. See
| #[+a("customizing-pipeline") Customizing the pipeline].
+footrow
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
+row
+cell #[code create_pipeline]
+cell
| Callback to construct the pipeline sequence. It should accept
| the #[code nlp] instance as its only argument, and return a
| sequence of functions that take the #[code Doc] object and
| modify it in-place.
| See #[+a("customizing-pipeline") Customizing the pipeline]. If
| a value is supplied to the pipeline keyword argument, the
| #[code create_pipeline] keyword argument is ignored.
p
| When creating a new #[code Language] class, you can pass it a list of
| pipeline component functions to execute in that order. You can also
| add it to an existing pipeline by modifying #[code nlp.pipeline]. Just
| be careful not to overwrite a pipeline or its components by accident!
+row
+cell #[code make_doc]
+cell A function that takes the input and returns a document object.
+code.
# Create a new Language object with a pipeline
from spacy.language import Language
nlp = Language(pipeline=[my_component])
+row
+cell #[code create_make_doc]
+cell
| Callback to construct the #[code make_doc] function. It should
| accept the #[code nlp] instance as its only argument. To use the
| built-in annotation processes, it should return an object of
| type #[code Doc]. If a value is supplied to the #[code make_doc]
| keyword argument, the #[code create_make_doc] keyword argument
| is ignored.
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
+h(3, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
| It's called with the #[code Vocab] object, to give it access to the
| shared data between components, for example the strings, morphology,
| vectors or annotation scheme. Factories are useful for creating
| #[strong stateful components], especially ones which
| #[strong depend on shared data].
+aside-code("Example").
def my_factory(vocab):
# load some state
def my_component(doc):
# process the doc
return doc
return my_component
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell Supply a pre-built Vocab instance, instead of constructing one.
+row
+cell #[code add_vectors]
+cell #[code Vocab]
+cell
| Callback that installs word vectors into the Vocab instance. The
| #[code add_vectors] callback should take a
| #[+api("vocab") #[code Vocab]] instance as its only argument,
| and set the word vectors and #[code vectors_length] in-place. See
| #[+a("word-vectors-similarities") Word Vectors and Similarities].
| Shared data between components, including strings, morphology,
| vectors etc.
+row
+cell #[code tagger]
+cell Supply a pre-built tagger, instead of creating one.
+footrow
+cell returns
+cell callable
+cell The pipeline component.
+row
+cell #[code parser]
+cell Supply a pre-built parser, instead of creating one.
p
| By creating a factory, you're essentially telling spaCy how to get the
| pipeline component #[strong once the vocab is available]. Factories need to
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
| by assigning them a unique ID. This ID can be added to the pipeline as a
| string. When creating a pipeline, you're free to mix strings and
| callable components:
+row
+cell #[code entity]
+cell Supply a pre-built entity recognizer, instead of creating one.
+code.
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_other_component])
+row
+cell #[code matcher]
+cell Supply a pre-built matcher, instead of creating one.
p
| If spaCy comes across a string in the pipeline, it will try to resolve it
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
| #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either. To use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
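p
| For example, a pipeline that mixes a built-in factory with a custom one
| could look like this (a minimal sketch, assuming #[code my_factory] has
| been registered as shown above):

+code.
from spacy.language import Language
nlp = Language(pipeline=['tagger', 'my_factory'])  # built-in factory + custom factory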
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
| #[+a("/docs/usage/saving-loading#models-generating") model package] with
| a custom pipeline.
+h(2, "example1") Example: Custom sentence segmentation logic
+aside("Real-world examples")
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or
| #[+src(gh("spacy")) entity recognizer].
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing. This way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
for i, token in enumerate(doc[:-2]):
# define sentence start if period + titlecase token
if token.text == '.' and doc[i+1].is_title:
doc[i+1].sent_start = True
return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
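p
| A quick way to check the resulting sentence boundaries (the exact output
| will depend on the model you're using):

+code.
doc = nlp(u'This is a sentence. This is another sentence.')
print([sent.text for sent in doc.sents])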
+h(2, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default vectorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path
class SentimentComponent(object):
def __init__(self, vocab):
self.weights = None
def __call__(self, doc):
doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
return doc
def from_disk(self, path): # path = model path + factory ID ('sentiment')
self.weights = pickle.load((Path(path) / 'weights.bin').open('rb')) # load weights
return self
def update(self, doc, gold): # updating the weights allows training
prediction = sum(self.weights*doc.vector)
self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
component = SentimentComponent(vocab) # initialise component
return component
spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]
| with a #[code load()] method, that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
p
| In the model package's meta.json, specify the language class and pipeline
| IDs:
+code("meta.json (excerpt)", "json").
{
"name": "sentiment_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"pipeline": ["vectorizer", "sentiment"]
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default vectorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].
+h(2, "disabling") Disabling pipeline components
p
| If you don't need a particular component of the pipeline, for
| example, the tagger or the parser, you can disable loading it. This can
| sometimes make a big difference and improve loading speed. Disabled
| component names can be provided to #[+api("spacy#load") #[code spacy.load]],
| #[+api("language#from_disk") #[code Language.from_disk]] or the
| #[code nlp] object itself as a list:
+code.
nlp = spacy.load('en', disable=['parser', 'tagger'])
nlp = English().from_disk('/model', disable=['vectorizer', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
p
| Note that you can't write directly to #[code nlp.pipeline], as this list
| holds the #[em actual components], not the IDs. However, if you know the
| order of the components, you can still slice the list:
+code.
nlp = spacy.load('en')
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
+infobox("Important note: disabling pipeline components")
.o-block
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser], #[code tagger]
| and #[code entity] keyword arguments have been replaced with
| #[code disable], which takes a list of pipeline component names.
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -6,23 +6,164 @@ p
| The following examples and code snippets give you an overview of spaCy's
| functionality and its usage.
+h(2, "models") Install and load models
+h(2, "models") Install models and process text
+code(false, "bash").
python -m spacy download en
python -m spacy download de
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Hello, world. Here are two sentences.')
+h(2, "examples-resources") Load resources and process text
nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage/spacy-101") spaCy 101]
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19].text == u'outranking eggplant'
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
sentences = list(doc.sents)
assert len(sentences) == 3
assert sentences[1].text == u'Peach is the superior emoji.'
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
+code.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 94]
assert [apple.tag_, apple.tag] == [u'NNP', 475]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
assert apple.is_alpha == True
assert apple.is_punct == False
billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
doc = nlp(u'Hello, world')
token = doc[0]
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
assert token.orth == hello_id == 3125
assert token.orth_ == hello_str == 'Hello'
+h(2, "examples-entities") Recongnise and update named entities
+tag-model("NER")
+code.
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(0, 7, u'ORG')]
+infobox
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
+code.
from spacy import displacy
doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
u'in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
+h(2, "examples-word-vectors") Get word vectors and similarity
+tag-model("word vectors")
+code.
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector
+infobox
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
+h(2, "examples-serialization") Simple and efficient serialization
+code.
import spacy
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')
en_doc = en_nlp(u'Hello, world. Here are two sentences.')
de_doc = de_nlp(u'ich bin ein Berliner.')
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r')
doc = nlp(moby_dick.read())
doc.to_disk('/moby_dick.bin')
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
+infobox
| #[strong API:] #[+api("language") #[code Language]],
| #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(2, "rule-matcher") Match text with token rules
+code.
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(2, "multi-threaded") Multi-threaded generator
@ -35,37 +176,25 @@ p
if i == 100:
break
+h(2, "examples-tokens-sentences") Get tokens and sentences
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
+h(2, "examples-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
+code.
token = doc[0]
sentence = next(doc.sents)
assert token is sentence[0]
assert sentence.text == 'Hello, world.'
def dependency_labels_to_root(token):
"""Walk up the syntactic tree, collecting the arc labels."""
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
assert token.orth == hello_id == 3125
assert token.orth_ == hello_str == 'Hello'
+h(2, "examples-string-views-flags") Get and set string views and flags
+code.
assert token.shape_ == 'Xxxxx'
for lexeme in nlp.vocab:
if lexeme.is_alpha:
lexeme.shape_ = 'W'
elif lexeme.is_digit:
lexeme.shape_ = 'D'
elif lexeme.is_punct:
lexeme.shape_ = 'P'
else:
lexeme.shape_ = 'M'
assert token.shape_ == 'W'
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
+h(2, "examples-numpy-arrays") Export to numpy arrays
@ -80,107 +209,25 @@ p
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+h(2, "examples-word-vectors") Word vectors
+code.
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
assert apples.similarity(oranges) > boots.similarity(hippos)
+h(2, "examples-pos-tags") Part-of-speech tags
+code.
from spacy.parts_of_speech import ADV
def is_adverb(token):
return token.pos == spacy.parts_of_speech.ADV
# These are data-specific, so no constants are provided. You have to look
# up the IDs from the StringStore.
NNS = nlp.vocab.strings['NNS']
NNPS = nlp.vocab.strings['NNPS']
def is_plural_noun(token):
return token.tag == NNS or token.tag == NNPS
def print_coarse_pos(token):
print(token.pos_)
def print_fine_pos(token):
print(token.tag_)
+h(2, "examples-dependencies") Syntactic dependencies
+code.
def dependency_labels_to_root(token):
'''Walk up the syntactic tree, collecting the arc labels.'''
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
+h(2, "examples-entities") Named entities
+code.
def iter_products(docs):
for doc in docs:
for ent in doc.ents:
if ent.label_ == 'PRODUCT':
yield ent
def word_is_in_entity(word):
return word.ent_type != 0
def count_parent_verb_by_person(docs):
counts = defaultdict(lambda: defaultdict(int))
for doc in docs:
for ent in doc.ents:
if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
counts[ent.orth_][ent.root.head.lemma_] += 1
return counts
+h(2, "examples-inline") Calculate inline mark-up on original string
+h(2, "examples-inline") Calculate inline markup on original string
+code.
def put_spans_around_tokens(doc, get_classes):
'''Given some function to compute class names, put each token in a
span element, with the appropriate classes computed.
All whitespace is preserved, outside of the spans. (Yes, I know HTML
won't display it. But the point is no information is lost, so you can
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
'''
"""Given some function to compute class names, put each token in a
span element, with the appropriate classes computed. All whitespace is
preserved, outside of the spans. (Of course, HTML won't display more than
one whitespace character, but the point is that no information is lost
and you can calculate what you need, e.g. &lt;br /&gt;, &lt;p&gt; etc.)
"""
output = []
template = '<span classes="{classes}">{word}</span>{space}'
html = '&lt;span class="{classes}"&gt;{word}&lt;/span&gt;{space}'
for token in doc:
if token.is_space:
output.append(token.orth_)
output.append(token.text)
else:
output.append(
template.format(
classes=' '.join(get_classes(token)),
word=token.orth_,
space=token.whitespace_))
classes = ' '.join(get_classes(token))
output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
string = ''.join(output)
string = string.replace('\n', '')
string = string.replace('\t', ' ')
return string
+h(2, "examples-binary") Efficient binary serialization
+code.
import spacy
from spacy.tokens.doc import Doc
byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)
nlp = spacy.load('en')
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
doc = Doc(nlp.vocab)
doc.from_bytes(byte_string)

View File

@ -195,7 +195,7 @@ p
| privileges, the #[code spacy link] command may fail. The easiest solution
| is to re-run the command as admin, or use a #[code virtualenv]. For more
| info on this, see the
| #[+a("/docs/usage/troubleshooting#symlink-privilege") troubleshooting guide].
| #[+a("/docs/usage/#symlink-privilege") troubleshooting guide].
+h(3, "usage-import") Importing models as modules
@ -233,4 +233,4 @@ p
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading") saving and loading models].
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -7,22 +7,12 @@ p
| assigned to each token in the document. They're useful in rule-based
| processes. They can also be useful features in some statistical models.
p
| To use spaCy's tagger, you need to have a data pack installed that
| includes a tagging model. Tagging models are included in the data
| downloads for English and German. After you load the model, the tagger
| is applied automatically, as part of the default pipeline. You can then
| access the tags using the #[+api("token") #[code Token.tag]] and
| #[+api("token") #[code token.pos]] attributes. For English, the tagger
| also triggers some simple rule-based morphological processing, which
| gives you the lemma as well.
+h(2, "101") Part-of-speech tagging 101
+tag-model("tagger", "dependency parse")
+code("Usage").
import spacy
nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')
for word in doc:
print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_)
include _spacy-101/_pos-deps
+aside("Help spaCy's output is wrong!")
+h(2, "rule-based-morphology") Rule-based morphology
@ -63,7 +53,8 @@ p
+list("numbers")
+item
| The tokenizer consults a #[strong mapping table]
| The tokenizer consults a
| #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table]
| #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters
| to be mapped to multiple tokens. Each token may be assigned a part
| of speech and one or more morphological features.
@ -77,8 +68,9 @@ p
+item
| For words whose POS is not set by a prior process, a
| #[strong mapping table] #[code TAG_MAP] maps the tags to a
| part-of-speech and a set of morphological features.
| #[+a("/docs/usage/adding-languages#tag-map") mapping table]
| #[code TAG_MAP] maps the tags to a part-of-speech and a set of
| morphological features.
+item
| Finally, a #[strong rule-based deterministic lemmatizer] maps the

View File

@ -1,134 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING TEXT
include ../../_includes/_mixins
p
| Once you have loaded the #[code nlp] object, you can call it as though
| it were a function. This allows you to process a single unicode string.
+code.
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...')
p
| The library should perform equally well with short or long documents.
| All algorithms are linear-time in the length of the string, and once the
| data is loaded, there's no significant start-up cost to consider. This
| means that you don't have to strategically merge or split your text —
| you should feel free to feed in either single tweets or whole novels.
p
| If you run #[code nlp = spacy.load('en')], the #[code nlp] object will
| be an instance of #[code spacy.en.English]. This means that when you run
| #[code doc = nlp(text)], you're executing
| #[code spacy.en.English.__call__], which is implemented on its parent
| class, #[+api("language") #[code Language]].
+code.
doc = nlp.make_doc(text)
for proc in nlp.pipeline:
proc(doc)
p
| I've tried to make sure that the #[code Language.__call__] function
| doesn't do any "heavy lifting", so that you won't have complicated logic
| to replicate if you need to make your own pipeline class. This is all it
| does.
p
| The #[code .make_doc()] method and #[code .pipeline] attribute make it
| easier to customise spaCy's behaviour. If you're using the default
| pipeline, we can desugar one more time.
+code.
doc = nlp.tokenizer(text)
nlp.tagger(doc)
nlp.parser(doc)
nlp.entity(doc)
p Finally, here's where you can find out about each of those components:
+table(["Name", "Source"])
+row
+cell #[code tokenizer]
+cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer]
+row
+cell #[code tagger]
+cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger]
+row
+cell #[code parser]
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser]
+row
+cell #[code entity]
+cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer]
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()]
| method takes an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that the #[code .pipe()] method will be significantly faster in most
| practical situations, because it allows shared memory parallelism.
+code.
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
pass
p
| To make full use of the #[code .pipe()] function, you might want to
| brush up on Python generators. Here are a few quick hints:
+list
+item
| Generator comprehensions can be written
| (#[code item for item in sequence])
+item
| The #[code itertools] built-in library and the #[code cytoolz]
| package provide a lot of handy generator tools
+item
| Often you'll have an input stream that pairs text with some
| important metadata, e.g. a JSON document. To pair up the metadata
| with the processed #[code Doc] object, you should use the tee
| function to split the generator in two, and then #[code izip] the
| extra stream to the document stream.
+h(2, "own-annotations") Bringing your own annotations
p
| spaCy generally assumes by default that your data is raw text. However,
| sometimes your data is partially annotated, e.g. with pre-existing
| tokenization, part-of-speech tags, etc. The most common situation is
| that you have pre-defined tokenization. If you have a list of strings,
| you can create a #[code Doc] object directly. Optionally, you can also
| specify a list of boolean values, indicating whether each word has a
| subsequent space.
+code.
doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
p
| If provided, the spaces list must be the same length as the words list.
| The spaces list affects the #[code doc.text], #[code span.text],
| #[code token.idx], #[code span.start_char] and #[code span.end_char]
| attributes. If you don't provide a #[code spaces] sequence, spaCy will
| assume that all words are whitespace delimited.
+code.
good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
assert bad_spaces.text == u'Hello , world !'
assert good_spaces.text == u'Hello, world!'
p
| Once you have a #[+api("doc") #[code Doc]] object, you can write to its
| attributes to set the part-of-speech tags, syntactic dependencies, named
| entities and other attributes. For details, see the respective usage
| pages.

View File

@ -0,0 +1,78 @@
//- 💫 DOCS > USAGE > PROCESSING TEXT
include ../../_includes/_mixins
+under-construction
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
| an iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
| that #[code .pipe()] will be significantly faster in most
| practical situations, because it allows shared memory parallelism.
+code.
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
pass
p
| To make full use of the #[code .pipe()] function, you might want to
| brush up on #[strong Python generators]. Here are a few quick hints:
+list
+item
| Generator comprehensions can be written as
| #[code (item for item in sequence)].
+item
| The
| #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
| and the
| #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
| provide a lot of handy #[strong generator tools].
+item
| Often you'll have an input stream that pairs text with some
| important meta data, e.g. a JSON document. To
| #[strong pair up the meta data] with the processed #[code Doc]
| object, you should use the #[code itertools.tee] function to split
| the generator in two, and then #[code izip] the extra stream to the
| document stream, as in the sketch below.
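p
| A rough sketch of this approach, assuming each record is a dictionary
| with #[code 'text'] and #[code 'meta'] keys:

+code.
from itertools import tee
records = [{'text': u'A text to process...', 'meta': {'id': 1}},
           {'text': u'Another text...', 'meta': {'id': 2}}]
texts, metas = tee(records)   # split the record stream in two
docs = nlp.pipe(r['text'] for r in texts)
for doc, record in zip(docs, metas):   # use itertools.izip on Python 2
    print(doc.text, record['meta']['id'])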
+h(2, "own-annotations") Bringing your own annotations
p
| spaCy generally assumes by default that your data is raw text. However,
| sometimes your data is partially annotated, e.g. with pre-existing
| tokenization, part-of-speech tags, etc. The most common situation is
| that you have pre-defined tokenization. If you have a list of strings,
| you can create a #[code Doc] object directly. Optionally, you can also
| specify a list of boolean values, indicating whether each word has a
| subsequent space.
+code.
doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
p
| If provided, the spaces list must be the same length as the words list.
| The spaces list affects the #[code doc.text], #[code span.text],
| #[code token.idx], #[code span.start_char] and #[code span.end_char]
| attributes. If you don't provide a #[code spaces] sequence, spaCy will
| assume that all words are whitespace delimited.
+code.
good_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'], spaces=[False, True, False, False])
bad_spaces = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'])
assert bad_spaces.text == u'Hello , world !'
assert good_spaces.text == u'Hello, world!'
p
| Once you have a #[+api("doc") #[code Doc]] object, you can write to its
| attributes to set the part-of-speech tags, syntactic dependencies, named
| entities and other attributes. For details, see the respective usage
| pages.
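p
| For example, you could assign part-of-speech tags directly to the
| tokens. This is only a minimal sketch; the tag values you set should
| match your tag map:

+code.
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'Hello', u'world', u'!'])
doc[0].tag_ = u'UH'   # assign fine-grained tags by hand
doc[1].tag_ = u'NN'
doc[2].tag_ = u'.'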

View File

@ -1,118 +0,0 @@
//- 💫 DOCS > USAGE > RESOURCES
include ../../_includes/_mixins
p Many of the associated tools and resources that we're developing alongside spaCy can be found in their own repositories.
+h(2, "developer") Developer tools
+table(["Name", "Description"])
+row
+cell
+src(gh("spacy-models")) spaCy Models
+cell
| Model releases for spaCy.
+row
+cell
+src(gh("spacy-dev-resources")) spaCy Dev Resources
+cell
| Scripts, tools and resources for developing spaCy, adding new
| languages and training new models.
+row
+cell
+src("spacy-benchmarks") spaCy Benchmarks
+cell
| Runtime performance comparison of spaCy against other NLP
| libraries.
+row
+cell
+src(gh("spacy-services")) spaCy Services
+cell
| REST microservices for spaCy demos and visualisers.
+row
+cell
+src(gh("spacy-notebooks")) spaCy Notebooks
+cell
| Jupyter notebooks for spaCy examples and tutorials.
+h(2, "libraries") Libraries and projects
+table(["Name", "Description"])
+row
+cell
+src(gh("sense2vec")) sense2vec
+cell
| Use spaCy to go beyond vanilla
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec].
+h(2, "utility") Utility libraries and dependencies
+table(["Name", "Description"])
+row
+cell
+src(gh("thinc")) Thinc
+cell
| spaCy's Machine Learning library for NLP in Python.
+row
+cell
+src(gh("cymem")) Cymem
+cell
| Gate Cython calls to malloc/free behind Python ref-counted
| objects.
+row
+cell
+src(gh("preshed")) Preshed
+cell
| Cython hash tables that assume keys are pre-hashed
+row
+cell
+src(gh("murmurhash")) MurmurHash
+cell
| Cython bindings for
| #[+a("https://en.wikipedia.org/wiki/MurmurHash") MurmurHash2].
+h(2, "visualizers") Visualisers and demos
+table(["Name", "Description"])
+row
+cell
+src(gh("displacy")) displaCy.js
+cell
| A lightweight dependency visualisation library for the modern
| web, built with JavaScript, CSS and SVG.
| #[+a(DEMOS_URL + "/displacy") Demo here].
+row
+cell
+src(gh("displacy-ent")) displaCy#[sup ENT]
+cell
| A lightweight and modern named entity visualisation library
| built with JavaScript and CSS.
| #[+a(DEMOS_URL + "/displacy-ent") Demo here].
+row
+cell
+src(gh("sense2vec-demo")) sense2vec Demo
+cell
| Source of our Semantic Analysis of the Reddit Hivemind
| #[+a(DEMOS_URL + "/sense2vec") demo] using
| #[+a(gh("sense2vec")) sense2vec].

View File

@ -11,7 +11,7 @@ p
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
+aside("What about \"real\" regular expressions?")
//-+aside("What about \"real\" regular expressions?")
+h(2, "adding-patterns") Adding patterns
@ -119,7 +119,7 @@ p
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
match_id, start, end = matches[i]
@ -221,7 +221,7 @@ p
+cell match 0 or 1 times
+cell optional, max one
+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations
+h(2, "example1") Example: Using linguistic annotations
p
| Let's say you're analysing user comments and you want to find out what
@ -283,7 +283,7 @@ p
# set manual=True to make displaCy render straight from a dictionary
displacy.serve(matched_sents, style='ent', manual=True)
+h(3, "quantifiers-example2") Quantifiers example: Phone numbers
+h(2, "example2") Example: Phone numbers
p
| Phone numbers can have many different formats and matching them is often
@ -320,3 +320,114 @@ p
| It'll produce more predictable results, is much easier to modify and
| extend, and doesn't require any training data, only a set of
| test cases.
+h(2, "example3") Example: Hashtags and emoji on social media
p
| Social media posts, especially tweets, can be difficult to work with.
| They're very short and often contain various emoji and hashtags. By only
| looking at the plain text, you'll lose a lot of valuable semantic
| information.
p
| Let's say you've extracted a large sample of social media posts on a
| specific topic, for example posts mentioning a brand name or product.
| As the first step of your data exploration, you want to filter out posts
| containing certain emoji and use them to assign a general sentiment
| score, based on whether the expressed emotion is positive or negative,
| e.g. #[span.o-icon.o-icon--inline 😀] or #[span.o-icon.o-icon--inline 😞].
| You also want to find, merge and label hashtags like
| #[code #MondayMotivation], to be able to ignore or analyse them later.
+aside("Note on sentiment analysis")
| Ultimately, sentiment analysis is not always #[em that] easy. In
| addition to the emoji, you'll also want to take specific words into
| account and check the #[code subtree] for intensifiers like "very", to
| increase the sentiment score. At some point, you might also want to train
| a sentiment model. However, the approach described in this example is
| very useful for #[strong bootstrapping rules to collect training data].
| It's also an incredibly fast way to gather first insights into your data:
| with about 1 million tweets, you'd be looking at a processing time of
| #[strong under 1 minute].
p
| By default, spaCy's tokenizer will split emoji into separate tokens. This
| means that you can create a pattern for one or more emoji tokens. In this
| case, a sequence of identical emoji should be treated as one instance.
| Valid hashtags usually consist of a #[code #], plus a sequence of
| ASCII characters with no whitespace, making them easy to match as well.
+code.
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp = English() # we only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # positive emoji
neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒'] # negative emoji
# add patterns to match one or more emoji tokens
pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji]
neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji]
matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}])
p
| Because the #[code on_match] callback receives the ID of each match, you
| can use the same function to handle the sentiment assignment for both
| the positive and negative pattern. To keep it simple, we'll either add
| or subtract #[code 0.1] points. This way, the score will also reflect
| combinations of emoji, even positive #[em and] negative ones.
p
| With a library like
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji. For example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it
| available as #[code token.norm_].
+code.
from emojipedia import Emojipedia # installation: pip install emojipedia
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if doc.vocab.strings[match_id] == 'HAPPY':
doc.sentiment += 0.1 # add 0.1 for positive sentiment
elif doc.vocab.strings[match_id] == 'SAD':
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji
span.merge(norm=emoji.title) # merge span and set NORM to emoji title
p
| To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag.
+code.
# Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches):
match_id, start, end = matches[i]
span = doc[start : end]
span.merge() # merge hashtag
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True
p
| To process a stream of social media posts, we can use
| #[+api("language#pipe") #[code Language.pipe()]], which will return a
| stream of #[code Doc] objects that we can pass to
| #[+api("matcher#pipe") #[code Matcher.pipe()]].
+code.
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)

View File

@ -1,45 +1,87 @@
include ../../_includes/_mixins
+h(2, "101") Serialization 101
include _spacy-101/_serialization
+infobox("Important note")
| In spaCy v2.0, the API for saving and loading has changed to only use the
| four methods listed above consistently across objects and classes. For an
| overview of the changes, see #[+a("/docs/usage/v2#incompat") this table]
| and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating].
+h(3, "example-doc") Example: Saving and loading a document
p
| For simplicity, let's assume you've
| #[+a("/docs/usage/entity-recognition#setting") added custom entities] to
| a #[code Doc], either manually, or by using a
| #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can
| save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]],
| and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]].
| This will overwrite the existing object and return it.
+code.
import spacy
from spacy.tokens import Span
text = u'Netflix is hiring a new VP of global policy'
nlp = spacy.load('en')
doc = nlp(text)
assert len(doc.ents) == 0 # Doc has no entities
doc.ents += (Span(doc, 0, 1, label=doc.vocab.strings[u'ORG']),) # add entity
doc.to_disk('/path/to/doc') # save Doc to disk
new_doc = nlp(text)
assert len(new_doc.ents) == 0 # new Doc has no entities
new_doc = new_doc.from_disk('/path/to/doc') # load from disk and overwrite
assert len(new_doc.ents) == 1 # entity is now recognised!
assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')]
+h(2, "models") Saving models
p
| After training your model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#save_to_directory") #[code Language.save_to_directory()]]
| #[+api("language#to_disk") #[code Language.to_disk()]]
| method:
+code.
nlp.save_to_directory('/home/me/data/en_example_model')
nlp.to_disk('/home/me/data/en_example_model')
p
| The directory will be created if it doesn't exist, and the whole pipeline
| will be written out. To make the model more convenient to deploy, we
| recommend wrapping it as a Python package.
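p
| To double-check that everything was written out correctly, you can
| point #[+api("spacy#load") #[code spacy.load()]] at the directory. A
| minimal round trip, reusing the path from above, might look like this:
+code.
    import spacy
    nlp = spacy.load('/home/me/data/en_example_model')
    doc = nlp(u'This is a sentence.')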
+h(2, "generating") Generating a model package
+h(3, "models-generating") Generating a model package
+infobox("Important note")
| The model packages are #[strong not suitable] for the public
| #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
| designed for binary data and files over 50 MB. However, if your company
| is running an internal installation of pypi, publishing your models on
| there can be a convenient solution to share them with your team.
| is running an #[strong internal installation] of PyPI, publishing your
| models on there can be a convenient way to share them with your team.
p
| spaCy comes with a handy CLI command that will create all required files,
| and walk you through generating the meta data. You can also create the
| meta.json manually and place it in the model data directory, or supply a
| path to it using the #[code --meta] flag. For more info on this, see the
| #[+a("/docs/usage/cli/#package") #[code package] command] documentation.
| path to it using the #[code --meta] flag. For more info on this, see
| the #[+api("cli#package") #[code package]] docs.
+aside-code("meta.json", "json").
{
"name": "example_model",
"lang": "en",
"version": "1.0.0",
"spacy_version": "&gt;=1.7.0,&lt;2.0.0",
"spacy_version": "&gt;=2.0.0,&lt;3.0.0",
"description": "Example model for spaCy",
"author": "You",
"email": "you@example.com",
"license": "CC BY-SA 3.0"
"license": "CC BY-SA 3.0",
"pipeline": ["token_vectors", "tagger"]
}
+code(false, "bash").
@ -58,52 +100,112 @@ p This command will create a model package directory that should look like this:
p
| You can also find templates for all files in our
| #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
| #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
| If you're creating the package manually, keep in mind that the directories
| need to be named according to the naming conventions of
| #[code [language]_[name]] and #[code [language]_[name]-[version]]. The
| #[code lang] setting in the meta.json is also used to create the
| respective #[code Language] class in spaCy, which will later be returned
| by the model's #[code load()] method.
| #[code lang_name] and #[code lang_name-version].
+h(2, "building") Building a model package
+h(3, "models-custom") Customising the model setup
p
| The meta.json includes the model details, like name, requirements and
| license, and lets you customise how the model should be initialised and
| loaded. You can define the language data to be loaded and the
| #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to
| execute.
+table(["Setting", "Type", "Description"])
+row
+cell #[code lang]
+cell unicode
+cell ID of the language class to initialise.
+row
+cell #[code pipeline]
+cell list
+cell
| A list of strings mapping to the IDs of pipeline factories to
| apply in that order. If not set, spaCy's
| #[+a("/docs/usage/language-processing/pipelines") default pipeline]
| will be used.
p
| The #[code load()] method that comes with our model package
| templates will take care of putting all this together and returning a
| #[code Language] object with the loaded pipeline and data. If your model
| requires custom pipeline components, you should
| #[strong ship them with your model] and register their
| #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories]
| via #[+api("spacy#set_factory") #[code set_factory()]].
+aside-code("Factory example").
def my_factory(vocab):
# load some state
def my_component(doc):
# process the doc
return doc
return my_component
+code.
spacy.set_factory('custom_component', custom_component_factory)
+infobox("Custom models with pipeline components")
| For more details and an example of how to package a sentiment model
| with a custom pipeline component, see the usage workflow on
| #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].
+h(3, "models-building") Building the model package
p
| To build the package, run the following command from within the
| directory. This will create a #[code .tar.gz] archive in a directory
| #[code /dist].
| directory. For more information on building Python packages, see the
| docs on Python's
| #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools].
+code(false, "bash").
python setup.py sdist
p
| For more information on building Python packages, see the
| #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
+h(2, "loading") Loading a model package
p
| Model packages can be installed by pointing pip to the model's
| #[code .tar.gz] archive:
| This will create a #[code .tar.gz] archive in a directory #[code /dist].
| The model can be installed by pointing pip to the path of the archive:
+code(false, "bash").
pip install /path/to/en_example_model-1.0.0.tar.gz
p You'll then be able to load the model as follows:
p
| You can then load the model via its name, #[code en_example_model], or
| import it directly as a module and then call its #[code load()] method.
+code.
import en_example_model
nlp = en_example_model.load()
+h(2, "loading") Loading a custom model package
p
| To load the model via #[code spacy.load()], you can also
| create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
| package name to a custom model name of your choice:
+code(false, "bash").
python -m spacy link en_example_model example
| To load a model from a data directory, you can use
| #[+api("spacy#load") #[code spacy.load()]] with the local path. This will
| look for a meta.json in the directory and use the #[code lang] and
| #[code pipeline] settings to initialise a #[code Language] class with a
| processing pipeline and load in the model data.
+code.
import spacy
nlp = spacy.load('example')
nlp = spacy.load('/path/to/model')
p
| If you want to #[strong load only the binary data], you'll have to create
| a #[code Language] class and call
| #[+api("language#from_disk") #[code from_disk]] instead.
+code.
from spacy.lang.en import English
nlp = English().from_disk('/path/to/data')
+infobox("Important note: Loading data in v2.x")
.o-block
| In spaCy 1.x, the distinction between #[code spacy.load()] and the
| #[code Language] class constructor was quite unclear. You could call
| #[code spacy.load()] when no model was present, and it would silently
| return an empty object. Likewise, you could pass a path to
| #[code English], even if the model required a different language.
| spaCy v2.0 solves this with a clear distinction between setting up
| the instance and loading the data.
+code-new nlp = English().from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')

View File

@ -2,9 +2,256 @@
include ../../_includes/_mixins
+h(2, "features") Features
+under-construction
+aside
| If one of spaCy's functionalities #[strong needs a model], it means that
| you need to have one of the available
| #[+a("/docs/usage/models") statistical models] installed. Models are used
| to #[strong predict] linguistic annotations. For example, whether a word is
| a verb or a noun.
+table(["Name", "Description", "Needs model"])
+row
+cell #[strong Tokenization]
+cell
+cell #[+procon("con")]
+row
+cell #[strong Part-of-speech Tagging]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Dependency Parsing]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Sentence Boundary Detection]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Named Entity Recognition] (NER)
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
+cell #[+procon("con")]
+row
+cell #[strong Similarity]
+cell
+cell #[+procon("pro")]
+row
+cell #[strong Training]
+cell
+cell #[+procon("neutral")]
+row
+cell #[strong Serialization]
+cell
+cell #[+procon("neutral")]
+h(2, "annotations") Linguistic annotations
p
| spaCy provides a variety of linguistic annotations to give you insights
| into a text's grammatical structure. This includes the word types,
| i.e. the parts of speech, and how the words are related to each other.
| For example, if you're analysing text, it makes a huge difference
| whether a noun is the subject of a sentence or the object, or whether
| "google" is used as a verb or refers to the website or company in a
| specific context.
p
| Once you've downloaded and installed a #[+a("/docs/usage/models") model],
| you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will
| return a #[code Language] object containing all components and data needed
| to process text. We usually call it #[code nlp]. Calling the #[code nlp]
| object on a string of text will return a processed #[code Doc]:
+code.
import spacy
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
p
| Even though a #[code Doc] is processed, e.g. split into individual words
| and annotated, it still holds #[strong all information of the original text],
| like whitespace characters. This way, you'll never lose any information
| when processing text with spaCy.
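p
| As a small illustration, reusing the #[code doc] created above, the
| original text can always be reconstructed from the tokens and their
| trailing whitespace:
+code.
    assert doc.text == u'Apple is looking at buying U.K. startup for $1 billion'
    assert doc[0].text == u'Apple'
    assert ''.join(token.text_with_ws for token in doc) == doc.text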
+h(3, "annotations-token") Tokenization
include _spacy-101/_tokenization
+infobox
| To learn more about how spaCy's tokenization rules work in detail,
| how to #[strong customise and replace] the default tokenizer and how to
| #[strong add language-specific data], see the usage guides on
| #[+a("/docs/usage/adding-languages") adding languages] and
| #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
+tag-model("dependency parse")
include _spacy-101/_pos-deps
+infobox
| To learn more about #[strong part-of-speech tagging] and rule-based
| morphology, and how to #[strong navigate and use the parse tree]
| effectively, see the usage guides on
| #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
| #[+a("/docs/usage/dependency-parse") using the dependency parse].
+h(3, "annotations-ner") Named Entities
+tag-model("named entities")
include _spacy-101/_named-entities
+infobox
| To learn more about entity recognition in spaCy, how to
| #[strong add your own entities] to a document and how to
| #[strong train and update] the entity predictions of a model, see the
| usage guides on
| #[+a("/docs/usage/entity-recognition") named entity recognition] and
| #[+a("/docs/usage/training-ner") training the named entity recognizer].
+h(2, "vectors-similarity") Word vectors and similarity
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+infobox
| To learn more about word vectors, how to #[strong customise them] and
| how to load #[strong your own vectors] into spaCy, see the usage
| guide on
| #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+h(2, "pipelines") Pipelines
include _spacy-101/_pipelines
+infobox
| To learn more about #[strong how processing pipelines work] in detail,
| how to enable and disable their components, and how to
| #[strong create your own], see the usage guide on
| #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+h(2, "vocab-stringstore") Vocab, lexemes and the string store
include _spacy-101/_vocab-stringstore
+h(2, "serialization") Serialization
include _spacy-101/_serialization
+infobox
| To learn more about #[strong serialization] and how to
| #[strong save and load your own models], see the usage guide on
| #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+h(2, "training") Training
include _spacy-101/_training
+h(2, "architecture") Architecture
+under-construction
+image
include ../../assets/img/docs/architecture.svg
.u-text-right
+button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic
+table(["Name", "Description"])
+row
+cell #[+api("language") #[code Language]]
+cell
| A text-processing pipeline. Usually you'll load this once per
| process as #[code nlp] and pass the instance around your application.
+row
+cell #[+api("doc") #[code Doc]]
+cell A container for accessing linguistic annotations.
+row
+cell #[+api("span") #[code Span]]
+cell A slice from a #[code Doc] object.
+row
+cell #[+api("token") #[code Token]]
+cell
| An individual token — i.e. a word, punctuation symbol, whitespace,
| etc.
+row
+cell #[+api("lexeme") #[code Lexeme]]
+cell
| An entry in the vocabulary. It's a word type with no context, as
| opposed to a word token. It therefore has no part-of-speech tag,
| dependency parse etc.
+row
+cell #[+api("vocab") #[code Vocab]]
+cell
| A lookup table for the vocabulary that allows you to access
| #[code Lexeme] objects.
+row
+cell #[code Morphology]
+cell
+row
+cell #[+api("stringstore") #[code StringStore]]
+cell Map strings to and from integer IDs.
+row
+cell #[+api("tokenizer") #[code Tokenizer]]
+cell
| Segment text, and create #[code Doc] objects with the discovered
| segment boundaries.
+row
+cell #[+api("tagger") #[code Tagger]]
+cell Annotate part-of-speech tags on #[code Doc] objects.
+row
+cell #[+api("dependencyparser") #[code DependencyParser]]
+cell Annotate syntactic dependencies on #[code Doc] objects.
+row
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
+cell
| Annotate named entities, e.g. persons or products, on #[code Doc]
| objects.
+row
+cell #[+api("matcher") #[code Matcher]]
+cell
| Match sequences of tokens, based on pattern rules, similar to
| regular expressions.
+h(3, "architecture-other") Other
+table(["Name", "Description"])
+row
+cell #[+api("goldparse") #[code GoldParse]]
+cell Collection for training annotations.
+row
+cell #[+api("goldcorpus") #[code GoldCorpus]]
+cell
| An annotated corpus, using the JSON file format. Manages
| annotations for tagging, dependency parsing and NER.

View File

@ -64,44 +64,10 @@ p
| predicts the new category with minimal difference from the previous
| output.
+h(2, "saving-loading") Saving and loading
p
| After training our model, you'll usually want to save its state, and load
| it back later. You can do this with the #[code Language.save_to_directory()]
| method:
+code.
nlp.save_to_directory('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
| to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading") saving and loading models].
p
| After you've generated and installed the package, you'll be able to
| load the model as follows:
+code.
import en_technology
nlp = en_technology.load()
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
+under-construction
p
| This script shows how to add a new entity type to an existing pre-trained
| NER model. To keep the example short and simple, only four sentences are
@ -170,5 +136,33 @@ p
p
| After training your model, you can
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
| models as Python packages, for ease of deployment.
| #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
| wrapping models as Python packages, for ease of deployment.
+h(2, "saving-loading") Saving and loading
p
| After training your model, you'll usually want to save its state, and load
| it back later. You can do this with the
| #[+api("language#to_disk") #[code Language.to_disk()]] method:
+code.
nlp.to_disk('/home/me/data/en_technology')
p
| To make the model more convenient to deploy, we recommend wrapping it as
| a Python package, so that you can install it via pip and load it as a
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
| CLI command to create all required files and directories.
+code(false, "bash").
python -m spacy package /home/me/data/en_technology /home/me/my_models
p
| To build the package and create a #[code .tar.gz] archive, run
| #[code python setup.py sdist] from within its directory.
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/docs/usage/saving-loading#models") saving and loading models].

View File

@ -6,6 +6,10 @@ p
| Once the model is trained, you can then
| #[+a("/docs/usage/saving-loading") save and load] it.
+h(2, "101") Training 101
include _spacy-101/_training
+h(2, "train-pos-tagger") Training the part-of-speech tagger
+code.
@ -77,59 +81,3 @@ p.o-inline-list
p
+button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
+h(2, "feature-templates") Customizing the feature extraction
p
| spaCy currently uses linear models for the tagger, parser and entity
| recognizer, with weights learned using the
| #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
+aside("Linear Model Feature Scheme")
| For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
p
| Because it's a linear model, it's important for accuracy to build
| conjunction features out of the atomic predictors. Let's say you have
| two atomic predictors asking, "What is the part-of-speech of the
| previous token?", and "What is the part-of-speech of the previous
| previous token?". These predictors will introduce a number of features,
| e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
| template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
p
| The feature extraction proceeds in two passes. In the first pass, we
| fill an array with the values of all of the atomic predictors. In the
| second pass, we iterate over the feature templates, and fill a small
| temporary array with the predictors that will be combined into a
| conjunction feature. Finally, we hash this array into a 64-bit integer,
| using the MurmurHash algorithm. You can see this at work in the
| #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
p
| It's very easy to change the feature templates, to create novel
| combinations of the existing atomic predictors. There's currently no API
| available to add new atomic predictors, though. You'll have to create a
| subclass of the model, and write your own #[code set_featuresC] method.
p
| The feature templates are passed in using the #[code features] keyword
| argument to the constructors of the #[+api("tagger") #[code Tagger]],
| #[+api("dependencyparser") #[code DependencyParser]] and
| #[+api("entityrecognizer") #[code EntityRecognizer]]:
+code.
from spacy.vocab import Vocab
from spacy.pipeline import Tagger
from spacy.tagger import P2_orth, P1_orth
from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
(P2_orth,), (P1_orth,), (W_orth,),
(N1_orth,), (N2_orth,)])
p
| Custom feature templates can be passed to the #[code DependencyParser]
| and #[code EntityRecognizer] as well, also using the #[code features]
| keyword argument of the constructor.

View File

@ -1,190 +0,0 @@
//- 💫 DOCS > USAGE > TROUBLESHOOTING
include ../../_includes/_mixins
p
| This section collects some of the most common errors you may come
| across when installing, loading and using spaCy, as well as their solutions.
+aside("Help us improve this guide")
| Did you come across a problem like the ones listed here and want to
| share the solution? You can find the "Suggest edits" button at the
| bottom of this page that points you to the source. We always
| appreciate #[+a(gh("spaCy") + "/pulls") pull requests]!
+h(2, "install-loading") Installation and loading
+h(3, "compatible-model") No compatible model found
+code(false, "text").
No compatible model found for [lang] (spaCy v#{SPACY_VERSION}).
p
| This usually means that the model you're trying to download does not
| exist, or isn't available for your version of spaCy.
+infobox("Solutions")
| Check the #[+a(gh("spacy-models", "compatibility.json")) compatibility table]
| to see which models are available for your spaCy version. If you're using
| an old version, consider upgrading to the latest release. Note that while
| spaCy supports tokenization for
| #[+a("/docs/api/language-models/#alpha-support") a variety of languages],
| not all of them come with statistical models. To only use the tokenizer,
| import the language's #[code Language] class instead, for example
| #[code from spacy.fr import French].
+h(3, "symlink-privilege") Symbolic link privilege not held
+code(false, "text").
OSError: symbolic link privilege not held
p
| To create #[+a("/docs/usage/models/#usage") shortcut links] that let you
| load models by name, spaCy creates a symbolic link in the
| #[code spacy/data] directory. This means your user needs permission to do
| this. The above error mostly occurs when doing a system-wide installation,
| which will create the symlinks in a system directory.
+infobox("Solutions")
| Run the #[code download] or #[code link] command as administrator,
| or use a #[code virtualenv] to install spaCy in a user directory, instead
| of doing a system-wide installation.
+h(3, "no-cache-dir") No such option: --no-cache-dir
+code(false, "text").
no such option: --no-cache-dir
p
| The #[code download] command uses pip to install the models and sets the
| #[code --no-cache-dir] flag to prevent it from requiring too much memory.
| #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting]
| requires pip v6.0 or newer.
+infobox("Solution")
| Run #[code pip install -U pip] to upgrade to the latest version of pip.
| To see which version you have installed, run #[code pip --version].
+h(3, "import-error") Import error
+code(false, "text").
Import Error: No module named spacy
p
| This error means that the spaCy module can't be located on your system, or in
| your environment.
+infobox("Solutions")
| Make sure you have spaCy installed. If you're using a #[code virtualenv],
| make sure it's activated and check that spaCy is installed in that
| environment. Otherwise, you're trying to load a system installation. You
| can also run #[code which python] to find out where your Python
| executable is located.
+h(3, "import-error-models") Import error: models
+code(false, "text").
ImportError: No module named 'en_core_web_sm'
p
| As of spaCy v1.7, all models can be installed as Python packages. This means
| that they'll become importable modules of your application. When creating
| #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try
| to import the model to load its meta data. If this fails, it's usually a
| sign that the package is not installed in the current environment.
+infobox("Solutions")
| Run #[code pip list] or #[code pip freeze] to check which model packages
| you have installed, and install the
| #[+a("/docs/usage/models#available") correct models] if necessary. If you're
| importing a model manually at the top of a file, make sure to use the name
| of the package, not the shortcut link you've created.
+h(3, "vocab-strings") File not found: vocab/strings.json
+code(false, "text").
FileNotFoundError: No such file or directory: [...]/vocab/strings.json
p
| This error may occur when using #[code spacy.load()] to load
| a language model either because you haven't set up a
| #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it
| doesn't actually exist.
+infobox("Solutions")
| Set up a #[+a("/docs/usage/models/#usage") shortcut link] for the model
| you want to load. This can either be an installed model package, or a
| local directory containing the model data. If you want to use one of the
| #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for
| languages that don't yet have a statistical model, you should import its
| #[code Language] class instead, for example
| #[code from spacy.fr import French].
+h(3, "command-not-found") Command not found
+code(false, "text").
command not found: spacy
p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to your #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Instead, commands need to be prefixed with
| #[code python -m].
+infobox("Solution")
| Run the command with #[code python -m], for example
| #[code python -m spacy download en]. For more info on this, see the
| #[+a("/docs/usage/cli") CLI documentation].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy].
+infobox("Solution")
| When using spaCy, never call anything else #[code spacy].
+h(2, "usage") Using spaCy
+h(3, "pos-lemma-number") POS tag or lemma is returned as number
+code.
doc = nlp(u'This is text.')
print([word.pos for word in doc])
# [88, 98, 90, 95]
p
| Like many NLP libraries, spaCy encodes all strings to integers. This
| reduces memory usage and improves efficiency. The integer mapping also
| makes it easy to interoperate with numpy. To access the string
| representation instead of the integer ID, add an underscore #[code _]
| after the attribute.
+infobox("Solutions")
| Use #[code pos_] or #[code lemma_] instead. See the
| #[+api("token#attributes") #[code Token] attributes] for a list of available
| attributes and their string representations.
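p
| For example, with the string attributes (the exact tags depend on the
| model you've loaded):
+code.
    doc = nlp(u'This is text.')
    print([word.pos_ for word in doc])
    # [u'DET', u'VERB', u'NOUN', u'PUNCT']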
+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-]
+code.
doc = nlp(u'They are')
print(doc[0].lemma_)
# -PRON-
p
| This is in fact expected behaviour and not a bug.
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns. For more info on this, see the
| #[+api("annotation#lemmatization") annotation specs] on lemmatization.

View File

@ -8,6 +8,65 @@ p
+h(2, "features") New features
+h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example").
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
# Register a factory to create a component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', mycomponent])
p
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
|  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them. Simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
+image
include ../../assets/img/docs/pipeline.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
nlp.to_disk('/path/to/nlp')
nlp = English().from_disk('/path/to/nlp')
p
| spaCy's serialization API has been made consistent across classes and
| objects. All container classes, i.e. #[code Language], #[code Doc],
| #[code Vocab] and #[code StringStore], now have #[code to_bytes()],
| #[code from_bytes()], #[code to_disk()] and #[code from_disk()] methods
| that support the Pickle protocol.
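p
| For instance, a quick byte-level round trip, sketched here with a
| #[code Doc] and assuming an #[code nlp] object loaded as above; the
| same pattern applies to the other containers:
+code.
    from spacy.tokens import Doc
    doc = nlp(u'Give it back! He pleaded.')
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc.text == doc.text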
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package] or a path.
| The #[code Language] class to initialise will be determined based on the
| model's settings. For a blank language, you can import the class directly,
| e.g. #[code from spacy.lang.en import English].
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
@ -28,78 +87,32 @@ p
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-loading") Loading
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+h(3, "features-language") Improved language data and lazy loading
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code path] keyword argument is now deprecated.
| Language-specific data now lives in its own submodule, #[code spacy.lang].
| Languages are lazy-loaded, i.e. only loaded when you import a
| #[code Language] class, or load a model that initialises one. This allows
| languages to contain more custom data, e.g. lemmatizer lookup tables, or
| complex regular expressions. The language data has also been tidied up
| and simplified. spaCy now also supports simple lookup-based lemmatization.
p
| The #[code Language] class to initialise will be determined based on the
| model's settings. If no model is found, spaCy will let you know and won't
| just return an empty #[code Language] object anymore. If you want a blank
| language, you can always import the class directly, e.g.
| #[code from spacy.lang.en import English].
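p
| For example, a blank pipeline will tokenize, but won't predict tags,
| parses or entities:
+code.
    from spacy.lang.en import English
    nlp = English()  # no model data loaded
    doc = nlp(u'This is a sentence.')
    assert len(doc) == 5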
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-language") Improved language data and processing pipelines
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
+aside-code("Example").
LOOKUP = {
"aba": "abar",
"ababa": "abar",
"ababais": "abar",
"ababan": "abar",
"ababanes": "ababán"
}
p
| spaCy now supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma. To determine a token's
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
| the lookup table, and should be returned by the #[code create_lemmatizer]
| classmethod of the language's defaults.
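p
| A rough sketch of wiring this up for a custom language, following the
| description above; the subclass names are placeholders, and the exact
| #[code create_lemmatizer] signature may differ in your version:
+code.
    from spacy.lemmatizerlookup import Lemmatizer
    from spacy.lang.es import Spanish
    class CustomSpanishDefaults(Spanish.Defaults):
        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOKUP)  # LOOKUP is the table shown above
    class CustomSpanish(Spanish):
        Defaults = CustomSpanishDefaults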
+image
include ../../assets/img/docs/language_data.svg
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
            [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
            [{LOWER: 'hello'}, {LOWER: 'world'}])
matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
assert len(matcher) == 2
assert 'HelloWorld' in matcher
assert 'HEARTS' in matcher
p
| Patterns can now be added to the matcher by calling
@ -113,12 +126,6 @@ p
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-serializer") Serialization
+infobox
| #[strong API:] #[+api("serializer") #[code Serializer]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-models") Neural network models for English, German, French and Spanish
+infobox
@ -128,33 +135,25 @@ p
+h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell
| #[code spacy.en]
| #[code spacy.xx]
+cell
| #[code spacy.lang.en]
| #[code spacy.lang.xx]
+row
+cell #[code spacy.orth]
+cell #[code spacy.lang.xx.lex_attrs]
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+cell #[code Language.create_make_doc]
+cell #[+api("language#attributes") #[code Language.tokenizer]]
+row
+cell
@ -188,6 +187,28 @@ p
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Tokenizer.load]
+cell -
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell #[code Matcher.load]
+cell -
@ -208,12 +229,100 @@ p
+row
+cell #[code Doc.read_bytes]
+cell
+cell #[+api("binder") #[code Binder]]
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+h(2, "migrating") Migrating from spaCy 1.x
+list
+item Saving, loading and serialization.
+item Processing pipelines and language data.
+item Adding patterns and callbacks to the matcher.
+item Models trained with spaCy 1.x.
+infobox("Some tips")
| Before migrating, we strongly recommend writing a few
| #[strong simple tests] specific to how you're using spaCy in your
| application. This makes it easier to check whether your code requires
| changes, and if so, which parts are affected.
| (By the way, feel free to contribute your tests to
| #[+src(gh("spaCy", "spacy/tests")) our test suite]. This will also ensure
| we never accidentally introduce a bug in a workflow that's
| important to you.) If you've trained your own models, keep in mind that
| your train and runtime inputs must match. This means you'll have to
| #[strong retrain your models] with spaCy v2.0 to make them compatible.
+h(3, "migrating-saving-loading") Saving, loading and serialization
p
| Double-check all calls to #[code spacy.load()] and make sure they don't
| use the #[code path] keyword argument. If you're only loading in binary
| data and not a model package that can construct its own #[code Language]
| class and pipeline, you should now use the
| #[+api("language#from_disk") #[code Language.from_disk()]] method.
+code-new.
nlp = spacy.load('/model')
nlp = English().from_disk('/model/data')
+code-old nlp = spacy.load('en', path='/model')
p
| Review all other code that writes state to disk or bytes.
| All containers now share the same, consistent API for saving and
| loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and
| loading with #[code from_disk()] and #[code from_bytes()].
+code-new.
nlp.to_disk('/model')
nlp.vocab.to_disk('/vocab')
+code-old.
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
+h(3, "migrating-languages") Processing pipelines and language data
p
| If you're importing language data or #[code Language] classes, make sure
| to change your import statements to import from #[code spacy.lang]. If
| you've added your own custom language, it needs to be moved to
| #[code spacy/lang/xx] and adjusted accordingly.
+code-new from spacy.lang.en import English
+code-old from spacy.en import English
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines].
| Appending functions to the pipeline still works but you might be able
| to make this more convenient by registering "component factories".
| Components of the processing pipeline can now be disabled by passing a
| list of their names to the #[code disable] keyword argument on loading
| or processing.
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)
+h(3, "migrating-matcher") Adding patterns and callbacks to the matcher
p
| If you're using the matcher, you can now add patterns in one step. This
| should be easy to update. Simply merge the ID, callback and patterns
| into one call to #[+api("matcher#add") #[code matcher.add]].
+code-new.
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])
+code-old.
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
+h(3, "migrating-models") Trained models

View File

@ -180,8 +180,8 @@ p
p
| If you don't need the web server and just want to generate the markup,
| for example, to export it to a file or serve it in a custom
| way you can use #[+api("displacy#render") #[code displacy.render]]
| instead. It works the same, but returns a string containing the markup.
| way, you can use #[+api("displacy#render") #[code displacy.render]].
| It works the same way, but returns a string containing the markup.
+code("Example").
import spacy
@ -220,10 +220,32 @@ p
| a standalone graphic.) So instead of rendering all #[code Doc]s at once,
| loop over them and export them separately.
+h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
    doc = nlp(sent)
    svg = displacy.render(doc, style='dep')
    file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
    output_path = Path('/images/' + file_name)
    output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and write
| them to two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "jupyter") Using displaCy in Jupyter notebooks
p
| displaCy is able to detect whether you're within a
| displaCy is able to detect whether you're working in a
| #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
| that can be rendered in a cell straight away. When you export your
| notebook, the visualizations will be included as HTML.
@ -257,28 +279,6 @@ p
html = displacy.render(doc, style='dep')
return display(HTML(html))
+h(2, "examples") Usage examples
+h(3, "examples-export-svg") Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
    doc = nlp(sent)
    svg = displacy.render(doc, style='dep')
    file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
    output_path = Path('/images/' + file_name)
    output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and write
| them to two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "manual-usage") Rendering data manually
p
@ -314,3 +314,62 @@ p
'text': 'But Google is starting from behind.',
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
'title': None
}
+h(2, "webapp") Using displaCy in a web application
p
| If you want to use the visualizers as part of a web application, for
| example to create something like our
| #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to
| simply wrap and serve the displaCy renderer. Instead, you should only
| rely on the server to perform spaCy's processing capabilities, and use
| #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output.
+aside("Why not return the HTML by the server?")
| It's certainly possible to just have your server return the markup.
| But outputting raw, unsanitised HTML is risky and makes your app vulnerable to
| #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting]
| (XSS). All your user needs to do is find a way to make spaCy return one
| token #[code &lt;script src="malicious-code.js"&gt;&lt;/script&gt;].
| Instead of relying on the server to render and sanitize HTML, you
| can do this on the client in JavaScript. displaCy.js creates
| the markup as DOM nodes and will never insert raw HTML.
p
| The #[code parse_deps] function takes a #[code Doc] object and returns
| a dictionary in a format that can be rendered by displaCy.
+code("Example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
def displacy_service(text):
doc = nlp(text)
return displacy.parse_deps(doc)
p
| Using a library like #[+a("https://falconframework.org/") Falcon] or
| #[+a("http://www.hug.rest/") Hug], you can easily turn the above code
| into a simple REST API that receives a text and returns a JSON-formatted
| parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and
| initialise it with the API URL and the ID or query selector of the
| container to render the visualisation in, e.g. #[code '#displacy'] for
| #[code &lt;div id="displacy"&gt;].
+code("script.js", "javascript").
var displacy = new displaCy('http://localhost:8080', {
container: '#displacy'
})
function parse(text) {
displacy.parse(text);
}
p
| When you call #[code parse()], it will make a request to your API,
| receive the JSON-formatted parse and render it in your container. To
| create an interactive experience, you could trigger this function by
| a button and read the text from an #[code &lt;input&gt;] field.

View File

@ -6,61 +6,37 @@ p
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
| family of algorithms.
+aside("Tip")
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
p
| spaCy makes using word vectors very easy. The
| #[+api("lexeme") #[code Lexeme]], #[+api("token") #[code Token]],
| #[+api("span") #[code Span]] and #[+api("doc") #[code Doc]] classes all
| have a #[code .vector] property, which is a 1-dimensional numpy array of
| 32-bit floats:
+code.
import numpy
apples, and_, oranges = nlp(u'apples and oranges')
print(apples.vector.shape)
# (300,)
print(apples.similarity(oranges))
p
| By default, #[code Token.vector] returns the vector for its underlying
| lexeme, while #[code Doc.vector] and #[code Span.vector] return an
| average of the vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+aside-code("Example").
# TODO
p
| The default English model installs vectors for one million vocabulary
| entries, using the 300-dimensional vectors trained on the Common Crawl
| family of algorithms. The default
| #[+a("/docs/usage/models#available") English model] installs
| 300-dimensional vectors trained on the Common Crawl
| corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe]
| algorithm. The GloVe common crawl vectors have become a de facto
| standard for practical NLP.
+aside-code("Example").
# TODO
+aside("Tip: Training a word2vec model")
| If you need to train a word2vec model, we recommend the implementation in
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+h(2, "101") Similarity and word vectors 101
+tag-model("vectors")
include _spacy-101/_similarity
include _spacy-101/_word-vectors
+h(2, "custom") Customising word vectors
+under-construction
p
| You can load new word vectors from a file-like buffer using the
| #[code vocab.load_vectors()] method. The file should be a
| whitespace-delimited text file, where the word is in the first column,
| and subsequent columns provide the vector data. For faster loading, you
| can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
| path to a binary file written by #[code vocab.dump_vectors()].
| By default, #[+api("token#vector") #[code Token.vector]] returns the
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
| #[+api("doc#vector") #[code Doc.vector]] and
| #[+api("span#vector") #[code Span.vector]] return an average of the
| vectors of their tokens. You can customize these
| behaviours by modifying the #[code doc.user_hooks],
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
| dictionaries.
+aside-code("Example").
# TODO
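p
| As a minimal illustration of the hook mechanism, not something you'd
| usually do in practice, you could make a document's vector default to
| the vector of its first token:
+code.
    doc = nlp(u'apples and oranges')
    doc.user_hooks['vector'] = lambda doc: doc[0].vector
    assert (doc.vector == doc[0].vector).all()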
+h(2, "similarity") Similarity
p
| You can also load vectors from memory, by writing to the #[code lexeme.vector]
| property. If the vectors you are writing are of different dimensionality
| from the ones currently loaded, you should first call
| #[code vocab.resize_vectors(new_size)].
+under-construction