mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
Merge remote-tracking branch 'upstream/develop' into indonesian
This commit is contained in:
commit
7ae45bffcf
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -40,7 +40,6 @@ venv/
|
||||||
|
|
||||||
# Distribution / packaging
|
# Distribution / packaging
|
||||||
env/
|
env/
|
||||||
bin/
|
|
||||||
build/
|
build/
|
||||||
develop-eggs/
|
develop-eggs/
|
||||||
dist/
|
dist/
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.rst
|
include README.rst
|
||||||
|
include bin/spacy
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -187,6 +187,7 @@ def setup_package():
|
||||||
url=about['__uri__'],
|
url=about['__uri__'],
|
||||||
license=about['__license__'],
|
license=about['__license__'],
|
||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
|
scripts=['bin/spacy'],
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'numpy>=1.7',
|
'numpy>=1.7',
|
||||||
'murmurhash>=0.28,<0.29',
|
'murmurhash>=0.28,<0.29',
|
||||||
|
|
88
spacy/_ml.py
88
spacy/_ml.py
|
@ -5,12 +5,10 @@ from thinc.neural._classes.hash_embed import HashEmbed
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
import random
|
import random
|
||||||
import cytoolz
|
|
||||||
|
|
||||||
from thinc.neural._classes.convolution import ExtractWindow
|
from thinc.neural._classes.convolution import ExtractWindow
|
||||||
from thinc.neural._classes.static_vectors import StaticVectors
|
from thinc.neural._classes.static_vectors import StaticVectors
|
||||||
from thinc.neural._classes.batchnorm import BatchNorm
|
from thinc.neural._classes.batchnorm import BatchNorm
|
||||||
from thinc.neural._classes.layernorm import LayerNorm as LN
|
|
||||||
from thinc.neural._classes.resnet import Residual
|
from thinc.neural._classes.resnet import Residual
|
||||||
from thinc.neural import ReLu
|
from thinc.neural import ReLu
|
||||||
from thinc.neural._classes.selu import SELU
|
from thinc.neural._classes.selu import SELU
|
||||||
|
@ -21,7 +19,7 @@ from thinc.api import FeatureExtracter, with_getitem
|
||||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
|
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
|
||||||
from thinc.neural._classes.attention import ParametricAttention
|
from thinc.neural._classes.attention import ParametricAttention
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.api import uniqued, wrap, flatten_add_lengths
|
from thinc.api import uniqued, wrap
|
||||||
|
|
||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
||||||
from .tokens.doc import Doc
|
from .tokens.doc import Doc
|
||||||
|
@ -55,27 +53,6 @@ def _logistic(X, drop=0.):
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
|
||||||
def add_tuples(X, drop=0.):
|
|
||||||
"""Give inputs of sequence pairs, where each sequence is (vals, length),
|
|
||||||
sum the values, returning a single sequence.
|
|
||||||
|
|
||||||
If input is:
|
|
||||||
((vals1, length), (vals2, length)
|
|
||||||
Output is:
|
|
||||||
(vals1+vals2, length)
|
|
||||||
|
|
||||||
vals are a single tensor for the whole batch.
|
|
||||||
"""
|
|
||||||
(vals1, length1), (vals2, length2) = X
|
|
||||||
assert length1 == length2
|
|
||||||
|
|
||||||
def add_tuples_bwd(dY, sgd=None):
|
|
||||||
return (dY, dY)
|
|
||||||
|
|
||||||
return (vals1+vals2, length), add_tuples_bwd
|
|
||||||
|
|
||||||
|
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
@ -84,7 +61,6 @@ def _zero_init(model):
|
||||||
model.W.fill(0.)
|
model.W.fill(0.)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _preprocess_doc(docs, drop=0.):
|
def _preprocess_doc(docs, drop=0.):
|
||||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||||
|
@ -96,6 +72,7 @@ def _preprocess_doc(docs, drop=0.):
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _init_for_precomputed(W, ops):
|
def _init_for_precomputed(W, ops):
|
||||||
if (W**2).sum() != 0.:
|
if (W**2).sum() != 0.:
|
||||||
return
|
return
|
||||||
|
@ -103,7 +80,6 @@ def _init_for_precomputed(W, ops):
|
||||||
ops.xavier_uniform_init(reshaped)
|
ops.xavier_uniform_init(reshaped)
|
||||||
W[:] = reshaped.reshape(W.shape)
|
W[:] = reshaped.reshape(W.shape)
|
||||||
|
|
||||||
|
|
||||||
@describe.on_data(_set_dimensions_if_needed)
|
@describe.on_data(_set_dimensions_if_needed)
|
||||||
@describe.attributes(
|
@describe.attributes(
|
||||||
nI=Dimension("Input size"),
|
nI=Dimension("Input size"),
|
||||||
|
@ -209,7 +185,7 @@ class PrecomputableMaxouts(Model):
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, preprocess=None):
|
def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||||
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
|
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
|
||||||
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
|
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
|
||||||
|
@ -220,9 +196,9 @@ def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
tok2vec = (
|
tok2vec = (
|
||||||
with_flatten(
|
with_flatten(
|
||||||
asarray(Model.ops, dtype='uint64')
|
asarray(Model.ops, dtype='uint64')
|
||||||
>> uniqued(embed, column=5)
|
>> embed
|
||||||
>> LN(Maxout(width, width*4, pieces=3))
|
>> Maxout(width, width*4, pieces=3)
|
||||||
>> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
|
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
|
||||||
|
@ -321,7 +297,7 @@ def zero_init(model):
|
||||||
|
|
||||||
|
|
||||||
def doc2feats(cols=None):
|
def doc2feats(cols=None):
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||||
def forward(docs, drop=0.):
|
def forward(docs, drop=0.):
|
||||||
feats = []
|
feats = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
@ -347,36 +323,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||||
return vectors, backward
|
return vectors, backward
|
||||||
|
|
||||||
|
|
||||||
def fine_tune(embedding, combine=None):
|
|
||||||
if combine is not None:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"fine_tune currently only supports addition. Set combine=None")
|
|
||||||
def fine_tune_fwd(docs_tokvecs, drop=0.):
|
|
||||||
docs, tokvecs = docs_tokvecs
|
|
||||||
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
|
|
||||||
|
|
||||||
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
|
|
||||||
flat_tokvecs = embedding.ops.flatten(tokvecs)
|
|
||||||
flat_vecs = embedding.ops.flatten(vecs)
|
|
||||||
output = embedding.ops.unflatten(
|
|
||||||
(model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
|
|
||||||
lengths)
|
|
||||||
|
|
||||||
def fine_tune_bwd(d_output, sgd=None):
|
|
||||||
bp_vecs(d_output, sgd=sgd)
|
|
||||||
flat_grad = model.ops.flatten(d_output)
|
|
||||||
model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
|
|
||||||
model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
|
|
||||||
sgd(model._mem.weights, model._mem.gradient, key=model.id)
|
|
||||||
return d_output
|
|
||||||
return output, fine_tune_bwd
|
|
||||||
model = wrap(fine_tune_fwd, embedding)
|
|
||||||
model.mix = model._mem.add((model.id, 'mix'), (2,))
|
|
||||||
model.mix.fill(1.)
|
|
||||||
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def flatten(seqs, drop=0.):
|
def flatten(seqs, drop=0.):
|
||||||
if isinstance(seqs[0], numpy.ndarray):
|
if isinstance(seqs[0], numpy.ndarray):
|
||||||
|
@ -423,26 +369,6 @@ def preprocess_doc(docs, drop=0.):
|
||||||
vals = ops.allocate(keys.shape[0]) + 1
|
vals = ops.allocate(keys.shape[0]) + 1
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
def getitem(i):
|
|
||||||
def getitem_fwd(X, drop=0.):
|
|
||||||
return X[i], None
|
|
||||||
return layerize(getitem_fwd)
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, token_vector_width, **cfg):
|
|
||||||
with Model.define_operators({'>>': chain, '+': add}):
|
|
||||||
# Input: (doc, tensor) tuples
|
|
||||||
private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
|
|
||||||
|
|
||||||
model = (
|
|
||||||
fine_tune(private_tok2vec)
|
|
||||||
>> with_flatten(
|
|
||||||
Maxout(token_vector_width, token_vector_width)
|
|
||||||
>> Softmax(nr_class, token_vector_width)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
model.nI = None
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
nr_vector = cfg.get('nr_vector', 200)
|
nr_vector = cfg.get('nr_vector', 200)
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
|
||||||
__title__ = 'spacy-nightly'
|
__title__ = 'spacy-nightly'
|
||||||
__version__ = '2.0.0a7'
|
__version__ = '2.0.0a9'
|
||||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||||
__uri__ = 'https://spacy.io'
|
__uri__ = 'https://spacy.io'
|
||||||
__author__ = 'Explosion AI'
|
__author__ = 'Explosion AI'
|
||||||
|
|
|
@ -8,7 +8,7 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .link import link
|
from .link import link
|
||||||
from ..util import prints
|
from ..util import prints, get_package_path
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,7 +32,11 @@ def download(cmd, model, direct=False):
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||||
try:
|
try:
|
||||||
link(None, model_name, model, force=True)
|
# Get package path here because link uses
|
||||||
|
# pip.get_installed_distributions() to check if model is a package,
|
||||||
|
# which fails if model was just installed via subprocess
|
||||||
|
package_path = get_package_path(model_name)
|
||||||
|
link(None, model_name, model, force=True, model_path=package_path)
|
||||||
except:
|
except:
|
||||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||||
# a convenience wrapper, it's best to show a success message and
|
# a convenience wrapper, it's best to show a success message and
|
||||||
|
|
|
@ -14,7 +14,7 @@ from .. import util
|
||||||
link_name=("name of shortuct link to create", "positional", None, str),
|
link_name=("name of shortuct link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool)
|
force=("force overwriting of existing link", "flag", "f", bool)
|
||||||
)
|
)
|
||||||
def link(cmd, origin, link_name, force=False):
|
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
|
@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
|
||||||
if util.is_package(origin):
|
if util.is_package(origin):
|
||||||
model_path = util.get_package_path(origin)
|
model_path = util.get_package_path(origin)
|
||||||
else:
|
else:
|
||||||
model_path = Path(origin)
|
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
prints("The data should be located in %s" % path2str(model_path),
|
prints("The data should be located in %s" % path2str(model_path),
|
||||||
title="Can't locate model data", exits=1)
|
title="Can't locate model data", exits=1)
|
||||||
|
|
|
@ -15,10 +15,11 @@ from .. import about
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("output parent directory", "positional", None, str),
|
||||||
meta=("path to meta.json", "option", "m", str),
|
meta_path=("path to meta.json", "option", "m", str),
|
||||||
|
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
||||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||||
)
|
)
|
||||||
def package(cmd, input_dir, output_dir, meta=None, force=False):
|
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
|
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||||
"""
|
"""
|
||||||
input_path = util.ensure_path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not input_path or not input_path.exists():
|
if not input_path or not input_path.exists():
|
||||||
prints(input_path, title="Model directory not found", exits=1)
|
prints(input_path, title="Model directory not found", exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
|
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||||
template_manifest = get_template('MANIFEST.in')
|
template_manifest = get_template('MANIFEST.in')
|
||||||
template_init = get_template('xx_model_name/__init__.py')
|
template_init = get_template('xx_model_name/__init__.py')
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / 'meta.json'
|
||||||
if meta_path.is_file():
|
if not create_meta and meta_path.is_file():
|
||||||
prints(meta_path, title="Reading meta.json from file")
|
prints(meta_path, title="Reading meta.json from file")
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -91,8 +91,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
for batch in minibatch(train_docs, size=batch_sizes):
|
for batch in minibatch(train_docs, size=batch_sizes):
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(docs, golds, sgd=optimizer,
|
||||||
drop=next(dropout_rates), losses=losses,
|
drop=next(dropout_rates), losses=losses)
|
||||||
update_tensors=True)
|
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
|
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
|
|
|
@ -15,7 +15,7 @@ def depr_model_download(lang):
|
||||||
lang (unicode): Language shortcut, 'en' or 'de'.
|
lang (unicode): Language shortcut, 'en' or 'de'.
|
||||||
"""
|
"""
|
||||||
prints("The spacy.%s.download command is now deprecated. Please use "
|
prints("The spacy.%s.download command is now deprecated. Please use "
|
||||||
"python -m spacy download [model name or shortcut] instead. For "
|
"spacy download [model name or shortcut] instead. For "
|
||||||
"more info, see the documentation:" % lang,
|
"more info, see the documentation:" % lang,
|
||||||
about.__docs_models__,
|
about.__docs_models__,
|
||||||
"Downloading default '%s' model now..." % lang,
|
"Downloading default '%s' model now..." % lang,
|
||||||
|
|
|
@ -277,8 +277,7 @@ class Language(object):
|
||||||
def make_doc(self, text):
|
def make_doc(self, text):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
update_tensors=False):
|
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -311,7 +310,7 @@ class Language(object):
|
||||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||||
drop=drop, sgd=get_grads, losses=losses)
|
drop=drop, sgd=get_grads, losses=losses)
|
||||||
if update_tensors and d_tokvecses is not None:
|
if d_tokvecses is not None:
|
||||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
bp_tokvecses(d_tokvecses, sgd=sgd)
|
||||||
for key, (W, dW) in grads.items():
|
for key, (W, dW) in grads.items():
|
||||||
sgd(W, dW, key=key)
|
sgd(W, dW, key=key)
|
||||||
|
@ -382,18 +381,9 @@ class Language(object):
|
||||||
return optimizer
|
return optimizer
|
||||||
|
|
||||||
def evaluate(self, docs_golds):
|
def evaluate(self, docs_golds):
|
||||||
scorer = Scorer()
|
|
||||||
docs, golds = zip(*docs_golds)
|
docs, golds = zip(*docs_golds)
|
||||||
docs = list(docs)
|
scorer = Scorer()
|
||||||
golds = list(golds)
|
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
|
||||||
for pipe in self.pipeline:
|
|
||||||
if not hasattr(pipe, 'pipe'):
|
|
||||||
for doc in docs:
|
|
||||||
pipe(doc)
|
|
||||||
else:
|
|
||||||
docs = list(pipe.pipe(docs))
|
|
||||||
assert len(docs) == len(golds)
|
|
||||||
for doc, gold in zip(docs, golds):
|
|
||||||
scorer.score(doc, gold)
|
scorer.score(doc, gold)
|
||||||
doc.tensor = None
|
doc.tensor = None
|
||||||
return scorer
|
return scorer
|
||||||
|
|
|
@ -42,7 +42,7 @@ from .compat import json_dumps
|
||||||
|
|
||||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||||
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
|
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
|
||||||
from ._ml import build_text_classifier, build_tagger_model
|
from ._ml import build_text_classifier
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
|
||||||
|
|
||||||
|
@ -253,25 +253,23 @@ class NeuralTagger(BaseThincComponent):
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
tags = self.predict(([doc], [doc.tensor]))
|
tags = self.predict([doc.tensor])
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
docs = list(docs)
|
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [d.tensor for d in docs]
|
||||||
tag_ids = self.predict((docs, tokvecs))
|
tag_ids = self.predict(tokvecs)
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs_tokvecs):
|
def predict(self, tokvecs):
|
||||||
scores = self.model(docs_tokvecs)
|
scores = self.model(tokvecs)
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
if not isinstance(guesses, numpy.ndarray):
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
tokvecs = docs_tokvecs[1]
|
|
||||||
guesses = self.model.ops.unflatten(guesses,
|
guesses = self.model.ops.unflatten(guesses,
|
||||||
[tv.shape[0] for tv in tokvecs])
|
[tv.shape[0] for tv in tokvecs])
|
||||||
return guesses
|
return guesses
|
||||||
|
@ -296,7 +294,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
self.model.nI = tokvecs[0].shape[1]
|
self.model.nI = tokvecs[0].shape[1]
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
|
|
||||||
|
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
@ -347,7 +346,9 @@ class NeuralTagger(BaseThincComponent):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return build_tagger_model(n_tags, token_vector_width)
|
return with_flatten(
|
||||||
|
chain(Maxout(token_vector_width, token_vector_width),
|
||||||
|
Softmax(n_tags, token_vector_width)))
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
@ -431,7 +432,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
return self.cfg.setdefault('labels', {})
|
return self.cfg.get('labels', {})
|
||||||
|
|
||||||
@labels.setter
|
@labels.setter
|
||||||
def labels(self, value):
|
def labels(self, value):
|
||||||
|
@ -454,7 +455,9 @@ class NeuralLabeller(NeuralTagger):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return build_tagger_model(n_tags, token_vector_width)
|
return with_flatten(
|
||||||
|
chain(Maxout(token_vector_width, token_vector_width),
|
||||||
|
Softmax(n_tags, token_vector_width)))
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
|
|
|
@ -385,7 +385,6 @@ cdef class ArcEager(TransitionSystem):
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
return Transition(clas=0, move=MISSING, label=0)
|
|
||||||
|
|
||||||
def move_name(self, int move, attr_t label):
|
def move_name(self, int move, attr_t label):
|
||||||
label_str = self.strings[label]
|
label_str = self.strings[label]
|
||||||
|
|
|
@ -14,4 +14,8 @@ cdef class Parser:
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef readonly object cfg
|
cdef readonly object cfg
|
||||||
|
|
||||||
|
cdef void _parse_step(self, StateC* state,
|
||||||
|
const float* feat_weights,
|
||||||
|
int nr_class, int nr_feat, int nr_piece) nogil
|
||||||
|
|
||||||
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
|
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
|
||||||
|
|
|
@ -44,7 +44,7 @@ from thinc.neural.util import get_array_module
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
from .._ml import Tok2Vec, doc2feats, rebatch
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -237,7 +237,6 @@ cdef class Parser:
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()))
|
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -249,10 +248,15 @@ cdef class Parser:
|
||||||
nI=token_vector_width)
|
nI=token_vector_width)
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
|
if depth == 0:
|
||||||
|
upper = chain()
|
||||||
|
upper.is_noop = True
|
||||||
|
else:
|
||||||
upper = chain(
|
upper = chain(
|
||||||
clone(Maxout(hidden_width), (depth-1)),
|
clone(Maxout(hidden_width), (depth-1)),
|
||||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
upper.is_noop = False
|
||||||
# TODO: This is an unfortunate hack atm!
|
# TODO: This is an unfortunate hack atm!
|
||||||
# Used to set input dimensions in network.
|
# Used to set input dimensions in network.
|
||||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||||
|
@ -264,7 +268,7 @@ cdef class Parser:
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
'maxout_pieces': parser_maxout_pieces
|
'maxout_pieces': parser_maxout_pieces
|
||||||
}
|
}
|
||||||
return (tensors, lower, upper), cfg
|
return (lower, upper), cfg
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""
|
||||||
|
@ -340,10 +344,12 @@ cdef class Parser:
|
||||||
The number of threads with which to work on the buffer in parallel.
|
The number of threads with which to work on the buffer in parallel.
|
||||||
Yields (Doc): Documents, in order.
|
Yields (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
|
cdef StateClass parse_state
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
queue = []
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [doc.tensor for doc in docs]
|
tokvecs = [d.tensor for d in docs]
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs, tokvecs)
|
||||||
else:
|
else:
|
||||||
|
@ -363,11 +369,8 @@ cdef class Parser:
|
||||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
if isinstance(tokvecses, np.ndarray):
|
|
||||||
tokvecses = [tokvecses]
|
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
|
@ -391,7 +394,14 @@ cdef class Parser:
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
c_token_ids = <int*>token_ids.data
|
c_token_ids = <int*>token_ids.data
|
||||||
c_is_valid = <int*>is_valid.data
|
c_is_valid = <int*>is_valid.data
|
||||||
|
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
|
if not has_hidden:
|
||||||
|
for i in cython.parallel.prange(
|
||||||
|
next_step.size(), num_threads=6, nogil=True):
|
||||||
|
self._parse_step(next_step[i],
|
||||||
|
feat_weights, nr_class, nr_feat, nr_piece)
|
||||||
|
else:
|
||||||
for i in range(next_step.size()):
|
for i in range(next_step.size()):
|
||||||
st = next_step[i]
|
st = next_step[i]
|
||||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||||
|
@ -419,7 +429,6 @@ cdef class Parser:
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||||
cuda_stream, 0.0)
|
cuda_stream, 0.0)
|
||||||
|
@ -452,6 +461,28 @@ cdef class Parser:
|
||||||
beams.append(beam)
|
beams.append(beam)
|
||||||
return beams
|
return beams
|
||||||
|
|
||||||
|
cdef void _parse_step(self, StateC* state,
|
||||||
|
const float* feat_weights,
|
||||||
|
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||||
|
'''This only works with no hidden layers -- fast but inaccurate'''
|
||||||
|
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
||||||
|
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
||||||
|
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||||
|
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
||||||
|
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||||
|
|
||||||
|
state.set_context_tokens(token_ids, nr_feat)
|
||||||
|
sum_state_features(scores,
|
||||||
|
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
||||||
|
self.moves.set_valid(is_valid, state)
|
||||||
|
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
||||||
|
action = self.moves.c[guess]
|
||||||
|
action.do(state, action.label)
|
||||||
|
|
||||||
|
free(is_valid)
|
||||||
|
free(scores)
|
||||||
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
|
@ -460,9 +491,6 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.)
|
|
||||||
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
|
||||||
tokvecs += my_tokvecs
|
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
|
@ -512,9 +540,7 @@ cdef class Parser:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
#bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
|
||||||
return d_tokvecs
|
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -577,7 +603,7 @@ cdef class Parser:
|
||||||
return names
|
return names
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||||
_, lower, upper = self.model
|
lower, upper = self.model
|
||||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||||
lower, stream, drop=dropout)
|
lower, stream, drop=dropout)
|
||||||
return state2vec, upper
|
return state2vec, upper
|
||||||
|
@ -667,12 +693,10 @@ cdef class Parser:
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
serializers = {
|
serializers = {
|
||||||
'tok2vec_model': lambda p: p.open('wb').write(
|
|
||||||
self.model[0].to_bytes()),
|
|
||||||
'lower_model': lambda p: p.open('wb').write(
|
'lower_model': lambda p: p.open('wb').write(
|
||||||
self.model[1].to_bytes()),
|
self.model[0].to_bytes()),
|
||||||
'upper_model': lambda p: p.open('wb').write(
|
'upper_model': lambda p: p.open('wb').write(
|
||||||
self.model[2].to_bytes()),
|
self.model[1].to_bytes()),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||||
|
@ -693,29 +717,24 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
with (path / 'tok2vec_model').open('rb') as file_:
|
|
||||||
bytes_data = file_.read()
|
|
||||||
self.model[0].from_bytes(bytes_data)
|
|
||||||
with (path / 'lower_model').open('rb') as file_:
|
with (path / 'lower_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[1].from_bytes(bytes_data)
|
self.model[0].from_bytes(bytes_data)
|
||||||
with (path / 'upper_model').open('rb') as file_:
|
with (path / 'upper_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[2].from_bytes(bytes_data)
|
self.model[1].from_bytes(bytes_data)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('tok2vec_model', lambda: self.model[0].to_bytes()),
|
('lower_model', lambda: self.model[0].to_bytes()),
|
||||||
('lower_model', lambda: self.model[1].to_bytes()),
|
('upper_model', lambda: self.model[1].to_bytes()),
|
||||||
('upper_model', lambda: self.model[2].to_bytes()),
|
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||||
('cfg', lambda: ujson.dumps(self.cfg))
|
('cfg', lambda: ujson.dumps(self.cfg))
|
||||||
))
|
))
|
||||||
if 'model' in exclude:
|
if 'model' in exclude:
|
||||||
exclude['tok2vec_model'] = True
|
|
||||||
exclude['lower_model'] = True
|
exclude['lower_model'] = True
|
||||||
exclude['upper_model'] = True
|
exclude['upper_model'] = True
|
||||||
exclude.pop('model')
|
exclude.pop('model')
|
||||||
|
@ -726,7 +745,6 @@ cdef class Parser:
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
('tok2vec_model', lambda b: None),
|
|
||||||
('lower_model', lambda b: None),
|
('lower_model', lambda b: None),
|
||||||
('upper_model', lambda b: None)
|
('upper_model', lambda b: None)
|
||||||
))
|
))
|
||||||
|
@ -736,12 +754,10 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves)
|
self.model, cfg = self.Model(self.moves.n_moves)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
if 'tok2vec_model' in msg:
|
|
||||||
self.model[0].from_bytes(msg['tok2vec_model'])
|
|
||||||
if 'lower_model' in msg:
|
if 'lower_model' in msg:
|
||||||
self.model[1].from_bytes(msg['lower_model'])
|
self.model[0].from_bytes(msg['lower_model'])
|
||||||
if 'upper_model' in msg:
|
if 'upper_model' in msg:
|
||||||
self.model[2].from_bytes(msg['upper_model'])
|
self.model[1].from_bytes(msg['upper_model'])
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -107,8 +107,6 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def is_valid(self, StateClass stcls, move_name):
|
def is_valid(self, StateClass stcls, move_name):
|
||||||
action = self.lookup_transition(move_name)
|
action = self.lookup_transition(move_name)
|
||||||
if action.move == 0:
|
|
||||||
return False
|
|
||||||
return action.is_valid(stcls.c, action.label)
|
return action.is_valid(stcls.c, action.label)
|
||||||
|
|
||||||
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
||||||
|
|
|
@ -113,7 +113,7 @@ def load_model(name, **overrides):
|
||||||
def load_model_from_link(name, **overrides):
|
def load_model_from_link(name, **overrides):
|
||||||
"""Load a model from a shortcut link, or directory in spaCy data path."""
|
"""Load a model from a shortcut link, or directory in spaCy data path."""
|
||||||
init_file = get_data_path() / name / '__init__.py'
|
init_file = get_data_path() / name / '__init__.py'
|
||||||
spec = importlib.util.spec_from_file_location(name, init_file)
|
spec = importlib.util.spec_from_file_location(name, str(init_file))
|
||||||
try:
|
try:
|
||||||
cls = importlib.util.module_from_spec(spec)
|
cls = importlib.util.module_from_spec(spec)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
|
|
@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
|
||||||
label - [string] aside title (optional or false for no label)
|
label - [string] aside title (optional or false for no label)
|
||||||
language - [string] language for syntax highlighting (default: "python")
|
language - [string] language for syntax highlighting (default: "python")
|
||||||
supports basic relevant languages available for PrismJS
|
supports basic relevant languages available for PrismJS
|
||||||
icon - [string] icon to display next to code block, mostly used for old/new
|
prompt - [string] prompt or icon to display next to code block (mostly used for old/new)
|
||||||
height - [integer] optional height to clip code block to
|
height - [integer] optional height to clip code block to
|
||||||
|
|
||||||
mixin code(label, language, icon, height)
|
mixin code(label, language, prompt, height)
|
||||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
|
||||||
if label
|
if label
|
||||||
h4.u-text-label.u-text-label--dark=label
|
h4.u-text-label.u-text-label--dark=label
|
||||||
|
- var icon = (prompt == 'accept' || prompt == 'reject')
|
||||||
if icon
|
if icon
|
||||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||||
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
|
||||||
+icon(icon, 18)
|
+icon(icon, 18)
|
||||||
|
|
||||||
code.c-code-block__content
|
code.c-code-block__content(data-prompt=icon ? null : prompt)
|
||||||
block
|
block
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,13 @@
|
||||||
font: normal normal 1.1rem/#{2} $font-code
|
font: normal normal 1.1rem/#{2} $font-code
|
||||||
padding: 1em 2em
|
padding: 1em 2em
|
||||||
|
|
||||||
|
&[data-prompt]:before,
|
||||||
|
content: attr(data-prompt)
|
||||||
|
margin-right: 0.65em
|
||||||
|
display: inline-block
|
||||||
|
vertical-align: middle
|
||||||
|
opacity: 0.5
|
||||||
|
|
||||||
|
|
||||||
//- Inline code
|
//- Inline code
|
||||||
|
|
||||||
|
|
|
@ -5,16 +5,7 @@ include ../../_includes/_mixins
|
||||||
p
|
p
|
||||||
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
| As of v1.7.0, spaCy comes with new command line helpers to download and
|
||||||
| link models and show useful debugging information. For a list of available
|
| link models and show useful debugging information. For a list of available
|
||||||
| commands, type #[code python -m spacy]. To make the command even more
|
| commands, type #[code spacy --help].
|
||||||
| convenient, we recommend
|
|
||||||
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
|
|
||||||
| mapping #[code python -m spacy] to #[code spacy].
|
|
||||||
|
|
||||||
+aside("Why python -m?")
|
|
||||||
| The problem with a global entry point is that it's resolved by looking up
|
|
||||||
| entries in your #[code PATH] environment variable. This can give you
|
|
||||||
| unexpected results, like executing the wrong spaCy installation.
|
|
||||||
| #[code python -m] prevents fallbacks to system modules.
|
|
||||||
|
|
||||||
+infobox("⚠️ Deprecation note")
|
+infobox("⚠️ Deprecation note")
|
||||||
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
| As of spaCy 2.0, the #[code model] command to initialise a model data
|
||||||
|
@ -33,8 +24,8 @@ p
|
||||||
| Direct downloads don't perform any compatibility checks and require the
|
| Direct downloads don't perform any compatibility checks and require the
|
||||||
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy download [model] [--direct]
|
spacy download [model] [--direct]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -80,8 +71,8 @@ p
|
||||||
| or use the #[+api("cli#package") #[code package]] command to create a
|
| or use the #[+api("cli#package") #[code package]] command to create a
|
||||||
| model package.
|
| model package.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy link [origin] [link_name] [--force]
|
spacy link [origin] [link_name] [--force]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -112,8 +103,8 @@ p
|
||||||
| markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
|
| markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy info [--markdown]
|
spacy info [--markdown]
|
||||||
python -m spacy info [model] [--markdown]
|
spacy info [model] [--markdown]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -139,8 +130,8 @@ p
|
||||||
| functions. The right converter is chosen based on the file extension of
|
| functions. The right converter is chosen based on the file extension of
|
||||||
| the input file. Currently only supports #[code .conllu].
|
| the input file. Currently only supports #[code .conllu].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -174,8 +165,8 @@ p
|
||||||
| Train a model. Expects data in spaCy's
|
| Train a model. Expects data in spaCy's
|
||||||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -345,8 +336,8 @@ p
|
||||||
| sure you're always using the latest versions. This means you need to be
|
| sure you're always using the latest versions. This means you need to be
|
||||||
| connected to the internet to use this command.
|
| connected to the internet to use this command.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy package [input_dir] [output_dir] [--meta] [--force]
|
spacy package [input_dir] [output_dir] [--meta] [--force]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -360,10 +351,17 @@ p
|
||||||
+cell Directory to create package folder in.
|
+cell Directory to create package folder in.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code meta]
|
+cell #[code --meta-path], #[code -m]
|
||||||
+cell option
|
+cell option
|
||||||
+cell Path to meta.json file (optional).
|
+cell Path to meta.json file (optional).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --create-meta], #[code -c]
|
||||||
|
+cell flag
|
||||||
|
+cell
|
||||||
|
| Create a meta.json file on the command line, even if one already
|
||||||
|
| exists in the directory.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --force], #[code -f]
|
+cell #[code --force], #[code -f]
|
||||||
+cell flag
|
+cell flag
|
||||||
|
|
|
@ -8,9 +8,9 @@ p
|
||||||
|
|
||||||
|
|
||||||
+aside-code("Download language models", "bash").
|
+aside-code("Download language models", "bash").
|
||||||
python -m spacy download en
|
spacy download en
|
||||||
python -m spacy download de
|
spacy download de
|
||||||
python -m spacy download fr
|
spacy download fr
|
||||||
|
|
||||||
+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
|
+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -205,7 +205,7 @@ p
|
||||||
|
|
||||||
+infobox("Why lazy-loading?")
|
+infobox("Why lazy-loading?")
|
||||||
| Some languages contain large volumes of custom data, like lemmatizer
|
| Some languages contain large volumes of custom data, like lemmatizer
|
||||||
| loopup tables, or complex regular expression that are expensive to
|
| lookup tables, or complex regular expressions that are expensive to
|
||||||
| compute. As of spaCy v2.0, #[code Language] classes are not imported on
|
| compute. As of spaCy v2.0, #[code Language] classes are not imported on
|
||||||
| initialisation and are only loaded when you import them directly, or load
|
| initialisation and are only loaded when you import them directly, or load
|
||||||
| a model that requires a language to be loaded. To lazy-load languages in
|
| a model that requires a language to be loaded. To lazy-load languages in
|
||||||
|
@ -789,4 +789,4 @@ p
|
||||||
| model using spaCy's #[+api("cli#train") #[code train]] command:
|
| model using spaCy's #[+api("cli#train") #[code train]] command:
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
|
||||||
|
|
|
@ -32,10 +32,10 @@ p
|
||||||
+qs({package: 'source'}) pip install -r requirements.txt
|
+qs({package: 'source'}) pip install -r requirements.txt
|
||||||
+qs({package: 'source'}) pip install -e .
|
+qs({package: 'source'}) pip install -e .
|
||||||
|
|
||||||
+qs({model: 'en'}) python -m spacy download en
|
+qs({model: 'en'}) spacy download en
|
||||||
+qs({model: 'de'}) python -m spacy download de
|
+qs({model: 'de'}) spacy download de
|
||||||
+qs({model: 'fr'}) python -m spacy download fr
|
+qs({model: 'fr'}) spacy download fr
|
||||||
+qs({model: 'es'}) python -m spacy download es
|
+qs({model: 'es'}) spacy download es
|
||||||
|
|
||||||
+h(2, "installation") Installation instructions
|
+h(2, "installation") Installation instructions
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ p Using pip, spaCy releases are currently only available as source packages.
|
||||||
| and available models, see the #[+a("/docs/usage/models") docs on models].
|
| and available models, see the #[+a("/docs/usage/models") docs on models].
|
||||||
|
|
||||||
+code.o-no-block.
|
+code.o-no-block.
|
||||||
python -m spacy download en
|
spacy download en
|
||||||
|
|
||||||
>>> import spacy
|
>>> import spacy
|
||||||
>>> nlp = spacy.load('en')
|
>>> nlp = spacy.load('en')
|
||||||
|
@ -312,7 +312,9 @@ p
|
||||||
| This error may occur when running the #[code spacy] command from the
|
| This error may occur when running the #[code spacy] command from the
|
||||||
| command line. spaCy does not currently add an entry to your #[code PATH]
|
| command line. spaCy does not currently add an entry to your #[code PATH]
|
||||||
| environment variable, as this can lead to unexpected results, especially
|
| environment variable, as this can lead to unexpected results, especially
|
||||||
| when using #[code virtualenv]. Run the command with #[code python -m],
|
| when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
|
||||||
|
| maps #[code spacy] to #[code python -m spacy]. If this is not working as
|
||||||
|
| expected, run the command with #[code python -m] yourself –
|
||||||
| for example #[code python -m spacy download en]. For more info on this,
|
| for example #[code python -m spacy download en]. For more info on this,
|
||||||
| see #[+api("cli#download") download].
|
| see #[+api("cli#download") download].
|
||||||
|
|
||||||
|
|
|
@ -10,8 +10,8 @@ p
|
||||||
+h(2, "models") Install models and process text
|
+h(2, "models") Install models and process text
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy download en
|
spacy download en
|
||||||
python -m spacy download de
|
spacy download de
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import spacy
|
import spacy
|
||||||
|
|
|
@ -20,7 +20,7 @@ p
|
||||||
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
|
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
|
||||||
for models, lang in MODELS
|
for models, lang in MODELS
|
||||||
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
|
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
|
||||||
+qs({lang: lang}) python -m spacy download #{lang}
|
+qs({lang: lang}) spacy download #{lang}
|
||||||
+qs({lang: lang}, "divider")
|
+qs({lang: lang}, "divider")
|
||||||
+qs({lang: lang, load: "module"}, "python") import #{package.id}
|
+qs({lang: lang, load: "module"}, "python") import #{package.id}
|
||||||
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
|
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
|
||||||
|
@ -52,16 +52,16 @@ p
|
||||||
| #[+api("cli#download") #[code download]] command. It takes care of
|
| #[+api("cli#download") #[code download]] command. It takes care of
|
||||||
| finding the best-matching model compatible with your spaCy installation.
|
| finding the best-matching model compatible with your spaCy installation.
|
||||||
|
|
||||||
- var models = Object.keys(MODELS).map(function(lang) { return "python -m spacy download " + lang })
|
- var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang })
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
# out-of-the-box: download best-matching default model
|
# out-of-the-box: download best-matching default model
|
||||||
#{Object.keys(MODELS).map(function(l) {return "python -m spacy download " + l}).join('\n')}
|
#{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')}
|
||||||
|
|
||||||
# download best-matching version of specific model for your spaCy installation
|
# download best-matching version of specific model for your spaCy installation
|
||||||
python -m spacy download en_core_web_md
|
spacy download en_core_web_md
|
||||||
|
|
||||||
# download exact model version (doesn't create shortcut link)
|
# download exact model version (doesn't create shortcut link)
|
||||||
python -m spacy download en_core_web_md-1.2.0 --direct
|
spacy download en_core_web_md-1.2.0 --direct
|
||||||
|
|
||||||
p
|
p
|
||||||
| The download command will #[+a("#download-pip") install the model] via
|
| The download command will #[+a("#download-pip") install the model] via
|
||||||
|
@ -72,7 +72,7 @@ p
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
pip install spacy
|
pip install spacy
|
||||||
python -m spacy download en
|
spacy download en
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -179,8 +179,8 @@ p
|
||||||
| model names or IDs. And your system already comes with a native solution
|
| model names or IDs. And your system already comes with a native solution
|
||||||
| to mapping unicode aliases to file paths: symbolic links.
|
| to mapping unicode aliases to file paths: symbolic links.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash", "$").
|
||||||
python -m spacy link [package name or path] [shortcut] [--force]
|
spacy link [package name or path] [shortcut] [--force]
|
||||||
|
|
||||||
p
|
p
|
||||||
| The first argument is the #[strong package name] (if the model was
|
| The first argument is the #[strong package name] (if the model was
|
||||||
|
|
|
@ -85,7 +85,7 @@ p
|
||||||
}
|
}
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy package /home/me/data/en_example_model /home/me/my_models
|
spacy package /home/me/data/en_example_model /home/me/my_models
|
||||||
|
|
||||||
p This command will create a model package directory that should look like this:
|
p This command will create a model package directory that should look like this:
|
||||||
|
|
||||||
|
|
|
@ -102,7 +102,7 @@ p
|
||||||
| CLI command to create all required files and directories.
|
| CLI command to create all required files and directories.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy package /home/me/data/en_technology /home/me/my_models
|
spacy package /home/me/data/en_technology /home/me/my_models
|
||||||
|
|
||||||
p
|
p
|
||||||
| To build the package and create a #[code .tar.gz] archive, run
|
| To build the package and create a #[code .tar.gz] archive, run
|
||||||
|
|
|
@ -238,11 +238,11 @@ p
|
||||||
+h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
|
+h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
|
||||||
|
|
||||||
+aside-code("Example", "bash").
|
+aside-code("Example", "bash").
|
||||||
python -m spacy download en # default English model
|
spacy download en # default English model
|
||||||
python -m spacy download de # default German model
|
spacy download de # default German model
|
||||||
python -m spacy download fr # default French model
|
spacy download fr # default French model
|
||||||
python -m spacy download es # default Spanish model
|
spacy download es # default Spanish model
|
||||||
python -m spacy download xx_ent_wiki_sm # multi-language NER
|
spacy download xx_ent_wiki_sm # multi-language NER
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy v2.0 comes with new and improved neural network models for English,
|
| spaCy v2.0 comes with new and improved neural network models for English,
|
||||||
|
|
|
@ -259,7 +259,7 @@ p
|
||||||
| notebook, the visualizations will be included as HTML.
|
| notebook, the visualizations will be included as HTML.
|
||||||
|
|
||||||
+code("Jupyter Example").
|
+code("Jupyter Example").
|
||||||
# don't forget to install a model, e.g.: python -m spacy download en
|
# don't forget to install a model, e.g.: spacy download en
|
||||||
import spacy
|
import spacy
|
||||||
from spacy import displacy
|
from spacy import displacy
|
||||||
|
|
||||||
|
|
|
@ -68,7 +68,7 @@ include _includes/_mixins
|
||||||
+grid
|
+grid
|
||||||
+grid-col("two-thirds")
|
+grid-col("two-thirds")
|
||||||
+terminal("lightning_tour.py").
|
+terminal("lightning_tour.py").
|
||||||
# Install: pip install spacy && python -m spacy download en
|
# Install: pip install spacy && spacy download en
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
# Load English tokenizer, tagger, parser, NER and word vectors
|
# Load English tokenizer, tagger, parser, NER and word vectors
|
||||||
|
|
Loading…
Reference in New Issue
Block a user