mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/develop' into indonesian
This commit is contained in:
		
						commit
						7ae45bffcf
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							|  | @ -40,7 +40,6 @@ venv/ | |||
| 
 | ||||
| # Distribution / packaging | ||||
| env/ | ||||
| bin/ | ||||
| build/ | ||||
| develop-eggs/ | ||||
| dist/ | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| recursive-include include *.h | ||||
| include LICENSE | ||||
| include README.rst | ||||
| include bin/spacy | ||||
|  |  | |||
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -187,6 +187,7 @@ def setup_package(): | |||
|             url=about['__uri__'], | ||||
|             license=about['__license__'], | ||||
|             ext_modules=ext_modules, | ||||
|             scripts=['bin/spacy'], | ||||
|             install_requires=[ | ||||
|                 'numpy>=1.7', | ||||
|                 'murmurhash>=0.28,<0.29', | ||||
|  |  | |||
							
								
								
									
										96
									
								
								spacy/_ml.py
									
									
									
									
									
								
							
							
						
						
									
										96
									
								
								spacy/_ml.py
									
									
									
									
									
								
							|  | @ -5,12 +5,10 @@ from thinc.neural._classes.hash_embed import HashEmbed | |||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.util import get_array_module | ||||
| import random | ||||
| import cytoolz | ||||
| 
 | ||||
| from thinc.neural._classes.convolution import ExtractWindow | ||||
| from thinc.neural._classes.static_vectors import StaticVectors | ||||
| from thinc.neural._classes.batchnorm import BatchNorm | ||||
| from thinc.neural._classes.layernorm import LayerNorm as LN | ||||
| from thinc.neural._classes.resnet import Residual | ||||
| from thinc.neural import ReLu | ||||
| from thinc.neural._classes.selu import SELU | ||||
|  | @ -21,7 +19,7 @@ from thinc.api import FeatureExtracter, with_getitem | |||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool | ||||
| from thinc.neural._classes.attention import ParametricAttention | ||||
| from thinc.linear.linear import LinearModel | ||||
| from thinc.api import uniqued, wrap, flatten_add_lengths | ||||
| from thinc.api import uniqued, wrap | ||||
| 
 | ||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP | ||||
| from .tokens.doc import Doc | ||||
|  | @ -55,27 +53,6 @@ def _logistic(X, drop=0.): | |||
|     return Y, logistic_bwd | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def add_tuples(X, drop=0.): | ||||
|     """Give inputs of sequence pairs, where each sequence is (vals, length), | ||||
|     sum the values, returning a single sequence. | ||||
| 
 | ||||
|     If input is: | ||||
|     ((vals1, length), (vals2, length) | ||||
|     Output is: | ||||
|     (vals1+vals2, length) | ||||
| 
 | ||||
|     vals are a single tensor for the whole batch. | ||||
|     """ | ||||
|     (vals1, length1), (vals2, length2) = X | ||||
|     assert length1 == length2 | ||||
| 
 | ||||
|     def add_tuples_bwd(dY, sgd=None): | ||||
|         return (dY, dY) | ||||
| 
 | ||||
|     return (vals1+vals2, length), add_tuples_bwd | ||||
| 
 | ||||
| 
 | ||||
| def _zero_init(model): | ||||
|     def _zero_init_impl(self, X, y): | ||||
|         self.W.fill(0) | ||||
|  | @ -84,7 +61,6 @@ def _zero_init(model): | |||
|         model.W.fill(0.) | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def _preprocess_doc(docs, drop=0.): | ||||
|     keys = [doc.to_array([LOWER]) for doc in docs] | ||||
|  | @ -96,6 +72,7 @@ def _preprocess_doc(docs, drop=0.): | |||
|     return (keys, vals, lengths), None | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def _init_for_precomputed(W, ops): | ||||
|     if (W**2).sum() != 0.: | ||||
|         return | ||||
|  | @ -103,7 +80,6 @@ def _init_for_precomputed(W, ops): | |||
|     ops.xavier_uniform_init(reshaped) | ||||
|     W[:] = reshaped.reshape(W.shape) | ||||
| 
 | ||||
| 
 | ||||
| @describe.on_data(_set_dimensions_if_needed) | ||||
| @describe.attributes( | ||||
|     nI=Dimension("Input size"), | ||||
|  | @ -209,9 +185,9 @@ class PrecomputableMaxouts(Model): | |||
| 
 | ||||
| 
 | ||||
| def Tok2Vec(width, embed_size, preprocess=None): | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] | ||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): | ||||
|         norm = get_col(cols.index(NORM))     >> HashEmbed(width, embed_size, name='embed_lower') | ||||
|         norm = get_col(cols.index(NORM))   >> HashEmbed(width, embed_size, name='embed_lower') | ||||
|         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') | ||||
|         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') | ||||
|         shape = get_col(cols.index(SHAPE))   >> HashEmbed(width, embed_size//2, name='embed_shape') | ||||
|  | @ -220,13 +196,13 @@ def Tok2Vec(width, embed_size, preprocess=None): | |||
|         tok2vec = ( | ||||
|             with_flatten( | ||||
|                 asarray(Model.ops, dtype='uint64') | ||||
|                 >> uniqued(embed, column=5) | ||||
|                 >> LN(Maxout(width, width*4, pieces=3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) | ||||
|                 >> embed | ||||
|                 >> Maxout(width, width*4, pieces=3) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), | ||||
|                 pad=4) | ||||
|             pad=4) | ||||
|         ) | ||||
|         if preprocess not in (False, None): | ||||
|             tok2vec = preprocess >> tok2vec | ||||
|  | @ -321,7 +297,7 @@ def zero_init(model): | |||
| 
 | ||||
| 
 | ||||
| def doc2feats(cols=None): | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] | ||||
|     def forward(docs, drop=0.): | ||||
|         feats = [] | ||||
|         for doc in docs: | ||||
|  | @ -347,36 +323,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): | |||
|     return vectors, backward | ||||
| 
 | ||||
| 
 | ||||
| def fine_tune(embedding, combine=None): | ||||
|     if combine is not None: | ||||
|         raise NotImplementedError( | ||||
|             "fine_tune currently only supports addition. Set combine=None") | ||||
|     def fine_tune_fwd(docs_tokvecs, drop=0.): | ||||
|         docs, tokvecs = docs_tokvecs | ||||
|         lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') | ||||
| 
 | ||||
|         vecs, bp_vecs = embedding.begin_update(docs, drop=drop) | ||||
|         flat_tokvecs = embedding.ops.flatten(tokvecs) | ||||
|         flat_vecs = embedding.ops.flatten(vecs) | ||||
|         output = embedding.ops.unflatten( | ||||
|                    (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), | ||||
|                     lengths) | ||||
| 
 | ||||
|         def fine_tune_bwd(d_output, sgd=None): | ||||
|             bp_vecs(d_output, sgd=sgd) | ||||
|             flat_grad = model.ops.flatten(d_output) | ||||
|             model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() | ||||
|             model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() | ||||
|             sgd(model._mem.weights, model._mem.gradient, key=model.id) | ||||
|             return d_output | ||||
|         return output, fine_tune_bwd | ||||
|     model = wrap(fine_tune_fwd, embedding) | ||||
|     model.mix = model._mem.add((model.id, 'mix'), (2,)) | ||||
|     model.mix.fill(1.) | ||||
|     model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def flatten(seqs, drop=0.): | ||||
|     if isinstance(seqs[0], numpy.ndarray): | ||||
|  | @ -423,26 +369,6 @@ def preprocess_doc(docs, drop=0.): | |||
|     vals = ops.allocate(keys.shape[0]) + 1 | ||||
|     return (keys, vals, lengths), None | ||||
| 
 | ||||
| def getitem(i): | ||||
|     def getitem_fwd(X, drop=0.): | ||||
|         return X[i], None | ||||
|     return layerize(getitem_fwd) | ||||
| 
 | ||||
| def build_tagger_model(nr_class, token_vector_width, **cfg): | ||||
|     with Model.define_operators({'>>': chain, '+': add}): | ||||
|         # Input: (doc, tensor) tuples | ||||
|         private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats()) | ||||
| 
 | ||||
|         model = ( | ||||
|             fine_tune(private_tok2vec) | ||||
|             >> with_flatten( | ||||
|                 Maxout(token_vector_width, token_vector_width) | ||||
|                 >> Softmax(nr_class, token_vector_width) | ||||
|             ) | ||||
|         ) | ||||
|     model.nI = None | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| def build_text_classifier(nr_class, width=64, **cfg): | ||||
|     nr_vector = cfg.get('nr_vector', 200) | ||||
|  | @ -457,7 +383,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             >> _flatten_add_lengths | ||||
|             >> with_getitem(0, | ||||
|                 uniqued( | ||||
|                   (embed_lower | embed_prefix | embed_suffix | embed_shape) | ||||
|                   (embed_lower | embed_prefix | embed_suffix | embed_shape)  | ||||
|                   >> Maxout(width, width+(width//2)*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) | ||||
|  | @ -478,7 +404,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) | ||||
|             >> logistic | ||||
|         ) | ||||
| 
 | ||||
|   | ||||
|     model.lsuv = False | ||||
|     return model | ||||
| 
 | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ | |||
| # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | ||||
| 
 | ||||
| __title__ = 'spacy-nightly' | ||||
| __version__ = '2.0.0a7' | ||||
| __version__ = '2.0.0a9' | ||||
| __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | ||||
| __uri__ = 'https://spacy.io' | ||||
| __author__ = 'Explosion AI' | ||||
|  |  | |||
|  | @ -8,7 +8,7 @@ import subprocess | |||
| import sys | ||||
| 
 | ||||
| from .link import link | ||||
| from ..util import prints | ||||
| from ..util import prints, get_package_path | ||||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
|  | @ -32,7 +32,11 @@ def download(cmd, model, direct=False): | |||
|         version = get_version(model_name, compatibility) | ||||
|         download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) | ||||
|         try: | ||||
|             link(None, model_name, model, force=True) | ||||
|             # Get package path here because link uses | ||||
|             # pip.get_installed_distributions() to check if model is a package, | ||||
|             # which fails if model was just installed via subprocess | ||||
|             package_path = get_package_path(model_name) | ||||
|             link(None, model_name, model, force=True, model_path=package_path) | ||||
|         except: | ||||
|             # Dirty, but since spacy.download and the auto-linking is mostly | ||||
|             # a convenience wrapper, it's best to show a success message and | ||||
|  |  | |||
|  | @ -14,7 +14,7 @@ from .. import util | |||
|     link_name=("name of shortuct link to create", "positional", None, str), | ||||
|     force=("force overwriting of existing link", "flag", "f", bool) | ||||
| ) | ||||
| def link(cmd, origin, link_name, force=False): | ||||
| def link(cmd, origin, link_name, force=False, model_path=None): | ||||
|     """ | ||||
|     Create a symlink for models within the spacy/data directory. Accepts | ||||
|     either the name of a pip package, or the local path to the model data | ||||
|  | @ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False): | |||
|     if util.is_package(origin): | ||||
|         model_path = util.get_package_path(origin) | ||||
|     else: | ||||
|         model_path = Path(origin) | ||||
|         model_path = Path(origin) if model_path is None else Path(model_path) | ||||
|     if not model_path.exists(): | ||||
|         prints("The data should be located in %s" % path2str(model_path), | ||||
|                title="Can't locate model data", exits=1) | ||||
|  |  | |||
|  | @ -15,10 +15,11 @@ from .. import about | |||
| @plac.annotations( | ||||
|     input_dir=("directory with model data", "positional", None, str), | ||||
|     output_dir=("output parent directory", "positional", None, str), | ||||
|     meta=("path to meta.json", "option", "m", str), | ||||
|     meta_path=("path to meta.json", "option", "m", str), | ||||
|     create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), | ||||
|     force=("force overwriting of existing folder in output directory", "flag", "f", bool) | ||||
| ) | ||||
| def package(cmd, input_dir, output_dir, meta=None, force=False): | ||||
| def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): | ||||
|     """ | ||||
|     Generate Python package for model data, including meta and required | ||||
|     installation files. A new directory will be created in the specified | ||||
|  | @ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): | |||
|     """ | ||||
|     input_path = util.ensure_path(input_dir) | ||||
|     output_path = util.ensure_path(output_dir) | ||||
|     meta_path = util.ensure_path(meta) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     if not input_path or not input_path.exists(): | ||||
|         prints(input_path, title="Model directory not found", exits=1) | ||||
|     if not output_path or not output_path.exists(): | ||||
|  | @ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): | |||
|     template_manifest = get_template('MANIFEST.in') | ||||
|     template_init = get_template('xx_model_name/__init__.py') | ||||
|     meta_path = meta_path or input_path / 'meta.json' | ||||
|     if meta_path.is_file(): | ||||
|     if not create_meta and meta_path.is_file(): | ||||
|         prints(meta_path, title="Reading meta.json from file") | ||||
|         meta = util.read_json(meta_path) | ||||
|     else: | ||||
|  |  | |||
|  | @ -91,8 +91,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|                 for batch in minibatch(train_docs, size=batch_sizes): | ||||
|                     docs, golds = zip(*batch) | ||||
|                     nlp.update(docs, golds, sgd=optimizer, | ||||
|                                drop=next(dropout_rates), losses=losses, | ||||
|                                update_tensors=True) | ||||
|                                drop=next(dropout_rates), losses=losses) | ||||
|                     pbar.update(sum(len(doc) for doc in docs)) | ||||
| 
 | ||||
|             with nlp.use_params(optimizer.averages): | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ def depr_model_download(lang): | |||
|     lang (unicode): Language shortcut, 'en' or 'de'. | ||||
|     """ | ||||
|     prints("The spacy.%s.download command is now deprecated. Please use " | ||||
|            "python -m spacy download [model name or shortcut] instead. For " | ||||
|            "spacy download [model name or shortcut] instead. For " | ||||
|            "more info, see the documentation:" % lang, | ||||
|            about.__docs_models__, | ||||
|            "Downloading default '%s' model now..." % lang, | ||||
|  |  | |||
|  | @ -277,8 +277,7 @@ class Language(object): | |||
|     def make_doc(self, text): | ||||
|         return self.tokenizer(text) | ||||
| 
 | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None, | ||||
|             update_tensors=False): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         """Update the models in the pipeline. | ||||
| 
 | ||||
|         docs (iterable): A batch of `Doc` objects. | ||||
|  | @ -311,7 +310,7 @@ class Language(object): | |||
|             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) | ||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, | ||||
|                                       drop=drop, sgd=get_grads, losses=losses) | ||||
|             if update_tensors and d_tokvecses is not None: | ||||
|             if d_tokvecses is not None: | ||||
|                 bp_tokvecses(d_tokvecses, sgd=sgd) | ||||
|         for key, (W, dW) in grads.items(): | ||||
|             sgd(W, dW, key=key) | ||||
|  | @ -382,18 +381,9 @@ class Language(object): | |||
|         return optimizer | ||||
| 
 | ||||
|     def evaluate(self, docs_golds): | ||||
|         scorer = Scorer() | ||||
|         docs, golds = zip(*docs_golds) | ||||
|         docs = list(docs) | ||||
|         golds = list(golds) | ||||
|         for pipe in self.pipeline: | ||||
|             if not hasattr(pipe, 'pipe'): | ||||
|                 for doc in docs: | ||||
|                     pipe(doc) | ||||
|             else: | ||||
|                 docs = list(pipe.pipe(docs)) | ||||
|         assert len(docs) == len(golds) | ||||
|         for doc, gold in zip(docs, golds): | ||||
|         scorer = Scorer() | ||||
|         for doc, gold in zip(self.pipe(docs, batch_size=32), golds): | ||||
|             scorer.score(doc, gold) | ||||
|             doc.tensor = None | ||||
|         return scorer | ||||
|  |  | |||
|  | @ -42,7 +42,7 @@ from .compat import json_dumps | |||
| 
 | ||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | ||||
| from ._ml import build_text_classifier, build_tagger_model | ||||
| from ._ml import build_text_classifier | ||||
| from .parts_of_speech import X | ||||
| 
 | ||||
| 
 | ||||
|  | @ -253,25 +253,23 @@ class NeuralTagger(BaseThincComponent): | |||
|         self.cfg = dict(cfg) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         tags = self.predict(([doc], [doc.tensor])) | ||||
|         tags = self.predict([doc.tensor]) | ||||
|         self.set_annotations([doc], tags) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||
|         for docs in cytoolz.partition_all(batch_size, stream): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [d.tensor for d in docs] | ||||
|             tag_ids = self.predict((docs, tokvecs)) | ||||
|             tag_ids = self.predict(tokvecs) | ||||
|             self.set_annotations(docs, tag_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs_tokvecs): | ||||
|         scores = self.model(docs_tokvecs) | ||||
|     def predict(self, tokvecs): | ||||
|         scores = self.model(tokvecs) | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         if not isinstance(guesses, numpy.ndarray): | ||||
|             guesses = guesses.get() | ||||
|         tokvecs = docs_tokvecs[1] | ||||
|         guesses = self.model.ops.unflatten(guesses, | ||||
|                     [tv.shape[0] for tv in tokvecs]) | ||||
|         return guesses | ||||
|  | @ -296,7 +294,8 @@ class NeuralTagger(BaseThincComponent): | |||
| 
 | ||||
|         if self.model.nI is None: | ||||
|             self.model.nI = tokvecs[0].shape[1] | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) | ||||
| 
 | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) | ||||
|         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) | ||||
| 
 | ||||
|         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) | ||||
|  | @ -347,8 +346,10 @@ class NeuralTagger(BaseThincComponent): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|   | ||||
|         return with_flatten( | ||||
|             chain(Maxout(token_vector_width, token_vector_width), | ||||
|                   Softmax(n_tags, token_vector_width))) | ||||
| 
 | ||||
|     def use_params(self, params): | ||||
|         with self.model.use_params(params): | ||||
|             yield | ||||
|  | @ -431,7 +432,7 @@ class NeuralLabeller(NeuralTagger): | |||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|         return self.cfg.setdefault('labels', {}) | ||||
|         return self.cfg.get('labels', {}) | ||||
| 
 | ||||
|     @labels.setter | ||||
|     def labels(self, value): | ||||
|  | @ -454,8 +455,10 @@ class NeuralLabeller(NeuralTagger): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|      | ||||
|         return with_flatten( | ||||
|             chain(Maxout(token_vector_width, token_vector_width), | ||||
|                   Softmax(n_tags, token_vector_width))) | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         cdef int idx = 0 | ||||
|  |  | |||
|  | @ -385,7 +385,6 @@ cdef class ArcEager(TransitionSystem): | |||
|         for i in range(self.n_moves): | ||||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
|         return Transition(clas=0, move=MISSING, label=0) | ||||
| 
 | ||||
|     def move_name(self, int move, attr_t label): | ||||
|         label_str = self.strings[label] | ||||
|  |  | |||
|  | @ -14,4 +14,8 @@ cdef class Parser: | |||
|     cdef readonly TransitionSystem moves | ||||
|     cdef readonly object cfg | ||||
| 
 | ||||
|     cdef void _parse_step(self, StateC* state, | ||||
|             const float* feat_weights, | ||||
|             int nr_class, int nr_feat, int nr_piece) nogil | ||||
| 
 | ||||
|     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil | ||||
|  |  | |||
|  | @ -44,7 +44,7 @@ from thinc.neural.util import get_array_module | |||
| from .. import util | ||||
| from ..util import get_async, get_cuda_stream | ||||
| from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | ||||
| from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | ||||
| from .._ml import Tok2Vec, doc2feats, rebatch | ||||
| from ..compat import json_dumps | ||||
| 
 | ||||
| from . import _parse_features | ||||
|  | @ -237,7 +237,6 @@ cdef class Parser: | |||
|         token_vector_width = util.env_opt('token_vector_width', token_vector_width) | ||||
|         hidden_width = util.env_opt('hidden_width', hidden_width) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) | ||||
|         tensors = fine_tune(Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())) | ||||
|         if parser_maxout_pieces == 1: | ||||
|             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, | ||||
|                         nF=cls.nr_feature, | ||||
|  | @ -249,10 +248,15 @@ cdef class Parser: | |||
|                         nI=token_vector_width) | ||||
| 
 | ||||
|         with Model.use_device('cpu'): | ||||
|             upper = chain( | ||||
|                 clone(Maxout(hidden_width), (depth-1)), | ||||
|                 zero_init(Affine(nr_class, drop_factor=0.0)) | ||||
|             ) | ||||
|             if depth == 0: | ||||
|                 upper = chain() | ||||
|                 upper.is_noop = True | ||||
|             else: | ||||
|                 upper = chain( | ||||
|                     clone(Maxout(hidden_width), (depth-1)), | ||||
|                     zero_init(Affine(nr_class, drop_factor=0.0)) | ||||
|                 ) | ||||
|                 upper.is_noop = False | ||||
|         # TODO: This is an unfortunate hack atm! | ||||
|         # Used to set input dimensions in network. | ||||
|         lower.begin_training(lower.ops.allocate((500, token_vector_width))) | ||||
|  | @ -264,7 +268,7 @@ cdef class Parser: | |||
|             'hidden_width': hidden_width, | ||||
|             'maxout_pieces': parser_maxout_pieces | ||||
|         } | ||||
|         return (tensors, lower, upper), cfg | ||||
|         return (lower, upper), cfg | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): | ||||
|         """ | ||||
|  | @ -340,10 +344,12 @@ cdef class Parser: | |||
|                 The number of threads with which to work on the buffer in parallel. | ||||
|         Yields (Doc): Documents, in order. | ||||
|         """ | ||||
|         cdef StateClass parse_state | ||||
|         cdef Doc doc | ||||
|         queue = [] | ||||
|         for docs in cytoolz.partition_all(batch_size, docs): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [doc.tensor for doc in docs] | ||||
|             tokvecs = [d.tensor for d in docs] | ||||
|             if beam_width == 1: | ||||
|                 parse_states = self.parse_batch(docs, tokvecs) | ||||
|             else: | ||||
|  | @ -363,11 +369,8 @@ cdef class Parser: | |||
|             int nr_class, nr_feat, nr_piece, nr_dim, nr_state | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         if isinstance(tokvecses, np.ndarray): | ||||
|             tokvecses = [tokvecses] | ||||
| 
 | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
| 
 | ||||
|         nr_state = len(docs) | ||||
|         nr_class = self.moves.n_moves | ||||
|  | @ -391,20 +394,27 @@ cdef class Parser: | |||
|         cdef np.ndarray scores | ||||
|         c_token_ids = <int*>token_ids.data | ||||
|         c_is_valid = <int*>is_valid.data | ||||
|         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) | ||||
|         while not next_step.empty(): | ||||
|             for i in range(next_step.size()): | ||||
|                 st = next_step[i] | ||||
|                 st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||
|                 self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||
|             if not has_hidden: | ||||
|                 for i in cython.parallel.prange( | ||||
|                         next_step.size(), num_threads=6, nogil=True): | ||||
|                     self._parse_step(next_step[i], | ||||
|                         feat_weights, nr_class, nr_feat, nr_piece) | ||||
|             else: | ||||
|                 for i in range(next_step.size()): | ||||
|                     st = next_step[i] | ||||
|                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||
|                     self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||
|                 vectors = state2vec(token_ids[:next_step.size()]) | ||||
|             scores = vec2scores(vectors) | ||||
|             c_scores = <float*>scores.data | ||||
|             for i in range(next_step.size()): | ||||
|                 st = next_step[i] | ||||
|                 guess = arg_max_if_valid( | ||||
|                     &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||
|                 action = self.moves.c[guess] | ||||
|                 action.do(st, action.label) | ||||
|                 scores = vec2scores(vectors) | ||||
|                 c_scores = <float*>scores.data | ||||
|                 for i in range(next_step.size()): | ||||
|                     st = next_step[i] | ||||
|                     guess = arg_max_if_valid( | ||||
|                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||
|                     action = self.moves.c[guess] | ||||
|                     action.do(st, action.label) | ||||
|             this_step, next_step = next_step, this_step | ||||
|             next_step.clear() | ||||
|             for st in this_step: | ||||
|  | @ -419,7 +429,6 @@ cdef class Parser: | |||
|         cdef int nr_class = self.moves.n_moves | ||||
|         cdef StateClass stcls, output | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, | ||||
|                                                      cuda_stream, 0.0) | ||||
|  | @ -452,6 +461,28 @@ cdef class Parser: | |||
|             beams.append(beam) | ||||
|         return beams | ||||
| 
 | ||||
|     cdef void _parse_step(self, StateC* state, | ||||
|             const float* feat_weights, | ||||
|             int nr_class, int nr_feat, int nr_piece) nogil: | ||||
|         '''This only works with no hidden layers -- fast but inaccurate''' | ||||
|         #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): | ||||
|         #    self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) | ||||
|         token_ids = <int*>calloc(nr_feat, sizeof(int)) | ||||
|         scores = <float*>calloc(nr_class * nr_piece, sizeof(float)) | ||||
|         is_valid = <int*>calloc(nr_class, sizeof(int)) | ||||
| 
 | ||||
|         state.set_context_tokens(token_ids, nr_feat) | ||||
|         sum_state_features(scores, | ||||
|             feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) | ||||
|         self.moves.set_valid(is_valid, state) | ||||
|         guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) | ||||
|         action = self.moves.c[guess] | ||||
|         action.do(state, action.label) | ||||
| 
 | ||||
|         free(is_valid) | ||||
|         free(scores) | ||||
|         free(token_ids) | ||||
| 
 | ||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|  | @ -460,9 +491,6 @@ cdef class Parser: | |||
|         if isinstance(docs, Doc) and isinstance(golds, GoldParse): | ||||
|             docs = [docs] | ||||
|             golds = [golds] | ||||
|         my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=0.) | ||||
|         my_tokvecs = self.model[0].ops.flatten(my_tokvecs) | ||||
|         tokvecs += my_tokvecs | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
| 
 | ||||
|  | @ -512,9 +540,7 @@ cdef class Parser: | |||
|                 break | ||||
|         self._make_updates(d_tokvecs, | ||||
|             backprops, sgd, cuda_stream) | ||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
|         #bp_my_tokvecs(d_tokvecs, sgd=sgd) | ||||
|         return d_tokvecs | ||||
|         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
| 
 | ||||
|     def _init_gold_batch(self, whole_docs, whole_golds): | ||||
|         """Make a square batch, of length equal to the shortest doc. A long | ||||
|  | @ -577,7 +603,7 @@ cdef class Parser: | |||
|         return names | ||||
| 
 | ||||
|     def get_batch_model(self, batch_size, tokvecs, stream, dropout): | ||||
|         _, lower, upper = self.model | ||||
|         lower, upper = self.model | ||||
|         state2vec = precompute_hiddens(batch_size, tokvecs, | ||||
|                         lower, stream, drop=dropout) | ||||
|         return state2vec, upper | ||||
|  | @ -667,12 +693,10 @@ cdef class Parser: | |||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         serializers = { | ||||
|             'tok2vec_model': lambda p: p.open('wb').write( | ||||
|                 self.model[0].to_bytes()), | ||||
|             'lower_model': lambda p: p.open('wb').write( | ||||
|                 self.model[1].to_bytes()), | ||||
|                 self.model[0].to_bytes()), | ||||
|             'upper_model': lambda p: p.open('wb').write( | ||||
|                 self.model[2].to_bytes()), | ||||
|                 self.model[1].to_bytes()), | ||||
|             'vocab': lambda p: self.vocab.to_disk(p), | ||||
|             'moves': lambda p: self.moves.to_disk(p, strings=False), | ||||
|             'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) | ||||
|  | @ -693,29 +717,24 @@ cdef class Parser: | |||
|                 self.model, cfg = self.Model(**self.cfg) | ||||
|             else: | ||||
|                 cfg = {} | ||||
|             with (path / 'tok2vec_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[0].from_bytes(bytes_data) | ||||
|             with (path / 'lower_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[1].from_bytes(bytes_data) | ||||
|             self.model[0].from_bytes(bytes_data) | ||||
|             with (path / 'upper_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[2].from_bytes(bytes_data) | ||||
|             self.model[1].from_bytes(bytes_data) | ||||
|             self.cfg.update(cfg) | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         serializers = OrderedDict(( | ||||
|             ('tok2vec_model', lambda: self.model[0].to_bytes()), | ||||
|             ('lower_model', lambda: self.model[1].to_bytes()), | ||||
|             ('upper_model', lambda: self.model[2].to_bytes()), | ||||
|             ('lower_model', lambda: self.model[0].to_bytes()), | ||||
|             ('upper_model', lambda: self.model[1].to_bytes()), | ||||
|             ('vocab', lambda: self.vocab.to_bytes()), | ||||
|             ('moves', lambda: self.moves.to_bytes(strings=False)), | ||||
|             ('cfg', lambda: ujson.dumps(self.cfg)) | ||||
|         )) | ||||
|         if 'model' in exclude: | ||||
|             exclude['tok2vec_model'] = True | ||||
|             exclude['lower_model'] = True | ||||
|             exclude['upper_model'] = True | ||||
|             exclude.pop('model') | ||||
|  | @ -726,7 +745,6 @@ cdef class Parser: | |||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), | ||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||
|             ('tok2vec_model', lambda b: None), | ||||
|             ('lower_model', lambda b: None), | ||||
|             ('upper_model', lambda b: None) | ||||
|         )) | ||||
|  | @ -736,12 +754,10 @@ cdef class Parser: | |||
|                 self.model, cfg = self.Model(self.moves.n_moves) | ||||
|             else: | ||||
|                 cfg = {} | ||||
|             if 'tok2vec_model' in msg: | ||||
|                 self.model[0].from_bytes(msg['tok2vec_model']) | ||||
|             if 'lower_model' in msg: | ||||
|                 self.model[1].from_bytes(msg['lower_model']) | ||||
|                 self.model[0].from_bytes(msg['lower_model']) | ||||
|             if 'upper_model' in msg: | ||||
|                 self.model[2].from_bytes(msg['upper_model']) | ||||
|                 self.model[1].from_bytes(msg['upper_model']) | ||||
|             self.cfg.update(cfg) | ||||
|         return self | ||||
| 
 | ||||
|  |  | |||
|  | @ -107,8 +107,6 @@ cdef class TransitionSystem: | |||
| 
 | ||||
|     def is_valid(self, StateClass stcls, move_name): | ||||
|         action = self.lookup_transition(move_name) | ||||
|         if action.move == 0: | ||||
|             return False | ||||
|         return action.is_valid(stcls.c, action.label) | ||||
| 
 | ||||
|     cdef int set_valid(self, int* is_valid, const StateC* st) nogil: | ||||
|  |  | |||
|  | @ -113,7 +113,7 @@ def load_model(name, **overrides): | |||
| def load_model_from_link(name, **overrides): | ||||
|     """Load a model from a shortcut link, or directory in spaCy data path.""" | ||||
|     init_file = get_data_path() / name / '__init__.py' | ||||
|     spec = importlib.util.spec_from_file_location(name, init_file) | ||||
|     spec = importlib.util.spec_from_file_location(name, str(init_file)) | ||||
|     try: | ||||
|         cls = importlib.util.module_from_spec(spec) | ||||
|     except AttributeError: | ||||
|  |  | |||
|  | @ -103,20 +103,20 @@ mixin button(url, trusted, ...style) | |||
|     label    - [string] aside title (optional or false for no label) | ||||
|     language - [string] language for syntax highlighting (default: "python") | ||||
|                supports basic relevant languages available for PrismJS | ||||
|     icon     - [string] icon to display next to code block, mostly used for old/new | ||||
|     prompt   - [string] prompt or icon to display next to code block (mostly used for old/new) | ||||
|     height   - [integer] optional height to clip code block to | ||||
| 
 | ||||
| mixin code(label, language, icon, height) | ||||
| mixin code(label, language, prompt, height) | ||||
|     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) | ||||
|         if label | ||||
|             h4.u-text-label.u-text-label--dark=label | ||||
| 
 | ||||
|         - var icon = (prompt == 'accept' || prompt == 'reject') | ||||
|         if icon | ||||
|             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} | ||||
|             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) | ||||
|                 +icon(icon, 18) | ||||
| 
 | ||||
|         code.c-code-block__content | ||||
|         code.c-code-block__content(data-prompt=icon ? null : prompt) | ||||
|             block | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -35,6 +35,13 @@ | |||
|     font: normal normal 1.1rem/#{2} $font-code | ||||
|     padding: 1em 2em | ||||
| 
 | ||||
|     &[data-prompt]:before, | ||||
|         content: attr(data-prompt) | ||||
|         margin-right: 0.65em | ||||
|         display: inline-block | ||||
|         vertical-align: middle | ||||
|         opacity: 0.5 | ||||
| 
 | ||||
| 
 | ||||
| //- Inline code | ||||
| 
 | ||||
|  |  | |||
|  | @ -5,16 +5,7 @@ include ../../_includes/_mixins | |||
| p | ||||
|     |  As of v1.7.0, spaCy comes with new command line helpers to download and | ||||
|     |  link models and show useful debugging information. For a list of available | ||||
|     |  commands, type #[code python -m spacy]. To make the command even more | ||||
|     |  convenient, we recommend | ||||
|     |  #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias] | ||||
|     |  mapping #[code python -m spacy] to #[code spacy]. | ||||
| 
 | ||||
| +aside("Why python -m?") | ||||
|     |  The problem with a global entry point is that it's resolved by looking up | ||||
|     |  entries in your #[code PATH] environment variable. This can give you | ||||
|     |  unexpected results, like executing the wrong spaCy installation. | ||||
|     |  #[code python -m] prevents fallbacks to system modules. | ||||
|     |  commands, type #[code spacy --help]. | ||||
| 
 | ||||
| +infobox("⚠️ Deprecation note") | ||||
|     |  As of spaCy 2.0, the #[code model] command to initialise a model data | ||||
|  | @ -33,8 +24,8 @@ p | |||
|     |  Direct downloads don't perform any compatibility checks and require the | ||||
|     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy download [model] [--direct] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy download [model] [--direct] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -80,8 +71,8 @@ p | |||
|     |  or use the #[+api("cli#package") #[code package]] command to create a | ||||
|     |  model package. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy link [origin] [link_name] [--force] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy link [origin] [link_name] [--force] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -112,8 +103,8 @@ p | |||
|     |  markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues]. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy info [--markdown] | ||||
|     python -m spacy info [model] [--markdown] | ||||
|     spacy info [--markdown] | ||||
|     spacy info [model] [--markdown] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -139,8 +130,8 @@ p | |||
|     |  functions. The right converter is chosen based on the file extension of | ||||
|     |  the input file. Currently only supports #[code .conllu]. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -174,8 +165,8 @@ p | |||
|     |  Train a model. Expects data in spaCy's | ||||
|     |  #[+a("/docs/api/annotation#json-input") JSON format]. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -345,8 +336,8 @@ p | |||
|     |  sure you're always using the latest versions. This means you need to be | ||||
|     |  connected to the internet to use this command. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy package [input_dir] [output_dir] [--meta] [--force] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -360,10 +351,17 @@ p | |||
|         +cell Directory to create package folder in. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code meta] | ||||
|         +cell #[code --meta-path], #[code -m] | ||||
|         +cell option | ||||
|         +cell Path to meta.json file (optional). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --create-meta], #[code -c] | ||||
|         +cell flag | ||||
|         +cell | ||||
|             |  Create a meta.json file on the command line, even if one already | ||||
|             |  exists in the directory. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --force], #[code -f] | ||||
|         +cell flag | ||||
|  |  | |||
|  | @ -8,9 +8,9 @@ p | |||
| 
 | ||||
| 
 | ||||
| +aside-code("Download language models", "bash"). | ||||
|     python -m spacy download en | ||||
|     python -m spacy download de | ||||
|     python -m spacy download fr | ||||
|     spacy download en | ||||
|     spacy download de | ||||
|     spacy download fr | ||||
| 
 | ||||
| +table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"]) | ||||
|     +row | ||||
|  |  | |||
|  | @ -205,7 +205,7 @@ p | |||
| 
 | ||||
| +infobox("Why lazy-loading?") | ||||
|     |  Some languages contain large volumes of custom data, like lemmatizer | ||||
|     |  loopup tables, or complex regular expression that are expensive to | ||||
|     |  lookup tables, or complex regular expressions that are expensive to | ||||
|     |  compute. As of spaCy v2.0, #[code Language] classes are not imported on | ||||
|     |  initialisation and are only loaded when you import them directly, or load | ||||
|     |  a model that requires a language to be loaded. To lazy-load languages in | ||||
|  | @ -789,4 +789,4 @@ p | |||
|     |  model using spaCy's #[+api("cli#train") #[code train]] command: | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] | ||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] | ||||
|  |  | |||
|  | @ -32,10 +32,10 @@ p | |||
|     +qs({package: 'source'}) pip install -r requirements.txt | ||||
|     +qs({package: 'source'}) pip install -e . | ||||
| 
 | ||||
|     +qs({model: 'en'}) python -m spacy download en | ||||
|     +qs({model: 'de'}) python -m spacy download de | ||||
|     +qs({model: 'fr'}) python -m spacy download fr | ||||
|     +qs({model: 'es'}) python -m spacy download es | ||||
|     +qs({model: 'en'}) spacy download en | ||||
|     +qs({model: 'de'}) spacy download de | ||||
|     +qs({model: 'fr'}) spacy download fr | ||||
|     +qs({model: 'es'}) spacy download es | ||||
| 
 | ||||
| +h(2, "installation") Installation instructions | ||||
| 
 | ||||
|  | @ -52,7 +52,7 @@ p Using pip, spaCy releases are currently only available as source packages. | |||
|     |  and available models, see the #[+a("/docs/usage/models") docs on models]. | ||||
| 
 | ||||
|     +code.o-no-block. | ||||
|         python -m spacy download en | ||||
|         spacy download en | ||||
| 
 | ||||
|         >>> import spacy | ||||
|         >>> nlp = spacy.load('en') | ||||
|  | @ -312,7 +312,9 @@ p | |||
|     |  This error may occur when running the #[code spacy] command from the | ||||
|     |  command line. spaCy does not currently add an entry to your #[code PATH] | ||||
|     |  environment variable, as this can lead to unexpected results, especially | ||||
|     |  when using #[code virtualenv]. Run the command with #[code python -m], | ||||
|     |  when using #[code virtualenv]. Instead, spaCy adds an auto-alias that | ||||
|     |  maps #[code spacy] to #[code python -m spacy]. If this is not working as | ||||
|     |  expected, run the command with #[code python -m] yourself – | ||||
|     |  for example #[code python -m spacy download en]. For more info on this, | ||||
|     |  see #[+api("cli#download") download]. | ||||
| 
 | ||||
|  |  | |||
|  | @ -10,8 +10,8 @@ p | |||
| +h(2, "models") Install models and process text | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy download en | ||||
|     python -m spacy download de | ||||
|     spacy download en | ||||
|     spacy download de | ||||
| 
 | ||||
| +code. | ||||
|     import spacy | ||||
|  |  | |||
|  | @ -20,7 +20,7 @@ p | |||
| +quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.") | ||||
|     for models, lang in MODELS | ||||
|         - var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def }) | ||||
|         +qs({lang: lang}) python -m spacy download #{lang} | ||||
|         +qs({lang: lang}) spacy download #{lang} | ||||
|         +qs({lang: lang}, "divider") | ||||
|         +qs({lang: lang, load: "module"}, "python") import #{package.id} | ||||
|         +qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load() | ||||
|  | @ -52,16 +52,16 @@ p | |||
|     |  #[+api("cli#download") #[code download]] command. It takes care of | ||||
|     |  finding the best-matching model compatible with your spaCy installation. | ||||
| 
 | ||||
| - var models = Object.keys(MODELS).map(function(lang) { return "python -m spacy download " + lang }) | ||||
| - var models = Object.keys(MODELS).map(function(lang) { return "spacy download " + lang }) | ||||
| +code(false, "bash"). | ||||
|     # out-of-the-box: download best-matching default model | ||||
|     #{Object.keys(MODELS).map(function(l) {return "python -m spacy download " + l}).join('\n')} | ||||
|     #{Object.keys(MODELS).map(function(l) {return "spacy download " + l}).join('\n')} | ||||
| 
 | ||||
|     # download best-matching version of specific model for your spaCy installation | ||||
|     python -m spacy download en_core_web_md | ||||
|     spacy download en_core_web_md | ||||
| 
 | ||||
|     # download exact model version (doesn't create shortcut link) | ||||
|     python -m spacy download en_core_web_md-1.2.0 --direct | ||||
|     spacy download en_core_web_md-1.2.0 --direct | ||||
| 
 | ||||
| p | ||||
|     |  The download command will #[+a("#download-pip") install the model] via | ||||
|  | @ -72,7 +72,7 @@ p | |||
| 
 | ||||
| +code(false, "bash"). | ||||
|     pip install spacy | ||||
|     python -m spacy download en | ||||
|     spacy download en | ||||
| 
 | ||||
| +code. | ||||
|     import spacy | ||||
|  | @ -179,8 +179,8 @@ p | |||
|     |  model names or IDs. And your system already comes with a native solution | ||||
|     |  to mapping unicode aliases to file paths: symbolic links. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy link [package name or path] [shortcut] [--force] | ||||
| +code(false, "bash", "$"). | ||||
|     spacy link [package name or path] [shortcut] [--force] | ||||
| 
 | ||||
| p | ||||
|     |  The first argument is the #[strong package name] (if the model was | ||||
|  |  | |||
|  | @ -85,7 +85,7 @@ p | |||
|     } | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy package /home/me/data/en_example_model /home/me/my_models | ||||
|     spacy package /home/me/data/en_example_model /home/me/my_models | ||||
| 
 | ||||
| p This command will create a model package directory that should look like this: | ||||
| 
 | ||||
|  |  | |||
|  | @ -102,7 +102,7 @@ p | |||
|     |  CLI command to create all required files and directories. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy package /home/me/data/en_technology /home/me/my_models | ||||
|     spacy package /home/me/data/en_technology /home/me/my_models | ||||
| 
 | ||||
| p | ||||
|     |  To build the package and create a #[code .tar.gz] archive, run | ||||
|  |  | |||
|  | @ -238,11 +238,11 @@ p | |||
| +h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER | ||||
| 
 | ||||
| +aside-code("Example", "bash"). | ||||
|     python -m spacy download en # default English model | ||||
|     python -m spacy download de # default German model | ||||
|     python -m spacy download fr # default French model | ||||
|     python -m spacy download es # default Spanish model | ||||
|     python -m spacy download xx_ent_wiki_sm # multi-language NER | ||||
|     spacy download en # default English model | ||||
|     spacy download de # default German model | ||||
|     spacy download fr # default French model | ||||
|     spacy download es # default Spanish model | ||||
|     spacy download xx_ent_wiki_sm # multi-language NER | ||||
| 
 | ||||
| p | ||||
|     |  spaCy v2.0 comes with new and improved neural network models for English, | ||||
|  |  | |||
|  | @ -259,7 +259,7 @@ p | |||
|     |  notebook, the visualizations will be included as HTML. | ||||
| 
 | ||||
| +code("Jupyter Example"). | ||||
|     # don't forget to install a model, e.g.: python -m spacy download en | ||||
|     # don't forget to install a model, e.g.: spacy download en | ||||
|     import spacy | ||||
|     from spacy import displacy | ||||
| 
 | ||||
|  |  | |||
|  | @ -68,7 +68,7 @@ include _includes/_mixins | |||
|     +grid | ||||
|         +grid-col("two-thirds") | ||||
|             +terminal("lightning_tour.py"). | ||||
|                 # Install: pip install spacy && python -m spacy download en | ||||
|                 # Install: pip install spacy && spacy download en | ||||
|                 import spacy | ||||
| 
 | ||||
|                 # Load English tokenizer, tagger, parser, NER and word vectors | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user