Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

Merge branch 'master' of https://github.com/explosion/spaCy
Commit 9c9cd99144
				|  | @ -1,130 +0,0 @@ | ||||||
| #!/usr/bin/env python |  | ||||||
| from __future__ import division |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| import os |  | ||||||
| from os import path |  | ||||||
| import shutil |  | ||||||
| import codecs |  | ||||||
| import random |  | ||||||
| import time |  | ||||||
| import gzip |  | ||||||
| 
 |  | ||||||
| import plac |  | ||||||
| import cProfile |  | ||||||
| import pstats |  | ||||||
| 
 |  | ||||||
| import spacy.util |  | ||||||
| from spacy.en import English |  | ||||||
| from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir |  | ||||||
| 
 |  | ||||||
| from spacy.syntax.parser import GreedyParser |  | ||||||
| from spacy.syntax.parser import OracleError |  | ||||||
| from spacy.syntax.util import Config |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def is_punct_label(label): |  | ||||||
|     return label == 'P' or label.lower() == 'punct' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def read_gold(file_): |  | ||||||
|     """Read a standard CoNLL/MALT-style format""" |  | ||||||
|     sents = [] |  | ||||||
|     for sent_str in file_.read().strip().split('\n\n'): |  | ||||||
|         ids = [] |  | ||||||
|         words = [] |  | ||||||
|         heads = [] |  | ||||||
|         labels = [] |  | ||||||
|         tags = [] |  | ||||||
|         for i, line in enumerate(sent_str.split('\n')): |  | ||||||
|             id_, word, pos_string, head_idx, label = _parse_line(line) |  | ||||||
|             words.append(word) |  | ||||||
|             if head_idx == -1: |  | ||||||
|                 head_idx = i |  | ||||||
|             ids.append(id_) |  | ||||||
|             heads.append(head_idx) |  | ||||||
|             labels.append(label) |  | ||||||
|             tags.append(pos_string) |  | ||||||
|         text = ' '.join(words) |  | ||||||
|         sents.append((text, [words], ids, words, tags, heads, labels)) |  | ||||||
|     return sents |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _parse_line(line): |  | ||||||
|     pieces = line.split() |  | ||||||
|     id_ = int(pieces[0]) |  | ||||||
|     word = pieces[1] |  | ||||||
|     pos = pieces[3] |  | ||||||
|     head_idx = int(pieces[6]) |  | ||||||
|     label = pieces[7] |  | ||||||
|     return id_, word, pos, head_idx, label |  | ||||||
| 
 |  | ||||||
|          |  | ||||||
| def iter_data(paragraphs, tokenizer, gold_preproc=False): |  | ||||||
|     for raw, tokenized, ids, words, tags, heads, labels in paragraphs: |  | ||||||
|         assert len(words) == len(heads) |  | ||||||
|         for words in tokenized: |  | ||||||
|             sent_ids = ids[:len(words)] |  | ||||||
|             sent_tags = tags[:len(words)] |  | ||||||
|             sent_heads = heads[:len(words)] |  | ||||||
|             sent_labels = labels[:len(words)] |  | ||||||
|             sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) |  | ||||||
|             tokens = tokenizer.tokens_from_list(words) |  | ||||||
|             yield tokens, sent_tags, sent_heads, sent_labels |  | ||||||
|             ids = ids[len(words):] |  | ||||||
|             tags = tags[len(words):] |  | ||||||
|             heads = heads[len(words):] |  | ||||||
|             labels = labels[len(words):] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def _map_indices_to_tokens(ids, heads): |  | ||||||
|     mapped = [] |  | ||||||
|     for head in heads: |  | ||||||
|         if head not in ids: |  | ||||||
|             mapped.append(None) |  | ||||||
|         else: |  | ||||||
|             mapped.append(ids.index(head)) |  | ||||||
|     return mapped |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def evaluate(Language, dev_loc, model_dir): |  | ||||||
|     global loss |  | ||||||
|     nlp = Language() |  | ||||||
|     n_corr = 0 |  | ||||||
|     pos_corr = 0 |  | ||||||
|     n_tokens = 0 |  | ||||||
|     total = 0 |  | ||||||
|     skipped = 0 |  | ||||||
|     loss = 0 |  | ||||||
|     with codecs.open(dev_loc, 'r', 'utf8') as file_: |  | ||||||
|         paragraphs = read_gold(file_) |  | ||||||
|     for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer): |  | ||||||
|         assert len(tokens) == len(labels) |  | ||||||
|         nlp.tagger.tag_from_strings(tokens, tag_strs) |  | ||||||
|         nlp.parser(tokens) |  | ||||||
|         for i, token in enumerate(tokens): |  | ||||||
|             try: |  | ||||||
|                 pos_corr += token.tag_ == tag_strs[i] |  | ||||||
|             except: |  | ||||||
|                 print i, token.orth_, token.tag |  | ||||||
|                 raise |  | ||||||
|             n_tokens += 1 |  | ||||||
|             if heads[i] is None: |  | ||||||
|                 skipped += 1 |  | ||||||
|                 continue |  | ||||||
|             if is_punct_label(labels[i]): |  | ||||||
|                 continue |  | ||||||
|             n_corr += token.head.i == heads[i] |  | ||||||
|             total += 1 |  | ||||||
|     print loss, skipped, (loss+skipped + total) |  | ||||||
|     print pos_corr / n_tokens |  | ||||||
|     return float(n_corr) / (total + loss) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def main(dev_loc, model_dir): |  | ||||||
|     print evaluate(English, dev_loc, model_dir) |  | ||||||
|      |  | ||||||
| 
 |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     plac.call(main) |  | ||||||
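A hedged sketch of the CoNLL/MALT-style input the deleted script above expects may be useful: one token per line, a blank line between sentences, and whitespace-separated columns of which _parse_line() only reads 0 (id), 1 (word), 3 (POS tag), 6 (head id, -1 for the root) and 7 (dependency label). The sentence and tag values below are invented for illustration.

    # Invented two-token sentence in the column layout _parse_line() assumes.
    sample = ("1 Pierre Pierre NNP NNP _ 2 nsubj _ _\n"
              "2 slept sleep VBD VBD _ -1 ROOT _ _")
    for line in sample.split('\n'):
        pieces = line.split()
        # id, word, POS, head id (-1 = root), dependency label
        print(' '.join([pieces[0], pieces[1], pieces[3], pieces[6], pieces[7]]))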
|  | @ -1,261 +0,0 @@ | ||||||
| #!/usr/bin/env python |  | ||||||
| from __future__ import division |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| import os |  | ||||||
| from os import path |  | ||||||
| import shutil |  | ||||||
| import codecs |  | ||||||
| import random |  | ||||||
| 
 |  | ||||||
| import plac |  | ||||||
| import cProfile |  | ||||||
| import pstats |  | ||||||
| import re |  | ||||||
| 
 |  | ||||||
| import spacy.util |  | ||||||
| from spacy.en import English |  | ||||||
| from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir |  | ||||||
| 
 |  | ||||||
| from spacy.syntax.util import Config |  | ||||||
| from spacy.gold import read_json_file |  | ||||||
| from spacy.gold import GoldParse |  | ||||||
| 
 |  | ||||||
| from spacy.scorer import Scorer |  | ||||||
| 
 |  | ||||||
| from spacy.syntax.parser import Parser, get_templates |  | ||||||
| from spacy._theano import TheanoModel |  | ||||||
| 
 |  | ||||||
| import theano |  | ||||||
| import theano.tensor as T |  | ||||||
| 
 |  | ||||||
| from theano.printing import Print |  | ||||||
| 
 |  | ||||||
| import numpy |  | ||||||
| from collections import OrderedDict, defaultdict |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| theano.config.profile = False |  | ||||||
| theano.config.floatX = 'float32' |  | ||||||
| floatX = theano.config.floatX |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def L1(L1_reg, *weights): |  | ||||||
|     return L1_reg * sum(abs(w).sum() for w in weights) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def L2(L2_reg, *weights): |  | ||||||
|     return L2_reg * sum((w ** 2).sum() for w in weights) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6): |  | ||||||
|     updates = OrderedDict() |  | ||||||
|     for param in params: |  | ||||||
|         value = param.get_value(borrow=True) |  | ||||||
|         accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), |  | ||||||
|                              broadcastable=param.broadcastable) |  | ||||||
| 
 |  | ||||||
|         grad = T.grad(loss, param) |  | ||||||
|         accu_new = rho * accu + (1 - rho) * grad ** 2 |  | ||||||
|         updates[accu] = accu_new |  | ||||||
|         updates[param] = param - (eta * grad / T.sqrt(accu_new + eps)) |  | ||||||
|     return updates |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def relu(x): |  | ||||||
|     return x * (x > 0) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def feed_layer(activation, weights, bias, input_): |  | ||||||
|     return activation(T.dot(input_, weights) + bias) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def init_weights(n_in, n_out): |  | ||||||
|     rng = numpy.random.RandomState(1235) |  | ||||||
|      |  | ||||||
|     weights = numpy.asarray( |  | ||||||
|         rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), |  | ||||||
|         dtype=theano.config.floatX |  | ||||||
|     ) |  | ||||||
|     bias = numpy.zeros((n_out,), dtype=theano.config.floatX) |  | ||||||
|     return [wrapper(weights, name='W'), wrapper(bias, name='b')] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def compile_model(n_classes, n_hidden, n_in, optimizer): |  | ||||||
|     x = T.vector('x')  |  | ||||||
|     costs = T.ivector('costs') |  | ||||||
|     loss = T.scalar('loss') |  | ||||||
| 
 |  | ||||||
|     maxent_W, maxent_b = init_weights(n_hidden, n_classes) |  | ||||||
|     hidden_W, hidden_b = init_weights(n_in, n_hidden) |  | ||||||
| 
 |  | ||||||
|     # Feed the inputs forward through the network |  | ||||||
|     p_y_given_x = feed_layer( |  | ||||||
|                     T.nnet.softmax, |  | ||||||
|                     maxent_W, |  | ||||||
|                     maxent_b, |  | ||||||
|                       feed_layer( |  | ||||||
|                         relu, |  | ||||||
|                         hidden_W, |  | ||||||
|                         hidden_b, |  | ||||||
|                         x)) |  | ||||||
| 
 |  | ||||||
|     loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) |  | ||||||
| 
 |  | ||||||
|     train_model = theano.function( |  | ||||||
|         name='train_model', |  | ||||||
|         inputs=[x, costs], |  | ||||||
|         outputs=[p_y_given_x[0], T.grad(loss, x), loss], |  | ||||||
|         updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), |  | ||||||
|         on_unused_input='warn' |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
|     evaluate_model = theano.function( |  | ||||||
|         name='evaluate_model', |  | ||||||
|         inputs=[x], |  | ||||||
|         outputs=[ |  | ||||||
|             feed_layer( |  | ||||||
|               T.nnet.softmax, |  | ||||||
|               maxent_W, |  | ||||||
|               maxent_b, |  | ||||||
|               feed_layer( |  | ||||||
|                 relu, |  | ||||||
|                 hidden_W, |  | ||||||
|                 hidden_b, |  | ||||||
|                 x |  | ||||||
|               ) |  | ||||||
|             )[0] |  | ||||||
|         ] |  | ||||||
|     ) |  | ||||||
|     return train_model, evaluate_model |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def score_model(scorer, nlp, annot_tuples, verbose=False): |  | ||||||
|     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) |  | ||||||
|     nlp.tagger(tokens) |  | ||||||
|     nlp.parser(tokens) |  | ||||||
|     gold = GoldParse(tokens, annot_tuples) |  | ||||||
|     scorer.score(tokens, gold, verbose=verbose) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', |  | ||||||
|           eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, |  | ||||||
|           seed=0, n_sents=0,  verbose=False): |  | ||||||
| 
 |  | ||||||
|     dep_model_dir = path.join(model_dir, 'deps') |  | ||||||
|     pos_model_dir = path.join(model_dir, 'pos') |  | ||||||
|     if path.exists(dep_model_dir): |  | ||||||
|         shutil.rmtree(dep_model_dir) |  | ||||||
|     if path.exists(pos_model_dir): |  | ||||||
|         shutil.rmtree(pos_model_dir) |  | ||||||
|     os.mkdir(dep_model_dir) |  | ||||||
|     os.mkdir(pos_model_dir) |  | ||||||
|     setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) |  | ||||||
| 
 |  | ||||||
|     Config.write(dep_model_dir, 'config', |  | ||||||
|         seed=seed, |  | ||||||
|         templates=tuple(), |  | ||||||
|         labels=Language.ParserTransitionSystem.get_labels(gold_tuples), |  | ||||||
|         vector_lengths=(nv_word, nv_tag, nv_label), |  | ||||||
|         hidden_nodes=nv_hidden, |  | ||||||
|         eta=eta, |  | ||||||
|         mu=mu |  | ||||||
|     ) |  | ||||||
|    |  | ||||||
|     # Bake-in hyper-parameters |  | ||||||
|     optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) |  | ||||||
|     nlp = Language(data_dir=model_dir) |  | ||||||
|     n_classes = nlp.parser.model.n_classes |  | ||||||
|     train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) |  | ||||||
|     nlp.parser.model = TheanoModel(n_classes, input_spec, train, |  | ||||||
|                                    predict, model_loc) |  | ||||||
|   |  | ||||||
|     if n_sents > 0: |  | ||||||
|         gold_tuples = gold_tuples[:n_sents] |  | ||||||
|     print "Itn.\tP.Loss\tUAS\tTag %\tToken %" |  | ||||||
|     log_loc = path.join(model_dir, 'job.log') |  | ||||||
|     for itn in range(n_iter): |  | ||||||
|         scorer = Scorer() |  | ||||||
|         loss = 0 |  | ||||||
|         for _, sents in gold_tuples: |  | ||||||
|             for annot_tuples, ctnt in sents: |  | ||||||
|                 if len(annot_tuples[1]) == 1: |  | ||||||
|                     continue |  | ||||||
|                 score_model(scorer, nlp, annot_tuples) |  | ||||||
|                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) |  | ||||||
|                 nlp.tagger(tokens) |  | ||||||
|                 gold = GoldParse(tokens, annot_tuples, make_projective=True) |  | ||||||
|                 assert gold.is_projective |  | ||||||
|                 loss += nlp.parser.train(tokens, gold) |  | ||||||
|                 nlp.tagger.train(tokens, gold.tags) |  | ||||||
|         random.shuffle(gold_tuples) |  | ||||||
|         logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, |  | ||||||
|                                                  scorer.tags_acc, |  | ||||||
|                                                  scorer.token_acc) |  | ||||||
|         print logline |  | ||||||
|         with open(log_loc, 'aw') as file_: |  | ||||||
|             file_.write(logline + '\n') |  | ||||||
|     nlp.parser.model.end_training() |  | ||||||
|     nlp.tagger.model.end_training() |  | ||||||
|     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) |  | ||||||
|     return nlp |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def evaluate(nlp, gold_tuples, gold_preproc=True): |  | ||||||
|     scorer = Scorer() |  | ||||||
|     for raw_text, sents in gold_tuples: |  | ||||||
|         for annot_tuples, brackets in sents: |  | ||||||
|             tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) |  | ||||||
|             nlp.tagger(tokens) |  | ||||||
|             nlp.parser(tokens) |  | ||||||
|             gold = GoldParse(tokens, annot_tuples) |  | ||||||
|             scorer.score(tokens, gold) |  | ||||||
|     return scorer |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @plac.annotations( |  | ||||||
|     train_loc=("Location of training file or directory"), |  | ||||||
|     dev_loc=("Location of development file or directory"), |  | ||||||
|     model_dir=("Location of output model directory",), |  | ||||||
|     eval_only=("Skip training, and only evaluate", "flag", "e", bool), |  | ||||||
|     n_sents=("Number of training sentences", "option", "n", int), |  | ||||||
|     n_iter=("Number of training iterations", "option", "i", int), |  | ||||||
|     verbose=("Verbose error reporting", "flag", "v", bool), |  | ||||||
| 
 |  | ||||||
|     nv_word=("Word vector length", "option", "W", int), |  | ||||||
|     nv_tag=("Tag vector length", "option", "T", int), |  | ||||||
|     nv_label=("Label vector length", "option", "L", int), |  | ||||||
|     nv_hidden=("Hidden nodes length", "option", "H", int), |  | ||||||
|     eta=("Learning rate", "option", "E", float), |  | ||||||
|     mu=("Momentum", "option", "M", float), |  | ||||||
| ) |  | ||||||
| def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, |  | ||||||
|          nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, |  | ||||||
|          eta=0.1, mu=0.9, eval_only=False): |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) |  | ||||||
| 
 |  | ||||||
|     nlp = train(English, gold_train, model_dir, |  | ||||||
|                feat_set='embed', |  | ||||||
|                eta=eta, mu=mu, |  | ||||||
|                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, |  | ||||||
|                n_sents=n_sents, n_iter=n_iter, |  | ||||||
|                verbose=verbose) |  | ||||||
| 
 |  | ||||||
|     scorer = evaluate(nlp, list(read_json_file(dev_loc))) |  | ||||||
|      |  | ||||||
|     print 'TOK', 100-scorer.token_acc |  | ||||||
|     print 'POS', scorer.tags_acc |  | ||||||
|     print 'UAS', scorer.uas |  | ||||||
|     print 'LAS', scorer.las |  | ||||||
| 
 |  | ||||||
|     print 'NER P', scorer.ents_p |  | ||||||
|     print 'NER R', scorer.ents_r |  | ||||||
|     print 'NER F', scorer.ents_f |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     plac.call(main) |  | ||||||
|  | @ -1,18 +1,13 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| import plac | import plac | ||||||
| import json | import json | ||||||
| from os import path |  | ||||||
| import shutil |  | ||||||
| import os |  | ||||||
| import random | import random | ||||||
| import io |  | ||||||
| import pathlib | import pathlib | ||||||
| 
 | 
 | ||||||
| from spacy.tokens import Doc | from spacy.tokens import Doc | ||||||
| from spacy.syntax.nonproj import PseudoProjectivity | from spacy.syntax.nonproj import PseudoProjectivity | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.gold import GoldParse | from spacy.gold import GoldParse | ||||||
| from spacy.vocab import Vocab |  | ||||||
| from spacy.tagger import Tagger | from spacy.tagger import Tagger | ||||||
| from spacy.pipeline import DependencyParser, BeamDependencyParser | from spacy.pipeline import DependencyParser, BeamDependencyParser | ||||||
| from spacy.syntax.parser import get_templates | from spacy.syntax.parser import get_templates | ||||||
|  | @ -23,7 +18,6 @@ import spacy.attrs | ||||||
| import io | import io | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def read_conllx(loc, n=0): | def read_conllx(loc, n=0): | ||||||
|     with io.open(loc, 'r', encoding='utf8') as file_: |     with io.open(loc, 'r', encoding='utf8') as file_: | ||||||
|         text = file_.read() |         text = file_.read() | ||||||
|  | @ -35,7 +29,8 @@ def read_conllx(loc, n=0): | ||||||
|                 lines.pop(0) |                 lines.pop(0) | ||||||
|             tokens = [] |             tokens = [] | ||||||
|             for line in lines: |             for line in lines: | ||||||
|                 id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split() |                 id_, word, lemma, pos, tag, morph, head, dep, _1, \ | ||||||
|  |                 _2 = line.split('\t') | ||||||
|                 if '-' in id_ or '.' in id_: |                 if '-' in id_ or '.' in id_: | ||||||
|                     continue |                     continue | ||||||
|                 try: |                 try: | ||||||
|  | @ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): | ||||||
|         random.shuffle(train_sents) |         random.shuffle(train_sents) | ||||||
|         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) |         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) | ||||||
|         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc)) |         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc)) | ||||||
|     nlp = Language(vocab=vocab, tagger=tagger, parser=parser) |     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser) | ||||||
|     nlp.end_training(model_dir) |     nlp.end_training(model_dir) | ||||||
|     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) |     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) | ||||||
|     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) |     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) | ||||||
|  |  | ||||||
|  | @ -5,7 +5,7 @@ import json | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from .util import set_lang_class, get_lang_class, parse_package_meta | from .util import set_lang_class, get_lang_class, parse_package_meta | ||||||
| from .deprecated import resolve_model_name | from .deprecated import resolve_model_name | ||||||
| from .cli.info import info | from .cli import info | ||||||
| 
 | 
 | ||||||
| from . import en | from . import en | ||||||
| from . import de | from . import de | ||||||
|  | @ -49,7 +49,3 @@ def load(name, **overrides): | ||||||
|         overrides['path'] = model_path |         overrides['path'] = model_path | ||||||
| 
 | 
 | ||||||
|     return cls(**overrides) |     return cls(**overrides) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def info(name, markdown): |  | ||||||
|     info(name, markdown) |  | ||||||
|  |  | ||||||
|  | @ -1,5 +1,4 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| #  |  | ||||||
| from __future__ import print_function | from __future__ import print_function | ||||||
| # NB! This breaks in plac on Python 2!! | # NB! This breaks in plac on Python 2!! | ||||||
| #from __future__ import unicode_literals, | #from __future__ import unicode_literals, | ||||||
|  | @ -8,12 +7,13 @@ import plac | ||||||
| from spacy.cli import download as cli_download | from spacy.cli import download as cli_download | ||||||
| from spacy.cli import link as cli_link | from spacy.cli import link as cli_link | ||||||
| from spacy.cli import info as cli_info | from spacy.cli import info as cli_info | ||||||
|  | from spacy.cli import package as cli_package | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class CLI(object): | class CLI(object): | ||||||
|     """Command-line interface for spaCy""" |     """Command-line interface for spaCy""" | ||||||
| 
 | 
 | ||||||
|     commands = ('download', 'link', 'info') |     commands = ('download', 'link', 'info', 'package') | ||||||
| 
 | 
 | ||||||
|     @plac.annotations( |     @plac.annotations( | ||||||
|         model=("model to download (shortcut or model name)", "positional", None, str), |         model=("model to download (shortcut or model name)", "positional", None, str), | ||||||
|  | @ -32,8 +32,8 @@ class CLI(object): | ||||||
| 
 | 
 | ||||||
|     @plac.annotations( |     @plac.annotations( | ||||||
|         origin=("package name or local path to model", "positional", None, str), |         origin=("package name or local path to model", "positional", None, str), | ||||||
|         link_name=("Name of shortcut link to create", "positional", None, str), |         link_name=("name of shortcut link to create", "positional", None, str), | ||||||
|         force=("Force overwriting of existing link", "flag", "f", bool) |         force=("force overwriting of existing link", "flag", "f", bool) | ||||||
|     ) |     ) | ||||||
|     def link(self, origin, link_name, force=False): |     def link(self, origin, link_name, force=False): | ||||||
|         """ |         """ | ||||||
|  | @ -59,6 +59,21 @@ class CLI(object): | ||||||
|         cli_info(model, markdown) |         cli_info(model, markdown) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |     @plac.annotations( | ||||||
|  |         input_dir=("directory with model data", "positional", None, str), | ||||||
|  |         output_dir=("output directory", "positional", None, str), | ||||||
|  |         force=("force overwriting of existing folder in output directory", "flag", "f", bool) | ||||||
|  |     ) | ||||||
|  |     def package(self, input_dir, output_dir, force=False): | ||||||
|  |         """ | ||||||
|  |         Generate Python package for model data, including meta and required | ||||||
|  |         installation files. A new directory will be created in the specified | ||||||
|  |         output directory, and model data will be copied over. | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         cli_package(input_dir, output_dir, force) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     def __missing__(self, name): |     def __missing__(self, name): | ||||||
|         print("\n   Command %r does not exist\n" % name) |         print("\n   Command %r does not exist\n" % name) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| from libc.stdio cimport fopen, fclose, fread, fwrite, FILE | from libc.stdio cimport fopen, fclose, fread, fwrite | ||||||
| from libc.string cimport memcpy | from libc.string cimport memcpy | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,3 +1,4 @@ | ||||||
| from .download import download | from .download import download | ||||||
| from .info import info | from .info import info | ||||||
| from .link import link | from .link import link | ||||||
|  | from .package import package | ||||||
|  |  | ||||||
							
								
								
									
spacy/cli/package.py (new file, 91 lines)
							|  | @ -0,0 +1,91 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import json | ||||||
|  | import shutil | ||||||
|  | import requests | ||||||
|  | from pathlib import Path | ||||||
|  | 
 | ||||||
|  | from .. import about | ||||||
|  | from .. import util | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def package(input_dir, output_dir, force): | ||||||
|  |     input_path = Path(input_dir) | ||||||
|  |     output_path = Path(output_dir) | ||||||
|  |     check_dirs(input_path, output_path) | ||||||
|  | 
 | ||||||
|  |     template_setup = get_template('setup.py') | ||||||
|  |     template_manifest = get_template('MANIFEST.in') | ||||||
|  |     template_init = get_template('en_model_name/__init__.py') | ||||||
|  |     meta = generate_meta() | ||||||
|  | 
 | ||||||
|  |     model_name = meta['lang'] + '_' + meta['name'] | ||||||
|  |     model_name_v = model_name + '-' + meta['version'] | ||||||
|  |     main_path = output_path / model_name_v | ||||||
|  |     package_path = main_path / model_name | ||||||
|  | 
 | ||||||
|  |     create_dirs(package_path, force) | ||||||
|  |     shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix()) | ||||||
|  |     create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) | ||||||
|  |     create_file(main_path / 'setup.py', template_setup) | ||||||
|  |     create_file(main_path / 'MANIFEST.in', template_manifest) | ||||||
|  |     create_file(package_path / '__init__.py', template_init) | ||||||
|  | 
 | ||||||
|  |     util.print_msg( | ||||||
|  |         main_path.as_posix(), | ||||||
|  |         "To build the package, run `python setup.py sdist` in that directory.", | ||||||
|  |         title="Successfully created package {p}".format(p=model_name_v)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def check_dirs(input_path, output_path): | ||||||
|  |     if not input_path.exists(): | ||||||
|  |         util.sys_exit(input_path.as_posix(), title="Model directory not found") | ||||||
|  |     if not output_path.exists(): | ||||||
|  |         util.sys_exit(output_path.as_posix(), title="Output directory not found") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_dirs(package_path, force): | ||||||
|  |     if package_path.exists(): | ||||||
|  |         if force: | ||||||
|  |             shutil.rmtree(package_path.as_posix()) | ||||||
|  |         else: | ||||||
|  |             util.sys_exit(package_path.as_posix(), | ||||||
|  |                 "Please delete the directory and try again.", | ||||||
|  |                 title="Package directory already exists") | ||||||
|  |     Path.mkdir(package_path, parents=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def create_file(file_path, contents): | ||||||
|  |     file_path.touch() | ||||||
|  |     file_path.open('w', encoding='utf-8').write(contents) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def generate_meta(): | ||||||
|  |     settings = [('lang', 'Model language', 'en'), | ||||||
|  |                 ('name', 'Model name', 'model'), | ||||||
|  |                 ('version', 'Model version', '0.0.0'), | ||||||
|  |                 ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'), | ||||||
|  |                 ('description', 'Model description', False), | ||||||
|  |                 ('author', 'Author', False), | ||||||
|  |                 ('email', 'Author email', False), | ||||||
|  |                 ('url', 'Author website', False), | ||||||
|  |                 ('license', 'License', 'CC BY-NC 3.0')] | ||||||
|  | 
 | ||||||
|  |     util.print_msg("Enter the package settings for your model.", title="Generating meta.json") | ||||||
|  | 
 | ||||||
|  |     meta = {} | ||||||
|  |     for setting, desc, default in settings: | ||||||
|  |         response = util.get_raw_input(desc, default) | ||||||
|  |         meta[setting] = default if response == '' and default else response | ||||||
|  |     return meta | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def get_template(filepath): | ||||||
|  |     url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/' | ||||||
|  |     r = requests.get(url + filepath) | ||||||
|  |     if r.status_code != 200: | ||||||
|  |         util.sys_exit( | ||||||
|  |             "Couldn't fetch template files from GitHub.", | ||||||
|  |             title="Server error ({c})".format(c=r.status_code)) | ||||||
|  |     return r.text | ||||||
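As a usage sketch for the new command, the snippet below calls the helper directly from Python; the paths are placeholders, and the directory name assumes you accept the defaults offered by generate_meta() at the interactive prompts. The same flow is exposed on the command line as python -m spacy package, after which the package is built with python setup.py sdist, as the success message above suggests.

    # Hedged sketch: invoke the packaging helper directly (paths are placeholders).
    # Requires an internet connection, since the templates are fetched from GitHub.
    from spacy.cli import package

    # Copies the model data and writes meta.json, setup.py, MANIFEST.in and
    # __init__.py into /tmp/packages/en_model-0.0.0/ (default meta values assumed).
    package('/path/to/model/data', '/tmp/packages', force=True)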
|  | @ -21,7 +21,6 @@ MORPH_RULES = { | ||||||
|         "them":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, |         "them":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, | ||||||
| 
 | 
 | ||||||
|         "mine":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, |         "mine":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, | ||||||
|         "yours":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"}, |  | ||||||
|         "his":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, |         "his":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, | ||||||
|         "hers":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem",  "Poss": "Yes", "Reflex": "Yes"}, |         "hers":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem",  "Poss": "Yes", "Reflex": "Yes"}, | ||||||
|         "its":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, |         "its":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, | ||||||
|  |  | ||||||
|  | @ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = { | ||||||
|     "vm.": [ |     "vm.": [ | ||||||
|         {ORTH: "vm.", LEMMA: "viimeksi mainittu"} |         {ORTH: "vm.", LEMMA: "viimeksi mainittu"} | ||||||
|     ], |     ], | ||||||
|     "siht.": [ |  | ||||||
|         {ORTH: "siht.", LEMMA: "sihteeri"} |  | ||||||
|     ], |  | ||||||
|     "srk.": [ |     "srk.": [ | ||||||
|         {ORTH: "srk.", LEMMA: "seurakunta"} |         {ORTH: "srk.", LEMMA: "seurakunta"} | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  | @ -1,16 +1,12 @@ | ||||||
| # cython: profile=True | # cython: profile=True | ||||||
| from __future__ import unicode_literals, print_function | from __future__ import unicode_literals, print_function | ||||||
| 
 | 
 | ||||||
| import numpy |  | ||||||
| import io | import io | ||||||
| import json | import json | ||||||
| import random |  | ||||||
| import re | import re | ||||||
| import os | import os | ||||||
| from os import path | from os import path | ||||||
| 
 | 
 | ||||||
| from libc.string cimport memset |  | ||||||
| 
 |  | ||||||
| import ujson as json | import ujson as json | ||||||
| 
 | 
 | ||||||
| from .syntax import nonproj | from .syntax import nonproj | ||||||
|  |  | ||||||
|  | @ -1,6 +1,5 @@ | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| from warnings import warn |  | ||||||
| import pathlib | import pathlib | ||||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||||
| import shutil | import shutil | ||||||
|  | @ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP | ||||||
| from .syntax.parser import get_templates | from .syntax.parser import get_templates | ||||||
| from .syntax.nonproj import PseudoProjectivity | from .syntax.nonproj import PseudoProjectivity | ||||||
| from .pipeline import DependencyParser, EntityRecognizer | from .pipeline import DependencyParser, EntityRecognizer | ||||||
| from .pipeline import BeamDependencyParser, BeamEntityRecognizer |  | ||||||
| from .syntax.arc_eager import ArcEager | from .syntax.arc_eager import ArcEager | ||||||
| from .syntax.ner import BiluoPushDown | from .syntax.ner import BiluoPushDown | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -2,13 +2,10 @@ | ||||||
| # cython: infer_types=True | # cython: infer_types=True | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from os import path |  | ||||||
| 
 |  | ||||||
| from .typedefs cimport attr_t | from .typedefs cimport attr_t | ||||||
| from .typedefs cimport hash_t | from .typedefs cimport hash_t | ||||||
| from .attrs cimport attr_id_t | from .attrs cimport attr_id_t | ||||||
| from .structs cimport TokenC, LexemeC | from .structs cimport TokenC | ||||||
| from .lexeme cimport Lexeme |  | ||||||
| 
 | 
 | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| from preshed.maps cimport PreshMap | from preshed.maps cimport PreshMap | ||||||
|  | @ -17,7 +14,7 @@ from libcpp.pair cimport pair | ||||||
| from murmurhash.mrmr cimport hash64 | from murmurhash.mrmr cimport hash64 | ||||||
| from libc.stdint cimport int32_t | from libc.stdint cimport int32_t | ||||||
| 
 | 
 | ||||||
| from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE | from .attrs cimport ID, ENT_TYPE | ||||||
| from . import attrs | from . import attrs | ||||||
| from .tokens.doc cimport get_token_attr | from .tokens.doc cimport get_token_attr | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
|  |  | ||||||
|  | @ -1,12 +1,8 @@ | ||||||
| # cython: infer_types | # cython: infer_types | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from os import path |  | ||||||
| 
 |  | ||||||
| from libc.string cimport memset | from libc.string cimport memset | ||||||
| 
 | 
 | ||||||
| from .lemmatizer import Lemmatizer |  | ||||||
| 
 |  | ||||||
| try: | try: | ||||||
|     import ujson as json |     import ujson as json | ||||||
| except ImportError: | except ImportError: | ||||||
|  |  | ||||||
|  | @ -2,7 +2,6 @@ from .syntax.parser cimport Parser | ||||||
| from .syntax.beam_parser cimport BeamParser | from .syntax.beam_parser cimport BeamParser | ||||||
| from .syntax.ner cimport BiluoPushDown | from .syntax.ner cimport BiluoPushDown | ||||||
| from .syntax.arc_eager cimport ArcEager | from .syntax.arc_eager cimport ArcEager | ||||||
| from .vocab cimport Vocab |  | ||||||
| from .tagger import Tagger | from .tagger import Tagger | ||||||
| 
 | 
 | ||||||
| # TODO: The disorganization here is pretty embarrassing. At least it's only | # TODO: The disorganization here is pretty embarrassing. At least it's only | ||||||
|  |  | ||||||
|  | @ -1,20 +1,16 @@ | ||||||
| import json | import json | ||||||
| import pathlib | import pathlib | ||||||
| from collections import defaultdict | from collections import defaultdict | ||||||
| from libc.string cimport memset |  | ||||||
| 
 | 
 | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| from thinc.typedefs cimport atom_t, weight_t | from thinc.typedefs cimport atom_t | ||||||
| from thinc.extra.eg cimport Example | from thinc.extra.eg cimport Example | ||||||
| from thinc.structs cimport ExampleC | from thinc.structs cimport ExampleC | ||||||
| from thinc.linear.avgtron cimport AveragedPerceptron | from thinc.linear.avgtron cimport AveragedPerceptron | ||||||
| from thinc.linalg cimport VecVec | from thinc.linalg cimport VecVec | ||||||
| 
 | 
 | ||||||
| from .typedefs cimport attr_t |  | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .attrs cimport TAG | from .attrs cimport TAG | ||||||
| from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON |  | ||||||
| from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE |  | ||||||
| from .gold cimport GoldParse | from .gold cimport GoldParse | ||||||
| 
 | 
 | ||||||
| from .attrs cimport * | from .attrs cimport * | ||||||
|  |  | ||||||
|  | @ -1,13 +1,10 @@ | ||||||
| # cython: embedsignature=True | # cython: embedsignature=True | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import re |  | ||||||
| import pathlib | import pathlib | ||||||
| 
 | 
 | ||||||
| from cython.operator cimport dereference as deref | from cython.operator cimport dereference as deref | ||||||
| from cython.operator cimport preincrement as preinc | from cython.operator cimport preincrement as preinc | ||||||
| from cpython cimport Py_UNICODE_ISSPACE |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|     import ujson as json |     import ujson as json | ||||||
|  |  | ||||||
|  | @ -8,10 +8,8 @@ import os.path | ||||||
| import pathlib | import pathlib | ||||||
| import sys | import sys | ||||||
| 
 | 
 | ||||||
| import six |  | ||||||
| import textwrap | import textwrap | ||||||
| 
 | 
 | ||||||
| from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE |  | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|     basestring |     basestring | ||||||
|  | @ -19,6 +17,12 @@ except NameError: | ||||||
|     basestring = str |     basestring = str | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | try: | ||||||
|  |     raw_input | ||||||
|  | except NameError: # Python 3 | ||||||
|  |     raw_input = input | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| LANGUAGES = {} | LANGUAGES = {} | ||||||
| _data_path = pathlib.Path(__file__).parent / 'data' | _data_path = pathlib.Path(__file__).parent / 'data' | ||||||
| 
 | 
 | ||||||
|  | @ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True): | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def get_raw_input(description, default=False): | ||||||
|  |     """Get user input via raw_input / input and return input value. Takes a | ||||||
|  |     description for the prompt, and an optional default value that's displayed | ||||||
|  |     with the prompt.""" | ||||||
|  | 
 | ||||||
|  |     additional = ' (default: {d})'.format(d=default) if default else '' | ||||||
|  |     prompt = '    {d}{a}: '.format(d=description, a=additional) | ||||||
|  |     user_input = raw_input(prompt) | ||||||
|  |     return user_input | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def print_table(data, **kwargs): | def print_table(data, **kwargs): | ||||||
|     """Print data in table format. Can either take a list of tuples or a |     """Print data in table format. Can either take a list of tuples or a | ||||||
|     dictionary, which will be converted to a list of tuples.""" |     dictionary, which will be converted to a list of tuples.""" | ||||||
|  |  | ||||||
|  | @ -44,7 +44,7 @@ $color-red: #d9515d | ||||||
| $color-green: #3ec930 | $color-green: #3ec930 | ||||||
| $color-yellow: #f4c025 | $color-yellow: #f4c025 | ||||||
| 
 | 
 | ||||||
| $syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 ) | $syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 ) | ||||||
| 
 | 
 | ||||||
| $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat | $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat | ||||||
| $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat | $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat | ||||||
|  |  | ||||||
|  | @ -103,3 +103,38 @@ p | ||||||
|         +cell #[code --help], #[code -h] |         +cell #[code --help], #[code -h] | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
|  | 
 | ||||||
|  | +h(2, "package") Package | ||||||
|  |     +tag experimental | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Generate a #[+a("/docs/usage/models#own-models") model Python package] | ||||||
|  |     |  from an existing model data directory. All data files are copied over, | ||||||
|  |     |  and the meta data can be entered directly from the command line. While | ||||||
|  |     |  this feature is still experimental, the required file templates are | ||||||
|  |     |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. | ||||||
|  |     |  This means you need to be connected to the internet to use this command. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package [input_dir] [output_dir] [--force] | ||||||
|  | 
 | ||||||
|  | +table(["Argument", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code input_dir] | ||||||
|  |         +cell positional | ||||||
|  |         +cell Path to directory containing model data. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code output_dir] | ||||||
|  |         +cell positional | ||||||
|  |         +cell Directory to create package folder in. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --force], #[code -f] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Force overwriting of existing folder in output directory. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --help], #[code -h] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Show help message and available arguments. | ||||||
|  |  | ||||||
|  | @ -14,9 +14,12 @@ p | ||||||
|     |  model name. |     |  model name. | ||||||
| 
 | 
 | ||||||
| +infobox("Important note") | +infobox("Important note") | ||||||
|     |  Due to improvements in the English lemmatizer in v1.7.0, you need to download the |     |  Due to improvements in the English lemmatizer in v1.7.0, you need to | ||||||
|     |  new English model. The German model is still compatible and will be |     |  #[strong download the new English models]. The German model is still | ||||||
|     |  recognised and linked automatically. |     |  compatible. If you've trained statistical models that use spaCy's | ||||||
|  |     |  annotations, you should #[strong retrain your models after updating spaCy]. | ||||||
|  |     |  If you don't retrain your models, you may suffer train/test skew, which | ||||||
|  |     |  might decrease your accuracy. | ||||||
| 
 | 
 | ||||||
| +aside-code("Quickstart"). | +aside-code("Quickstart"). | ||||||
|     # Install spaCy and download English model |     # Install spaCy and download English model | ||||||
|  | @ -235,7 +238,11 @@ p | ||||||
|     |  #[+a("/docs/usage/adding-languages") additional languages], you can |     |  #[+a("/docs/usage/adding-languages") additional languages], you can | ||||||
|     |  create a shortcut link for it by pointing #[code spacy.link] to the |     |  create a shortcut link for it by pointing #[code spacy.link] to the | ||||||
|     |  model's data directory. To allow your model to be downloaded and |     |  model's data directory. To allow your model to be downloaded and | ||||||
|     |  installed via pip, you'll also need to generate a package for it. |     |  installed via pip, you'll also need to generate a package for it. You can | ||||||
|  |     |  do this manually, or via the new | ||||||
|  |     |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will | ||||||
|  |     |  create all required files, and walk you through generating the meta data. | ||||||
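As a quick illustration of the workflow described above: once the generated package has been installed via pip, or a shortcut link has been created for the data directory, the model can be loaded by that name. The model name below is a placeholder.

    # Hedged sketch, assuming a package or shortcut link named 'en_my_model' exists.
    import spacy
    nlp = spacy.load('en_my_model')
    doc = nlp(u'This is a sentence.')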
|  | 
 | ||||||
| 
 | 
 | ||||||
| +infobox("Important note") | +infobox("Important note") | ||||||
|     |  The model packages are #[strong not suitable] for the public |     |  The model packages are #[strong not suitable] for the public | ||||||
|  |  | ||||||