Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit 9c9cd99144
				|  | @ -1,130 +0,0 @@ | |||
| #!/usr/bin/env python | ||||
| from __future__ import division | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import os | ||||
| from os import path | ||||
| import shutil | ||||
| import codecs | ||||
| import random | ||||
| import time | ||||
| import gzip | ||||
| 
 | ||||
| import plac | ||||
| import cProfile | ||||
| import pstats | ||||
| 
 | ||||
| import spacy.util | ||||
| from spacy.en import English | ||||
| from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir | ||||
| 
 | ||||
| from spacy.syntax.parser import GreedyParser | ||||
| from spacy.syntax.parser import OracleError | ||||
| from spacy.syntax.util import Config | ||||
| 
 | ||||
| 
 | ||||
| def is_punct_label(label): | ||||
|     return label == 'P' or label.lower() == 'punct' | ||||
| 
 | ||||
| 
 | ||||
| def read_gold(file_): | ||||
|     """Read a standard CoNLL/MALT-style format""" | ||||
|     sents = [] | ||||
|     for sent_str in file_.read().strip().split('\n\n'): | ||||
|         ids = [] | ||||
|         words = [] | ||||
|         heads = [] | ||||
|         labels = [] | ||||
|         tags = [] | ||||
|         for i, line in enumerate(sent_str.split('\n')): | ||||
|             id_, word, pos_string, head_idx, label = _parse_line(line) | ||||
|             words.append(word) | ||||
|             if head_idx == -1: | ||||
|                 head_idx = i | ||||
|             ids.append(id_) | ||||
|             heads.append(head_idx) | ||||
|             labels.append(label) | ||||
|             tags.append(pos_string) | ||||
|         text = ' '.join(words) | ||||
|         sents.append((text, [words], ids, words, tags, heads, labels)) | ||||
|     return sents | ||||
| 
 | ||||
| 
 | ||||
| def _parse_line(line): | ||||
|     pieces = line.split() | ||||
|     id_ = int(pieces[0]) | ||||
|     word = pieces[1] | ||||
|     pos = pieces[3] | ||||
|     head_idx = int(pieces[6]) | ||||
|     label = pieces[7] | ||||
|     return id_, word, pos, head_idx, label | ||||
| 
 | ||||
|          | ||||
| def iter_data(paragraphs, tokenizer, gold_preproc=False): | ||||
|     for raw, tokenized, ids, words, tags, heads, labels in paragraphs: | ||||
|         assert len(words) == len(heads) | ||||
|         for words in tokenized: | ||||
|             sent_ids = ids[:len(words)] | ||||
|             sent_tags = tags[:len(words)] | ||||
|             sent_heads = heads[:len(words)] | ||||
|             sent_labels = labels[:len(words)] | ||||
|             sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) | ||||
|             tokens = tokenizer.tokens_from_list(words) | ||||
|             yield tokens, sent_tags, sent_heads, sent_labels | ||||
|             ids = ids[len(words):] | ||||
|             tags = tags[len(words):] | ||||
|             heads = heads[len(words):] | ||||
|             labels = labels[len(words):] | ||||
| 
 | ||||
| 
 | ||||
| def _map_indices_to_tokens(ids, heads): | ||||
|     mapped = [] | ||||
|     for head in heads: | ||||
|         if head not in ids: | ||||
|             mapped.append(None) | ||||
|         else: | ||||
|             mapped.append(ids.index(head)) | ||||
|     return mapped | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def evaluate(Language, dev_loc, model_dir): | ||||
|     global loss | ||||
|     nlp = Language() | ||||
|     n_corr = 0 | ||||
|     pos_corr = 0 | ||||
|     n_tokens = 0 | ||||
|     total = 0 | ||||
|     skipped = 0 | ||||
|     loss = 0 | ||||
|     with codecs.open(dev_loc, 'r', 'utf8') as file_: | ||||
|         paragraphs = read_gold(file_) | ||||
|     for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer): | ||||
|         assert len(tokens) == len(labels) | ||||
|         nlp.tagger.tag_from_strings(tokens, tag_strs) | ||||
|         nlp.parser(tokens) | ||||
|         for i, token in enumerate(tokens): | ||||
|             try: | ||||
|                 pos_corr += token.tag_ == tag_strs[i] | ||||
|             except: | ||||
|                 print i, token.orth_, token.tag | ||||
|                 raise | ||||
|             n_tokens += 1 | ||||
|             if heads[i] is None: | ||||
|                 skipped += 1 | ||||
|                 continue | ||||
|             if is_punct_label(labels[i]): | ||||
|                 continue | ||||
|             n_corr += token.head.i == heads[i] | ||||
|             total += 1 | ||||
|     print loss, skipped, (loss+skipped + total) | ||||
|     print pos_corr / n_tokens | ||||
|     return float(n_corr) / (total + loss) | ||||
| 
 | ||||
| 
 | ||||
| def main(dev_loc, model_dir): | ||||
|     print evaluate(English, dev_loc, model_dir) | ||||
|      | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     plac.call(main) | ||||
|  | @ -1,261 +0,0 @@ | |||
| #!/usr/bin/env python | ||||
| from __future__ import division | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import os | ||||
| from os import path | ||||
| import shutil | ||||
| import codecs | ||||
| import random | ||||
| 
 | ||||
| import plac | ||||
| import cProfile | ||||
| import pstats | ||||
| import re | ||||
| 
 | ||||
| import spacy.util | ||||
| from spacy.en import English | ||||
| from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir | ||||
| 
 | ||||
| from spacy.syntax.util import Config | ||||
| from spacy.gold import read_json_file | ||||
| from spacy.gold import GoldParse | ||||
| 
 | ||||
| from spacy.scorer import Scorer | ||||
| 
 | ||||
| from spacy.syntax.parser import Parser, get_templates | ||||
| from spacy._theano import TheanoModel | ||||
| 
 | ||||
| import theano | ||||
| import theano.tensor as T | ||||
| 
 | ||||
| from theano.printing import Print | ||||
| 
 | ||||
| import numpy | ||||
| from collections import OrderedDict, defaultdict | ||||
| 
 | ||||
| 
 | ||||
| theano.config.profile = False | ||||
| theano.config.floatX = 'float32' | ||||
| floatX = theano.config.floatX | ||||
| 
 | ||||
| 
 | ||||
| def L1(L1_reg, *weights): | ||||
|     return L1_reg * sum(abs(w).sum() for w in weights) | ||||
| 
 | ||||
| 
 | ||||
| def L2(L2_reg, *weights): | ||||
|     return L2_reg * sum((w ** 2).sum() for w in weights) | ||||
| 
 | ||||
| 
 | ||||
| def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6): | ||||
|     updates = OrderedDict() | ||||
|     for param in params: | ||||
|         value = param.get_value(borrow=True) | ||||
|         accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), | ||||
|                              broadcastable=param.broadcastable) | ||||
| 
 | ||||
|         grad = T.grad(loss, param) | ||||
|         accu_new = rho * accu + (1 - rho) * grad ** 2 | ||||
|         updates[accu] = accu_new | ||||
|         updates[param] = param - (eta * grad / T.sqrt(accu_new + eps)) | ||||
|     return updates | ||||
| 
 | ||||
| 
 | ||||
| def relu(x): | ||||
|     return x * (x > 0) | ||||
| 
 | ||||
| 
 | ||||
| def feed_layer(activation, weights, bias, input_): | ||||
|     return activation(T.dot(input_, weights) + bias) | ||||
| 
 | ||||
| 
 | ||||
| def init_weights(n_in, n_out): | ||||
|     rng = numpy.random.RandomState(1235) | ||||
|      | ||||
|     weights = numpy.asarray( | ||||
|         rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), | ||||
|         dtype=theano.config.floatX | ||||
|     ) | ||||
|     bias = numpy.zeros((n_out,), dtype=theano.config.floatX) | ||||
|     return [wrapper(weights, name='W'), wrapper(bias, name='b')] | ||||
| 
 | ||||
| 
 | ||||
| def compile_model(n_classes, n_hidden, n_in, optimizer): | ||||
|     x = T.vector('x')  | ||||
|     costs = T.ivector('costs') | ||||
|     loss = T.scalar('loss') | ||||
| 
 | ||||
|     maxent_W, maxent_b = init_weights(n_hidden, n_classes) | ||||
|     hidden_W, hidden_b = init_weights(n_in, n_hidden) | ||||
| 
 | ||||
|     # Feed the inputs forward through the network | ||||
|     p_y_given_x = feed_layer( | ||||
|                     T.nnet.softmax, | ||||
|                     maxent_W, | ||||
|                     maxent_b, | ||||
|                       feed_layer( | ||||
|                         relu, | ||||
|                         hidden_W, | ||||
|                         hidden_b, | ||||
|                         x)) | ||||
| 
 | ||||
|     loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) | ||||
| 
 | ||||
|     train_model = theano.function( | ||||
|         name='train_model', | ||||
|         inputs=[x, costs], | ||||
|         outputs=[p_y_given_x[0], T.grad(loss, x), loss], | ||||
|         updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), | ||||
|         on_unused_input='warn' | ||||
|     ) | ||||
| 
 | ||||
|     evaluate_model = theano.function( | ||||
|         name='evaluate_model', | ||||
|         inputs=[x], | ||||
|         outputs=[ | ||||
|             feed_layer( | ||||
|               T.nnet.softmax, | ||||
|               maxent_W, | ||||
|               maxent_b, | ||||
|               feed_layer( | ||||
|                 relu, | ||||
|                 hidden_W, | ||||
|                 hidden_b, | ||||
|                 x | ||||
|               ) | ||||
|             )[0] | ||||
|         ] | ||||
|     ) | ||||
|     return train_model, evaluate_model | ||||
| 
 | ||||
| 
 | ||||
| def score_model(scorer, nlp, annot_tuples, verbose=False): | ||||
|     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) | ||||
|     nlp.tagger(tokens) | ||||
|     nlp.parser(tokens) | ||||
|     gold = GoldParse(tokens, annot_tuples) | ||||
|     scorer.score(tokens, gold, verbose=verbose) | ||||
| 
 | ||||
| 
 | ||||
| def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', | ||||
|           eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, | ||||
|           seed=0, n_sents=0,  verbose=False): | ||||
| 
 | ||||
|     dep_model_dir = path.join(model_dir, 'deps') | ||||
|     pos_model_dir = path.join(model_dir, 'pos') | ||||
|     if path.exists(dep_model_dir): | ||||
|         shutil.rmtree(dep_model_dir) | ||||
|     if path.exists(pos_model_dir): | ||||
|         shutil.rmtree(pos_model_dir) | ||||
|     os.mkdir(dep_model_dir) | ||||
|     os.mkdir(pos_model_dir) | ||||
|     setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) | ||||
| 
 | ||||
|     Config.write(dep_model_dir, 'config', | ||||
|         seed=seed, | ||||
|         templates=tuple(), | ||||
|         labels=Language.ParserTransitionSystem.get_labels(gold_tuples), | ||||
|         vector_lengths=(nv_word, nv_tag, nv_label), | ||||
|         hidden_nodes=nv_hidden, | ||||
|         eta=eta, | ||||
|         mu=mu | ||||
|     ) | ||||
|    | ||||
|     # Bake-in hyper-parameters | ||||
|     optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) | ||||
|     nlp = Language(data_dir=model_dir) | ||||
|     n_classes = nlp.parser.model.n_classes | ||||
|     train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) | ||||
|     nlp.parser.model = TheanoModel(n_classes, input_spec, train, | ||||
|                                    predict, model_loc) | ||||
|   | ||||
|     if n_sents > 0: | ||||
|         gold_tuples = gold_tuples[:n_sents] | ||||
|     print "Itn.\tP.Loss\tUAS\tTag %\tToken %" | ||||
|     log_loc = path.join(model_dir, 'job.log') | ||||
|     for itn in range(n_iter): | ||||
|         scorer = Scorer() | ||||
|         loss = 0 | ||||
|         for _, sents in gold_tuples: | ||||
|             for annot_tuples, ctnt in sents: | ||||
|                 if len(annot_tuples[1]) == 1: | ||||
|                     continue | ||||
|                 score_model(scorer, nlp, annot_tuples) | ||||
|                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) | ||||
|                 nlp.tagger(tokens) | ||||
|                 gold = GoldParse(tokens, annot_tuples, make_projective=True) | ||||
|                 assert gold.is_projective | ||||
|                 loss += nlp.parser.train(tokens, gold) | ||||
|                 nlp.tagger.train(tokens, gold.tags) | ||||
|         random.shuffle(gold_tuples) | ||||
|         logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, | ||||
|                                                  scorer.tags_acc, | ||||
|                                                  scorer.token_acc) | ||||
|         print logline | ||||
|         with open(log_loc, 'aw') as file_: | ||||
|             file_.write(logline + '\n') | ||||
|     nlp.parser.model.end_training() | ||||
|     nlp.tagger.model.end_training() | ||||
|     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) | ||||
|     return nlp | ||||
| 
 | ||||
| 
 | ||||
| def evaluate(nlp, gold_tuples, gold_preproc=True): | ||||
|     scorer = Scorer() | ||||
|     for raw_text, sents in gold_tuples: | ||||
|         for annot_tuples, brackets in sents: | ||||
|             tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) | ||||
|             nlp.tagger(tokens) | ||||
|             nlp.parser(tokens) | ||||
|             gold = GoldParse(tokens, annot_tuples) | ||||
|             scorer.score(tokens, gold) | ||||
|     return scorer | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     train_loc=("Location of training file or directory"), | ||||
|     dev_loc=("Location of development file or directory"), | ||||
|     model_dir=("Location of output model directory",), | ||||
|     eval_only=("Skip training, and only evaluate", "flag", "e", bool), | ||||
|     n_sents=("Number of training sentences", "option", "n", int), | ||||
|     n_iter=("Number of training iterations", "option", "i", int), | ||||
|     verbose=("Verbose error reporting", "flag", "v", bool), | ||||
| 
 | ||||
|     nv_word=("Word vector length", "option", "W", int), | ||||
|     nv_tag=("Tag vector length", "option", "T", int), | ||||
|     nv_label=("Label vector length", "option", "L", int), | ||||
|     nv_hidden=("Hidden nodes length", "option", "H", int), | ||||
|     eta=("Learning rate", "option", "E", float), | ||||
|     mu=("Momentum", "option", "M", float), | ||||
| ) | ||||
| def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, | ||||
|          nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, | ||||
|          eta=0.1, mu=0.9, eval_only=False): | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) | ||||
| 
 | ||||
|     nlp = train(English, gold_train, model_dir, | ||||
|                feat_set='embed', | ||||
|                eta=eta, mu=mu, | ||||
|                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, | ||||
|                n_sents=n_sents, n_iter=n_iter, | ||||
|                verbose=verbose) | ||||
| 
 | ||||
|     scorer = evaluate(nlp, list(read_json_file(dev_loc))) | ||||
|      | ||||
|     print 'TOK', 100-scorer.token_acc | ||||
|     print 'POS', scorer.tags_acc | ||||
|     print 'UAS', scorer.uas | ||||
|     print 'LAS', scorer.las | ||||
| 
 | ||||
|     print 'NER P', scorer.ents_p | ||||
|     print 'NER R', scorer.ents_r | ||||
|     print 'NER F', scorer.ents_f | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     plac.call(main) | ||||
|  | @ -1,18 +1,13 @@ | |||
| from __future__ import unicode_literals | ||||
| import plac | ||||
| import json | ||||
| from os import path | ||||
| import shutil | ||||
| import os | ||||
| import random | ||||
| import io | ||||
| import pathlib | ||||
| 
 | ||||
| from spacy.tokens import Doc | ||||
| from spacy.syntax.nonproj import PseudoProjectivity | ||||
| from spacy.language import Language | ||||
| from spacy.gold import GoldParse | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.tagger import Tagger | ||||
| from spacy.pipeline import DependencyParser, BeamDependencyParser | ||||
| from spacy.syntax.parser import get_templates | ||||
|  | @ -23,7 +18,6 @@ import spacy.attrs | |||
| import io | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def read_conllx(loc, n=0): | ||||
|     with io.open(loc, 'r', encoding='utf8') as file_: | ||||
|         text = file_.read() | ||||
|  | @ -35,7 +29,8 @@ def read_conllx(loc, n=0): | |||
|                 lines.pop(0) | ||||
|             tokens = [] | ||||
|             for line in lines: | ||||
|                 id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split() | ||||
|                 id_, word, lemma, pos, tag, morph, head, dep, _1, \ | ||||
|                 _2 = line.split('\t') | ||||
|                 if '-' in id_ or '.' in id_: | ||||
|                     continue | ||||
|                 try: | ||||
|  | @ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): | |||
|         random.shuffle(train_sents) | ||||
|         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) | ||||
|         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc)) | ||||
|     nlp = Language(vocab=vocab, tagger=tagger, parser=parser) | ||||
|     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser) | ||||
|     nlp.end_training(model_dir) | ||||
|     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) | ||||
|     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) | ||||
|  |  | |||
|  | @ -5,7 +5,7 @@ import json | |||
| from pathlib import Path | ||||
| from .util import set_lang_class, get_lang_class, parse_package_meta | ||||
| from .deprecated import resolve_model_name | ||||
| from .cli.info import info | ||||
| from .cli import info | ||||
| 
 | ||||
| from . import en | ||||
| from . import de | ||||
|  | @ -49,7 +49,3 @@ def load(name, **overrides): | |||
|         overrides['path'] = model_path | ||||
| 
 | ||||
|     return cls(**overrides) | ||||
| 
 | ||||
| 
 | ||||
| def info(name, markdown): | ||||
|     info(name, markdown) | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| # coding: utf8 | ||||
| #  | ||||
| from __future__ import print_function | ||||
| # NB! This breaks in plac on Python 2!! | ||||
| #from __future__ import unicode_literals, | ||||
|  | @ -8,12 +7,13 @@ import plac | |||
| from spacy.cli import download as cli_download | ||||
| from spacy.cli import link as cli_link | ||||
| from spacy.cli import info as cli_info | ||||
| from spacy.cli import package as cli_package | ||||
| 
 | ||||
| 
 | ||||
| class CLI(object): | ||||
|     """Command-line interface for spaCy""" | ||||
| 
 | ||||
|     commands = ('download', 'link', 'info') | ||||
|     commands = ('download', 'link', 'info', 'package') | ||||
| 
 | ||||
|     @plac.annotations( | ||||
|         model=("model to download (shortcut or model name)", "positional", None, str), | ||||
|  | @ -32,8 +32,8 @@ class CLI(object): | |||
| 
 | ||||
|     @plac.annotations( | ||||
|         origin=("package name or local path to model", "positional", None, str), | ||||
|         link_name=("Name of shortuct link to create", "positional", None, str), | ||||
|         force=("Force overwriting of existing link", "flag", "f", bool) | ||||
|         link_name=("name of shortuct link to create", "positional", None, str), | ||||
|         force=("force overwriting of existing link", "flag", "f", bool) | ||||
|     ) | ||||
|     def link(self, origin, link_name, force=False): | ||||
|         """ | ||||
|  | @ -59,6 +59,21 @@ class CLI(object): | |||
|         cli_info(model, markdown) | ||||
| 
 | ||||
| 
 | ||||
|     @plac.annotations( | ||||
|         input_dir=("directory with model data", "positional", None, str), | ||||
|         output_dir=("output directory", "positional", None, str), | ||||
|         force=("force overwriting of existing folder in output directory", "flag", "f", bool) | ||||
|     ) | ||||
|     def package(self, input_dir, output_dir, force=False): | ||||
|         """ | ||||
|         Generate Python package for model data, including meta and required | ||||
|         installation files. A new directory will be created in the specified | ||||
|         output directory, and model data will be copied over. | ||||
|         """ | ||||
| 
 | ||||
|         cli_package(input_dir, output_dir, force) | ||||
| 
 | ||||
| 
 | ||||
|     def __missing__(self, name): | ||||
|         print("\n   Command %r does not exist\n" % name) | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| from libc.stdio cimport fopen, fclose, fread, fwrite, FILE | ||||
| from libc.stdio cimport fopen, fclose, fread, fwrite | ||||
| from libc.string cimport memcpy | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| from .download import download | ||||
| from .info import info | ||||
| from .link import link | ||||
| from .package import package | ||||
|  |  | |||
91  spacy/cli/package.py  (Normal file)
							|  | @ -0,0 +1,91 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import json | ||||
| import shutil | ||||
| import requests | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from .. import about | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| def package(input_dir, output_dir, force): | ||||
|     input_path = Path(input_dir) | ||||
|     output_path = Path(output_dir) | ||||
|     check_dirs(input_path, output_path) | ||||
| 
 | ||||
|     template_setup = get_template('setup.py') | ||||
|     template_manifest = get_template('MANIFEST.in') | ||||
|     template_init = get_template('en_model_name/__init__.py') | ||||
|     meta = generate_meta() | ||||
| 
 | ||||
|     model_name = meta['lang'] + '_' + meta['name'] | ||||
|     model_name_v = model_name + '-' + meta['version'] | ||||
|     main_path = output_path / model_name_v | ||||
|     package_path = main_path / model_name | ||||
| 
 | ||||
|     create_dirs(package_path, force) | ||||
|     shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix()) | ||||
|     create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) | ||||
|     create_file(main_path / 'setup.py', template_setup) | ||||
|     create_file(main_path / 'MANIFEST.in', template_manifest) | ||||
|     create_file(package_path / '__init__.py', template_init) | ||||
| 
 | ||||
|     util.print_msg( | ||||
|         main_path.as_posix(), | ||||
|         "To build the package, run `python setup.py sdist` in that directory.", | ||||
|         title="Successfully created package {p}".format(p=model_name_v)) | ||||
| 
 | ||||
| 
 | ||||
| def check_dirs(input_path, output_path): | ||||
|     if not input_path.exists(): | ||||
|         util.sys_exit(input_path.as_posix(), title="Model directory not found") | ||||
|     if not output_path.exists(): | ||||
|         util.sys_exit(output_path.as_posix(), title="Output directory not found") | ||||
| 
 | ||||
| 
 | ||||
| def create_dirs(package_path, force): | ||||
|     if package_path.exists(): | ||||
|         if force: | ||||
|             shutil.rmtree(package_path.as_posix()) | ||||
|         else: | ||||
|             util.sys_exit(package_path.as_posix(), | ||||
|                 "Please delete the directory and try again.", | ||||
|                 title="Package directory already exists") | ||||
|     Path.mkdir(package_path, parents=True) | ||||
| 
 | ||||
| 
 | ||||
| def create_file(file_path, contents): | ||||
|     file_path.touch() | ||||
|     file_path.open('w', encoding='utf-8').write(contents) | ||||
| 
 | ||||
| 
 | ||||
| def generate_meta(): | ||||
|     settings = [('lang', 'Model language', 'en'), | ||||
|                 ('name', 'Model name', 'model'), | ||||
|                 ('version', 'Model version', '0.0.0'), | ||||
|                 ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'), | ||||
|                 ('description', 'Model description', False), | ||||
|                 ('author', 'Author', False), | ||||
|                 ('email', 'Author email', False), | ||||
|                 ('url', 'Author website', False), | ||||
|                 ('license', 'License', 'CC BY-NC 3.0')] | ||||
| 
 | ||||
|     util.print_msg("Enter the package settings for your model.", title="Generating meta.json") | ||||
| 
 | ||||
|     meta = {} | ||||
|     for setting, desc, default in settings: | ||||
|         response = util.get_raw_input(desc, default) | ||||
|         meta[setting] = default if response == '' and default else response | ||||
|     return meta | ||||
| 
 | ||||
| 
 | ||||
| def get_template(filepath): | ||||
|     url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/' | ||||
|     r = requests.get(url + filepath) | ||||
|     if r.status_code != 200: | ||||
|         util.sys_exit( | ||||
|             "Couldn't fetch template files from GitHub.", | ||||
|             title="Server error ({c})".format(c=r.status_code)) | ||||
|     return r.text | ||||
|  | @ -21,7 +21,6 @@ MORPH_RULES = { | |||
|         "them":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, | ||||
| 
 | ||||
|         "mine":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, | ||||
|         "yours":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"}, | ||||
|         "his":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, | ||||
|         "hers":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem",  "Poss": "Yes", "Reflex": "Yes"}, | ||||
|         "its":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, | ||||
|  |  | |||
|  | @ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = { | |||
|     "vm.": [ | ||||
|         {ORTH: "vm.", LEMMA: "viimeksi mainittu"} | ||||
|     ], | ||||
|     "siht.": [ | ||||
|         {ORTH: "siht.", LEMMA: "sihteeri"} | ||||
|     ], | ||||
|     "srk.": [ | ||||
|         {ORTH: "srk.", LEMMA: "seurakunta"} | ||||
|     ] | ||||
|  |  | |||
|  | @ -1,16 +1,12 @@ | |||
| # cython: profile=True | ||||
| from __future__ import unicode_literals, print_function | ||||
| 
 | ||||
| import numpy | ||||
| import io | ||||
| import json | ||||
| import random | ||||
| import re | ||||
| import os | ||||
| from os import path | ||||
| 
 | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| import ujson as json | ||||
| 
 | ||||
| from .syntax import nonproj | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from __future__ import absolute_import | ||||
| from __future__ import unicode_literals | ||||
| from warnings import warn | ||||
| import pathlib | ||||
| from contextlib import contextmanager | ||||
| import shutil | ||||
|  | @ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP | |||
| from .syntax.parser import get_templates | ||||
| from .syntax.nonproj import PseudoProjectivity | ||||
| from .pipeline import DependencyParser, EntityRecognizer | ||||
| from .pipeline import BeamDependencyParser, BeamEntityRecognizer | ||||
| from .syntax.arc_eager import ArcEager | ||||
| from .syntax.ner import BiluoPushDown | ||||
| 
 | ||||
|  |  | |||
|  | @ -2,13 +2,10 @@ | |||
| # cython: infer_types=True | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from os import path | ||||
| 
 | ||||
| from .typedefs cimport attr_t | ||||
| from .typedefs cimport hash_t | ||||
| from .attrs cimport attr_id_t | ||||
| from .structs cimport TokenC, LexemeC | ||||
| from .lexeme cimport Lexeme | ||||
| from .structs cimport TokenC | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from preshed.maps cimport PreshMap | ||||
|  | @ -17,7 +14,7 @@ from libcpp.pair cimport pair | |||
| from murmurhash.mrmr cimport hash64 | ||||
| from libc.stdint cimport int32_t | ||||
| 
 | ||||
| from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE | ||||
| from .attrs cimport ID, ENT_TYPE | ||||
| from . import attrs | ||||
| from .tokens.doc cimport get_token_attr | ||||
| from .tokens.doc cimport Doc | ||||
|  |  | |||
|  | @ -1,12 +1,8 @@ | |||
| # cython: infer_types | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from os import path | ||||
| 
 | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| from .lemmatizer import Lemmatizer | ||||
| 
 | ||||
| try: | ||||
|     import ujson as json | ||||
| except ImportError: | ||||
|  |  | |||
|  | @ -2,7 +2,6 @@ from .syntax.parser cimport Parser | |||
| from .syntax.beam_parser cimport BeamParser | ||||
| from .syntax.ner cimport BiluoPushDown | ||||
| from .syntax.arc_eager cimport ArcEager | ||||
| from .vocab cimport Vocab | ||||
| from .tagger import Tagger | ||||
| 
 | ||||
| # TODO: The disorganization here is pretty embarrassing. At least it's only | ||||
|  |  | |||
|  | @ -1,20 +1,16 @@ | |||
| import json | ||||
| import pathlib | ||||
| from collections import defaultdict | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from thinc.typedefs cimport atom_t, weight_t | ||||
| from thinc.typedefs cimport atom_t | ||||
| from thinc.extra.eg cimport Example | ||||
| from thinc.structs cimport ExampleC | ||||
| from thinc.linear.avgtron cimport AveragedPerceptron | ||||
| from thinc.linalg cimport VecVec | ||||
| 
 | ||||
| from .typedefs cimport attr_t | ||||
| from .tokens.doc cimport Doc | ||||
| from .attrs cimport TAG | ||||
| from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON | ||||
| from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE | ||||
| from .gold cimport GoldParse | ||||
| 
 | ||||
| from .attrs cimport * | ||||
|  |  | |||
|  | @ -1,13 +1,10 @@ | |||
| # cython: embedsignature=True | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import re | ||||
| import pathlib | ||||
| 
 | ||||
| from cython.operator cimport dereference as deref | ||||
| from cython.operator cimport preincrement as preinc | ||||
| from cpython cimport Py_UNICODE_ISSPACE | ||||
| 
 | ||||
| 
 | ||||
| try: | ||||
|     import ujson as json | ||||
|  |  | |||
|  | @ -8,10 +8,8 @@ import os.path | |||
| import pathlib | ||||
| import sys | ||||
| 
 | ||||
| import six | ||||
| import textwrap | ||||
| 
 | ||||
| from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE | ||||
| 
 | ||||
| try: | ||||
|     basestring | ||||
|  | @ -19,6 +17,12 @@ except NameError: | |||
|     basestring = str | ||||
| 
 | ||||
| 
 | ||||
| try: | ||||
|     raw_input | ||||
| except NameError: # Python 3 | ||||
|     raw_input = input | ||||
| 
 | ||||
| 
 | ||||
| LANGUAGES = {} | ||||
| _data_path = pathlib.Path(__file__).parent / 'data' | ||||
| 
 | ||||
|  | @ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True): | |||
|         return None | ||||
| 
 | ||||
| 
 | ||||
| def get_raw_input(description, default=False): | ||||
|     """Get user input via raw_input / input and return input value. Takes a | ||||
|     description for the prompt, and an optional default value that's displayed | ||||
|     with the prompt.""" | ||||
| 
 | ||||
|     additional = ' (default: {d})'.format(d=default) if default else '' | ||||
|     prompt = '    {d}{a}: '.format(d=description, a=additional) | ||||
|     user_input = raw_input(prompt) | ||||
|     return user_input | ||||
| 
 | ||||
| 
 | ||||
| def print_table(data, **kwargs): | ||||
|     """Print data in table format. Can either take a list of tuples or a | ||||
|     dictionary, which will be converted to a list of tuples.""" | ||||
|  |  | |||
|  | @ -44,7 +44,7 @@ $color-red: #d9515d | |||
| $color-green: #3ec930 | ||||
| $color-yellow: #f4c025 | ||||
| 
 | ||||
| $syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 ) | ||||
| $syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 ) | ||||
| 
 | ||||
| $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat | ||||
| $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat | ||||
|  |  | |||
|  | @ -103,3 +103,38 @@ p | |||
|         +cell #[code --help], #[code -h] | ||||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "package") Package | ||||
|     +tag experimental | ||||
| 
 | ||||
| p | ||||
|     |  Generate a #[+a("/docs/usage/models#own-models") model Python package] | ||||
|     |  from an existing model data directory. All data files are copied over, | ||||
|     |  and the meta data can be entered directly from the command line. While | ||||
|     |  this feature is still experimental, the required file templates are | ||||
|     |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. | ||||
|     |  This means you need to be connected to the internet to use this command. | ||||
| 
 | ||||
| +code(false, "bash"). | ||||
|     python -m spacy package [input_dir] [output_dir] [--force] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code input_dir] | ||||
|         +cell positional | ||||
|         +cell Path to directory containing model data. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code output_dir] | ||||
|         +cell positional | ||||
|         +cell Directory to create package folder in. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --force], #[code -f] | ||||
|         +cell flag | ||||
|         +cell Force overwriting of existing folder in output directory. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --help], #[code -h] | ||||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
|  |  | |||
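p
    |  For reference, an end-to-end packaging session might look like the
    |  sketch below. The paths are placeholders, and the directory name
    |  #[code en_model-0.0.0] assumes the default meta values
    |  (#[code lang="en"], #[code name="model"], #[code version="0.0.0"]).

+code(false, "bash").
    # generate the package directory (you'll be prompted for the meta data)
    python -m spacy package /path/to/model/data /path/to/output

    # build a source distribution from the generated directory
    cd /path/to/output/en_model-0.0.0
    python setup.py sdist

    # the resulting archive in dist/ can then be installed via pip
    pip install dist/en_model-0.0.0.tar.gz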
|  | @ -14,9 +14,12 @@ p | |||
|     |  model name. | ||||
| 
 | ||||
| +infobox("Important note") | ||||
|     |  Due to improvements in the English lemmatizer in v1.7.0, you need to download the | ||||
|     |  new English model. The German model is still compatible and will be | ||||
|     |  recognised and linked automatically. | ||||
|     |  Due to improvements in the English lemmatizer in v1.7.0, you need to | ||||
|     |  #[strong download the new English models]. The German model is still | ||||
|     |  compatible. If you've trained statistical models that use spaCy's | ||||
|     |  annotations, you should #[strong retrain your models after updating spaCy]. | ||||
|     |  If you don't retrain your models, you may suffer train/test skew, which | ||||
|     |  might decrease your accuracy. | ||||
| 
 | ||||
| +aside-code("Quickstart"). | ||||
|     # Install spaCy and download English model | ||||
|  | @ -235,7 +238,11 @@ p | |||
|     |  #[+a("/docs/usage/adding-languages") additional languages], you can | ||||
|     |  create a shortcut link for it by pointing #[code spacy.link] to the | ||||
|     |  model's data directory. To allow your model to be downloaded and | ||||
|     |  installed via pip, you'll also need to generate a package for it. | ||||
|     |  installed via pip, you'll also need to generate a package for it. You can | ||||
|     |  do this manually, or via the new | ||||
|     |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will | ||||
|     |  create all required files, and walk you through generating the meta data. | ||||
| 
 | ||||
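p
    |  For example, linking a model directory and then packaging it could look
    |  like the following sketch (the paths and the name #[code my_model] are
    |  placeholders):

+code(false, "bash").
    # create a shortcut link so the model can be loaded by name
    python -m spacy link /path/to/model/data my_model

    # generate an installable Python package from the same data directory
    python -m spacy package /path/to/model/data /path/to/output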
| 
 | ||||
| +infobox("Important note") | ||||
|     |  The model packages are #[strong not suitable] for the public | ||||
|  |  | |||