Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00

Commit 9c9cd99144: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-import time
-import gzip
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.parser import GreedyParser
-from spacy.syntax.parser import OracleError
-from spacy.syntax.util import Config
-
-
-def is_punct_label(label):
-    return label == 'P' or label.lower() == 'punct'
-
-
-def read_gold(file_):
-    """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        ids = []
-        words = []
-        heads = []
-        labels = []
-        tags = []
-        for i, line in enumerate(sent_str.split('\n')):
-            id_, word, pos_string, head_idx, label = _parse_line(line)
-            words.append(word)
-            if head_idx == -1:
-                head_idx = i
-            ids.append(id_)
-            heads.append(head_idx)
-            labels.append(label)
-            tags.append(pos_string)
-        text = ' '.join(words)
-        sents.append((text, [words], ids, words, tags, heads, labels))
-    return sents
-
-
-def _parse_line(line):
-    pieces = line.split()
-    id_ = int(pieces[0])
-    word = pieces[1]
-    pos = pieces[3]
-    head_idx = int(pieces[6])
-    label = pieces[7]
-    return id_, word, pos, head_idx, label
-
-
-def iter_data(paragraphs, tokenizer, gold_preproc=False):
-    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
-        assert len(words) == len(heads)
-        for words in tokenized:
-            sent_ids = ids[:len(words)]
-            sent_tags = tags[:len(words)]
-            sent_heads = heads[:len(words)]
-            sent_labels = labels[:len(words)]
-            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
-            tokens = tokenizer.tokens_from_list(words)
-            yield tokens, sent_tags, sent_heads, sent_labels
-            ids = ids[len(words):]
-            tags = tags[len(words):]
-            heads = heads[len(words):]
-            labels = labels[len(words):]
-
-
-def _map_indices_to_tokens(ids, heads):
-    mapped = []
-    for head in heads:
-        if head not in ids:
-            mapped.append(None)
-        else:
-            mapped.append(ids.index(head))
-    return mapped
-
-
-def evaluate(Language, dev_loc, model_dir):
-    global loss
-    nlp = Language()
-    n_corr = 0
-    pos_corr = 0
-    n_tokens = 0
-    total = 0
-    skipped = 0
-    loss = 0
-    with codecs.open(dev_loc, 'r', 'utf8') as file_:
-        paragraphs = read_gold(file_)
-    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
-        assert len(tokens) == len(labels)
-        nlp.tagger.tag_from_strings(tokens, tag_strs)
-        nlp.parser(tokens)
-        for i, token in enumerate(tokens):
-            try:
-                pos_corr += token.tag_ == tag_strs[i]
-            except:
-                print i, token.orth_, token.tag
-                raise
-            n_tokens += 1
-            if heads[i] is None:
-                skipped += 1
-                continue
-            if is_punct_label(labels[i]):
-                continue
-            n_corr += token.head.i == heads[i]
-            total += 1
-    print loss, skipped, (loss+skipped + total)
-    print pos_corr / n_tokens
-    return float(n_corr) / (total + loss)
-
-
-def main(dev_loc, model_dir):
-    print evaluate(English, dev_loc, model_dir)
-
-
-if __name__ == '__main__':
-    plac.call(main)
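
Editor's note: the removed script's _parse_line expects one token per line in whitespace-separated CoNLL/MALT-style columns, and only reads the ID, form, POS, head and label fields by position. A minimal sketch of that behaviour; the sample line is invented for illustration:

    def parse_line(line):
        # Whitespace-separated columns; the script used fields 0 (ID), 1 (form),
        # 3 (POS), 6 (head index) and 7 (dependency label).
        pieces = line.split()
        return int(pieces[0]), pieces[1], pieces[3], int(pieces[6]), pieces[7]

    sample = '1  Pierre  _  NNP  _  _  2  nsubj  _  _'   # hypothetical 10-column line
    print(parse_line(sample))  # (1, 'Pierre', 'NNP', 2, 'nsubj')
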
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-
-import plac
-import cProfile
-import pstats
-import re
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-
-from spacy.scorer import Scorer
-
-from spacy.syntax.parser import Parser, get_templates
-from spacy._theano import TheanoModel
-
-import theano
-import theano.tensor as T
-
-from theano.printing import Print
-
-import numpy
-from collections import OrderedDict, defaultdict
-
-
-theano.config.profile = False
-theano.config.floatX = 'float32'
-floatX = theano.config.floatX
-
-
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
-
-
-def L2(L2_reg, *weights):
-    return L2_reg * sum((w ** 2).sum() for w in weights)
-
-
-def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
-    updates = OrderedDict()
-    for param in params:
-        value = param.get_value(borrow=True)
-        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
-                             broadcastable=param.broadcastable)
-
-        grad = T.grad(loss, param)
-        accu_new = rho * accu + (1 - rho) * grad ** 2
-        updates[accu] = accu_new
-        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
-    return updates
-
-
-def relu(x):
-    return x * (x > 0)
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
-
-
-def init_weights(n_in, n_out):
-    rng = numpy.random.RandomState(1235)
-
-    weights = numpy.asarray(
-        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
-        dtype=theano.config.floatX
-    )
-    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [wrapper(weights, name='W'), wrapper(bias, name='b')]
-
-
-def compile_model(n_classes, n_hidden, n_in, optimizer):
-    x = T.vector('x')
-    costs = T.ivector('costs')
-    loss = T.scalar('loss')
-
-    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = init_weights(n_in, n_hidden)
-
-    # Feed the inputs forward through the network
-    p_y_given_x = feed_layer(
-        T.nnet.softmax,
-        maxent_W,
-        maxent_b,
-        feed_layer(
-            relu,
-            hidden_W,
-            hidden_b,
-            x))
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
-
-    train_model = theano.function(
-        name='train_model',
-        inputs=[x, costs],
-        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
-        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
-        on_unused_input='warn'
-    )
-
-    evaluate_model = theano.function(
-        name='evaluate_model',
-        inputs=[x],
-        outputs=[
-            feed_layer(
-                T.nnet.softmax,
-                maxent_W,
-                maxent_b,
-                feed_layer(
-                    relu,
-                    hidden_W,
-                    hidden_b,
-                    x
-                )
-            )[0]
-        ]
-    )
-    return train_model, evaluate_model
-
-
-def score_model(scorer, nlp, annot_tuples, verbose=False):
-    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
-          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
-          seed=0, n_sents=0, verbose=False):
-
-    dep_model_dir = path.join(model_dir, 'deps')
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(dep_model_dir):
-        shutil.rmtree(dep_model_dir)
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(dep_model_dir)
-    os.mkdir(pos_model_dir)
-    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
-
-    Config.write(dep_model_dir, 'config',
-                 seed=seed,
-                 templates=tuple(),
-                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
-                 vector_lengths=(nv_word, nv_tag, nv_label),
-                 hidden_nodes=nv_hidden,
-                 eta=eta,
-                 mu=mu
-                 )
-
-    # Bake-in hyper-parameters
-    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
-    nlp = Language(data_dir=model_dir)
-    n_classes = nlp.parser.model.n_classes
-    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
-    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
-                                   predict, model_loc)
-
-    if n_sents > 0:
-        gold_tuples = gold_tuples[:n_sents]
-    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
-    log_loc = path.join(model_dir, 'job.log')
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for _, sents in gold_tuples:
-            for annot_tuples, ctnt in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-                score_model(scorer, nlp, annot_tuples)
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                assert gold.is_projective
-                loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                                 scorer.tags_acc,
-                                                 scorer.token_acc)
-        print logline
-        with open(log_loc, 'aw') as file_:
-            file_.write(logline + '\n')
-    nlp.parser.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
-    return nlp
-
-
-def evaluate(nlp, gold_tuples, gold_preproc=True):
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        for annot_tuples, brackets in sents:
-            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-            nlp.tagger(tokens)
-            nlp.parser(tokens)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold)
-    return scorer
-
-
-@plac.annotations(
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-
-    nv_word=("Word vector length", "option", "W", int),
-    nv_tag=("Tag vector length", "option", "T", int),
-    nv_label=("Label vector length", "option", "L", int),
-    nv_hidden=("Hidden nodes length", "option", "H", int),
-    eta=("Learning rate", "option", "E", float),
-    mu=("Momentum", "option", "M", float),
-)
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
-         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
-         eta=0.1, mu=0.9, eval_only=False):
-
-    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
-
-    nlp = train(English, gold_train, model_dir,
-                feat_set='embed',
-                eta=eta, mu=mu,
-                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
-                n_sents=n_sents, n_iter=n_iter,
-                verbose=verbose)
-
-    scorer = evaluate(nlp, list(read_json_file(dev_loc)))
-
-    print 'TOK', 100-scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
-
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
-
-
-if __name__ == '__main__':
-    plac.call(main)
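
Editor's note: compile_model in the removed script wires up one hidden layer with a ReLU activation followed by a softmax output over the parser's classes. A rough NumPy sketch of that forward pass, with illustrative shapes that are not part of the original script:

    import numpy as np

    def forward(x, hidden_W, hidden_b, maxent_W, maxent_b):
        hidden = np.maximum(0.0, x.dot(hidden_W) + hidden_b)   # feed_layer(relu, ...)
        scores = hidden.dot(maxent_W) + maxent_b               # feed_layer(T.nnet.softmax, ...)
        exps = np.exp(scores - scores.max())
        return exps / exps.sum()

    rng = np.random.RandomState(0)
    p = forward(rng.randn(4), rng.randn(4, 3), np.zeros(3), rng.randn(3, 2), np.zeros(2))
    print(p, p.sum())   # class probabilities summing to 1.0
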
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
-from os import path
-import shutil
-import os
 import random
-import io
 import pathlib
 
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
 import io
 
 
-
 def read_conllx(loc, n=0):
     with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
             lines.pop(0)
         tokens = []
         for line in lines:
-            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+            id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                _2 = line.split('\t')
             if '-' in id_ or '.' in id_:
                 continue
             try:
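
Editor's note: the change above switches the token-line parsing from splitting on any whitespace to splitting on tabs, matching the CoNLL-X/CoNLL-U convention of ten tab-separated columns, where a form field may itself contain spaces. A small illustration with an invented line:

    line = '1\tNew York\tNew York\tPROPN\tNNP\t_\t0\troot\t_\t_'
    print(len(line.split()), len(line.split('\t')))   # 12 10
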
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
         random.shuffle(train_sents)
         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
@@ -5,7 +5,7 @@ import json
 from pathlib import Path
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
-from .cli.info import info
+from .cli import info
 
 from . import en
 from . import de
@@ -49,7 +49,3 @@ def load(name, **overrides):
     overrides['path'] = model_path
 
     return cls(**overrides)
-
-
-def info(name, markdown):
-    info(name, markdown)
@@ -1,5 +1,4 @@
 # coding: utf8
-#
 from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals,
@@ -8,12 +7,13 @@ import plac
 from spacy.cli import download as cli_download
 from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
+from spacy.cli import package as cli_package
 
 
 class CLI(object):
     """Command-line interface for spaCy"""
 
-    commands = ('download', 'link', 'info')
+    commands = ('download', 'link', 'info', 'package')
 
     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -32,8 +32,8 @@ class CLI(object):
 
     @plac.annotations(
         origin=("package name or local path to model", "positional", None, str),
-        link_name=("Name of shortuct link to create", "positional", None, str),
-        force=("Force overwriting of existing link", "flag", "f", bool)
+        link_name=("name of shortuct link to create", "positional", None, str),
+        force=("force overwriting of existing link", "flag", "f", bool)
     )
     def link(self, origin, link_name, force=False):
         """
@@ -59,6 +59,21 @@ class CLI(object):
         cli_info(model, markdown)
 
 
+    @plac.annotations(
+        input_dir=("directory with model data", "positional", None, str),
+        output_dir=("output directory", "positional", None, str),
+        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+    )
+    def package(self, input_dir, output_dir, force=False):
+        """
+        Generate Python package for model data, including meta and required
+        installation files. A new directory will be created in the specified
+        output directory, and model data will be copied over.
+        """
+
+        cli_package(input_dir, output_dir, force)
+
+
     def __missing__(self, name):
         print("\n Command %r does not exist\n" % name)
 
@@ -1,4 +1,4 @@
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
 
@@ -1,3 +1,4 @@
 from .download import download
 from .info import info
 from .link import link
+from .package import package
spacy/cli/package.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import json
+import shutil
+import requests
+from pathlib import Path
+
+from .. import about
+from .. import util
+
+
+def package(input_dir, output_dir, force):
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+    check_dirs(input_path, output_path)
+
+    template_setup = get_template('setup.py')
+    template_manifest = get_template('MANIFEST.in')
+    template_init = get_template('en_model_name/__init__.py')
+    meta = generate_meta()
+
+    model_name = meta['lang'] + '_' + meta['name']
+    model_name_v = model_name + '-' + meta['version']
+    main_path = output_path / model_name_v
+    package_path = main_path / model_name
+
+    create_dirs(package_path, force)
+    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    create_file(main_path / 'meta.json', json.dumps(meta, indent=2))
+    create_file(main_path / 'setup.py', template_setup)
+    create_file(main_path / 'MANIFEST.in', template_manifest)
+    create_file(package_path / '__init__.py', template_init)
+
+    util.print_msg(
+        main_path.as_posix(),
+        "To build the package, run `python setup.py sdist` in that directory.",
+        title="Successfully created package {p}".format(p=model_name_v))
+
+
+def check_dirs(input_path, output_path):
+    if not input_path.exists():
+        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+    if not output_path.exists():
+        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+
+
+def create_dirs(package_path, force):
+    if package_path.exists():
+        if force:
+            shutil.rmtree(package_path.as_posix())
+        else:
+            util.sys_exit(package_path.as_posix(),
+                "Please delete the directory and try again.",
+                title="Package directory already exists")
+    Path.mkdir(package_path, parents=True)
+
+
+def create_file(file_path, contents):
+    file_path.touch()
+    file_path.open('w').write(contents, encoding='utf-8')
+
+
+def generate_meta():
+    settings = [('lang', 'Model language', 'en'),
+                ('name', 'Model name', 'model'),
+                ('version', 'Model version', '0.0.0'),
+                ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
+                ('description', 'Model description', False),
+                ('author', 'Author', False),
+                ('email', 'Author email', False),
+                ('url', 'Author website', False),
+                ('license', 'License', 'CC BY-NC 3.0')]
+
+    util.print_msg("Enter the package settings for your model.", title="Generating meta.json")
+
+    meta = {}
+    for setting, desc, default in settings:
+        response = util.get_raw_input(desc, default)
+        meta[setting] = default if response == '' and default else response
+    return meta
+
+
+def get_template(filepath):
+    url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
+    r = requests.get(url + filepath)
+    if r.status_code != 200:
+        util.sys_exit(
+            "Couldn't fetch template files from GitHub.",
+            title="Server error ({c})".format(c=r.status_code))
+    return r.text
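
Editor's note: together with the CLI changes above, the new command can also be driven from Python. A hypothetical invocation; the paths are placeholders:

    from spacy.cli import package

    # Prompts for the meta.json fields (lang, name, version, ...) and copies
    # the model data into an installable package directory.
    package('/path/to/model', '/tmp/packages', force=False)

Based on the code above, the result is a directory named {lang}_{name}-{version} containing meta.json, setup.py and MANIFEST.in, with an inner {lang}_{name}/ package holding __init__.py and a copy of the model data; running `python setup.py sdist` inside it then produces the distributable archive.
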
@@ -21,7 +21,6 @@ MORPH_RULES = {
     "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
 
     "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
-    "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
     "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
     "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
     "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
@@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
     "vm.": [
         {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
     ],
-    "siht.": [
-        {ORTH: "siht.", LEMMA: "sihteeri"}
-    ],
     "srk.": [
         {ORTH: "srk.", LEMMA: "seurakunta"}
     ]
@@ -1,16 +1,12 @@
 # cython: profile=True
 from __future__ import unicode_literals, print_function
 
-import numpy
 import io
 import json
-import random
 import re
 import os
 from os import path
 
-from libc.string cimport memset
-
 import ujson as json
 
 from .syntax import nonproj
@@ -1,6 +1,5 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
-from warnings import warn
 import pathlib
 from contextlib import contextmanager
 import shutil
@@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
-from .pipeline import BeamDependencyParser, BeamEntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
 
@@ -2,13 +2,10 @@
 # cython: infer_types=True
 from __future__ import unicode_literals
 
-from os import path
-
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC, LexemeC
-from .lexeme cimport Lexeme
+from .structs cimport TokenC
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
from .attrs cimport ID, ENT_TYPE
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
|
|
@@ -1,12 +1,8 @@
 # cython: infer_types
 from __future__ import unicode_literals
 
-from os import path
-
 from libc.string cimport memset
 
-from .lemmatizer import Lemmatizer
-
 try:
     import ujson as json
 except ImportError:
@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .vocab cimport Vocab
 from .tagger import Tagger
 
 # TODO: The disorganization here is pretty embarrassing. At least it's only
@@ -1,20 +1,16 @@
 import json
 import pathlib
 from collections import defaultdict
-from libc.string cimport memset
 
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t, weight_t
+from thinc.typedefs cimport atom_t
 from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 
-from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 
 from .attrs cimport *
@@ -1,13 +1,10 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
-import re
 import pathlib
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
-from cpython cimport Py_UNICODE_ISSPACE
-
 
 try:
     import ujson as json
@@ -8,10 +8,8 @@ import os.path
 import pathlib
 import sys
 
-import six
 import textwrap
 
-from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 try:
     basestring
@@ -19,6 +17,12 @@ except NameError:
     basestring = str
 
 
+try:
+    raw_input
+except NameError: # Python 3
+    raw_input = input
+
+
 LANGUAGES = {}
 _data_path = pathlib.Path(__file__).parent / 'data'
 
@@ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True):
         return None
 
 
+def get_raw_input(description, default=False):
+    """Get user input via raw_input / input and return input value. Takes a
+    description for the prompt, and an optional default value that's displayed
+    with the prompt."""
+
+    additional = ' (default: {d})'.format(d=default) if default else ''
+    prompt = ' {d}{a}: '.format(d=description, a=additional)
+    user_input = raw_input(prompt)
+    return user_input
+
+
 def print_table(data, **kwargs):
     """Print data in table format. Can either take a list of tuples or a
     dictionary, which will be converted to a list of tuples."""
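
Editor's note: a small illustration of the prompt the helper above produces; the field name and default are just examples, mirroring how generate_meta() in spacy/cli/package.py calls it:

    description, default = 'Model version', '0.0.0'
    additional = ' (default: {d})'.format(d=default) if default else ''
    print(' {d}{a}: '.format(d=description, a=additional))
    # -> " Model version (default: 0.0.0): "; generate_meta() keeps the default
    #    when the reply is empty.
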
@@ -44,7 +44,7 @@ $color-red: #d9515d
 $color-green: #3ec930
 $color-yellow: #f4c025
 
-$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 )
+$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 )
 
 $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat
 $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat
@@ -103,3 +103,38 @@ p
     +cell #[code --help], #[code -h]
     +cell flag
     +cell Show help message and available arguments.
+
++h(2, "package") Package
+  +tag experimental
+
+p
+  | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+  | from an existing model data directory. All data files are copied over,
+  | and the meta data can be entered directly from the command line. While
+  | this feature is still experimental, the required file templates are
+  | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
+  | This means you need to be connected to the internet to use this command.
+
++code(false, "bash").
+    python -m spacy package [input_dir] [output_dir] [--force]
+
++table(["Argument", "Type", "Description"])
+  +row
+    +cell #[code input_dir]
+    +cell positional
+    +cell Path to directory containing model data.
+
+  +row
+    +cell #[code output_dir]
+    +cell positional
+    +cell Directory to create package folder in.
+
+  +row
+    +cell #[code --force], #[code -f]
+    +cell flag
+    +cell Force overwriting of existing folder in output directory.
+
+  +row
+    +cell #[code --help], #[code -h]
+    +cell flag
+    +cell Show help message and available arguments.
@@ -14,9 +14,12 @@ p
  | model name.
 
 +infobox("Important note")
-  | Due to improvements in the English lemmatizer in v1.7.0, you need to download the
-  | new English model. The German model is still compatible and will be
-  | recognised and linked automatically.
+  | Due to improvements in the English lemmatizer in v1.7.0, you need to
+  | #[strong download the new English models]. The German model is still
+  | compatible. If you've trained statistical models that use spaCy's
+  | annotations, you should #[strong retrain your models after updating spaCy].
+  | If you don't retrain your models, you may suffer train/test skew, which
+  | might decrease your accuracy.
 
 +aside-code("Quickstart").
     # Install spaCy and download English model
@@ -235,7 +238,11 @@ p
  | #[+a("/docs/usage/adding-languages") additional languages], you can
  | create a shortuct link for it by pointing #[code spacy.link] to the
  | model's data directory. To allow your model to be downloaded and
- | installed via pip, you'll also need to generate a package for it.
+ | installed via pip, you'll also need to generate a package for it. You can
+ | do this manually, or via the new
+ | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
+ | create all required files, and walk you through generating the meta data.
+
 
 +infobox("Important note")
  | The model packages are #[strong not suitable] for the public