Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)

Commit 9c9cd99144: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -1,130 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config


def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'


def read_gold(file_):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents


def _parse_line(line):
    pieces = line.split()
    id_ = int(pieces[0])
    word = pieces[1]
    pos = pieces[3]
    head_idx = int(pieces[6])
    label = pieces[7]
    return id_, word, pos, head_idx, label


def iter_data(paragraphs, tokenizer, gold_preproc=False):
    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
        assert len(words) == len(heads)
        for words in tokenized:
            sent_ids = ids[:len(words)]
            sent_tags = tags[:len(words)]
            sent_heads = heads[:len(words)]
            sent_labels = labels[:len(words)]
            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
            tokens = tokenizer.tokens_from_list(words)
            yield tokens, sent_tags, sent_heads, sent_labels
            ids = ids[len(words):]
            tags = tags[len(words):]
            heads = heads[len(words):]
            labels = labels[len(words):]


def _map_indices_to_tokens(ids, heads):
    mapped = []
    for head in heads:
        if head not in ids:
            mapped.append(None)
        else:
            mapped.append(ids.index(head))
    return mapped


def evaluate(Language, dev_loc, model_dir):
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except:
                print i, token.orth_, token.tag
                raise
            n_tokens += 1
            if heads[i] is None:
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    print loss, skipped, (loss + skipped + total)
    print pos_corr / n_tokens
    return float(n_corr) / (total + loss)


def main(dev_loc, model_dir):
    print evaluate(English, dev_loc, model_dir)


if __name__ == '__main__':
    plac.call(main)
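For context, the `_parse_line` helper in the deleted script above assumes whitespace-separated CoNLL/MALT-style columns: token ID first, word form second, coarse POS in the fourth column, head index in the seventh and dependency label in the eighth. A minimal sketch of that column layout (the sample row and values are invented for illustration, not taken from the commit):

    # Hypothetical CoNLL-style row: ID FORM LEMMA POS TAG FEATS HEAD DEPREL
    line = "1 dogs dog NNS NNS _ 2 nsubj"
    pieces = line.split()
    id_, word = int(pieces[0]), pieces[1]
    pos, head_idx, label = pieces[3], int(pieces[6]), pieces[7]
    # -> (1, 'dogs', 'NNS', 2, 'nsubj'), the tuple _parse_line would return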
@@ -1,261 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer

from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel

import theano
import theano.tensor as T

from theano.printing import Print

import numpy
from collections import OrderedDict, defaultdict


theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX


def L1(L1_reg, *weights):
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    return L2_reg * sum((w ** 2).sum() for w in weights)


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates


def relu(x):
    return x * (x > 0)


def feed_layer(activation, weights, bias, input_):
    return activation(T.dot(input_, weights) + bias)


def init_weights(n_in, n_out):
    rng = numpy.random.RandomState(1235)

    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return [wrapper(weights, name='W'), wrapper(bias, name='b')]


def compile_model(n_classes, n_hidden, n_in, optimizer):
    x = T.vector('x')
    costs = T.ivector('costs')
    loss = T.scalar('loss')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network
    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            relu,
            hidden_W,
            hidden_b,
            x))

    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[
            feed_layer(
                T.nnet.softmax,
                maxent_W,
                maxent_b,
                feed_layer(
                    relu,
                    hidden_W,
                    hidden_b,
                    x
                )
            )[0]
        ]
    )
    return train_model, evaluate_model


def score_model(scorer, nlp, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu
                 )

    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'aw') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp


def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),

    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):

    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    print 'TOK', 100 - scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)
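The `rms_prop` helper in the deleted script is a standard RMSProp update: it keeps an exponential moving average of the squared gradients and divides each step by its square root. A plain NumPy sketch of the same arithmetic, with made-up parameter and gradient values (not part of the commit):

    import numpy as np

    eta, rho, eps = 1.0, 0.9, 1e-6
    param = np.array([0.5, -0.3])     # parameters being optimized
    grad = np.array([0.2, 0.1])       # gradient of the loss w.r.t. param
    accu = np.zeros_like(param)       # running average of squared gradients

    accu = rho * accu + (1 - rho) * grad ** 2
    param = param - eta * grad / np.sqrt(accu + eps)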
@@ -1,18 +1,13 @@
from __future__ import unicode_literals
import plac
import json
from os import path
import shutil
import os
import random
import io
import pathlib

from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, BeamDependencyParser
from spacy.syntax.parser import get_templates

@@ -23,7 +18,6 @@ import spacy.attrs
import io


def read_conllx(loc, n=0):
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()

@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
        lines.pop(0)
        tokens = []
        for line in lines:
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
            id_, word, lemma, pos, tag, morph, head, dep, _1, \
                _2 = line.split('\t')
            if '-' in id_ or '.' in id_:
                continue
            try:

@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
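The `'-' in id_ or '.' in id_` check added in that hunk skips CoNLL-U multi-word token ranges (IDs such as `1-2`) and empty nodes (IDs such as `5.1`), neither of which corresponds to a real token in the dependency tree. A tiny sketch of the filter on hypothetical IDs:

    ids = ['1', '1-2', '2', '5.1']
    kept = [id_ for id_ in ids if '-' not in id_ and '.' not in id_]
    # kept == ['1', '2']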
@@ -5,7 +5,7 @@ import json
from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta
from .deprecated import resolve_model_name
from .cli.info import info
from .cli import info

from . import en
from . import de

@@ -49,7 +49,3 @@ def load(name, **overrides):
    overrides['path'] = model_path

    return cls(**overrides)


def info(name, markdown):
    info(name, markdown)
@@ -1,5 +1,4 @@
# coding: utf8
#
from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals,

@@ -8,12 +7,13 @@ import plac
from spacy.cli import download as cli_download
from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package


class CLI(object):
    """Command-line interface for spaCy"""

    commands = ('download', 'link', 'info')
    commands = ('download', 'link', 'info', 'package')

    @plac.annotations(
        model=("model to download (shortcut or model name)", "positional", None, str),

@@ -32,8 +32,8 @@ class CLI(object):

    @plac.annotations(
        origin=("package name or local path to model", "positional", None, str),
        link_name=("Name of shortuct link to create", "positional", None, str),
        force=("Force overwriting of existing link", "flag", "f", bool)
        link_name=("name of shortuct link to create", "positional", None, str),
        force=("force overwriting of existing link", "flag", "f", bool)
    )
    def link(self, origin, link_name, force=False):
        """

@@ -59,6 +59,21 @@ class CLI(object):
        cli_info(model, markdown)

    @plac.annotations(
        input_dir=("directory with model data", "positional", None, str),
        output_dir=("output directory", "positional", None, str),
        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
    )
    def package(self, input_dir, output_dir, force=False):
        """
        Generate Python package for model data, including meta and required
        installation files. A new directory will be created in the specified
        output directory, and model data will be copied over.
        """

        cli_package(input_dir, output_dir, force)


    def __missing__(self, name):
        print("\n    Command %r does not exist\n" % name)
@@ -1,4 +1,4 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.stdio cimport fopen, fclose, fread, fwrite
from libc.string cimport memcpy
@@ -1,3 +1,4 @@
from .download import download
from .info import info
from .link import link
from .package import package
spacy/cli/package.py (new file, 91 lines)
@@ -0,0 +1,91 @@
# coding: utf8
from __future__ import unicode_literals

import json
import shutil
import requests
from pathlib import Path

from .. import about
from .. import util


def package(input_dir, output_dir, force):
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    check_dirs(input_path, output_path)

    template_setup = get_template('setup.py')
    template_manifest = get_template('MANIFEST.in')
    template_init = get_template('en_model_name/__init__.py')
    meta = generate_meta()

    model_name = meta['lang'] + '_' + meta['name']
    model_name_v = model_name + '-' + meta['version']
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    create_dirs(package_path, force)
    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
    create_file(main_path / 'meta.json', json.dumps(meta, indent=2))
    create_file(main_path / 'setup.py', template_setup)
    create_file(main_path / 'MANIFEST.in', template_manifest)
    create_file(package_path / '__init__.py', template_init)

    util.print_msg(
        main_path.as_posix(),
        "To build the package, run `python setup.py sdist` in that directory.",
        title="Successfully created package {p}".format(p=model_name_v))


def check_dirs(input_path, output_path):
    if not input_path.exists():
        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
    if not output_path.exists():
        util.sys_exit(output_path.as_posix(), title="Output directory not found")


def create_dirs(package_path, force):
    if package_path.exists():
        if force:
            shutil.rmtree(package_path.as_posix())
        else:
            util.sys_exit(package_path.as_posix(),
                "Please delete the directory and try again.",
                title="Package directory already exists")
    Path.mkdir(package_path, parents=True)


def create_file(file_path, contents):
    file_path.touch()
    file_path.open('w').write(contents, encoding='utf-8')


def generate_meta():
    settings = [('lang', 'Model language', 'en'),
                ('name', 'Model name', 'model'),
                ('version', 'Model version', '0.0.0'),
                ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
                ('description', 'Model description', False),
                ('author', 'Author', False),
                ('email', 'Author email', False),
                ('url', 'Author website', False),
                ('license', 'License', 'CC BY-NC 3.0')]

    util.print_msg("Enter the package settings for your model.", title="Generating meta.json")

    meta = {}
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    return meta


def get_template(filepath):
    url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
    r = requests.get(url + filepath)
    if r.status_code != 200:
        util.sys_exit(
            "Couldn't fetch template files from GitHub.",
            title="Server error ({c})".format(c=r.status_code))
    return r.text
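Putting the pieces of `package()` together: with the defaults offered by `generate_meta()` (lang `en`, name `model`, version `0.0.0`), the command would lay out roughly the following tree. This is a sketch inferred from the code above; the actual names depend on the meta values entered at the prompts.

    en_model-0.0.0/
        meta.json
        setup.py
        MANIFEST.in
        en_model/
            __init__.py
            en_model-0.0.0/    (copy of the input model data directory)

As the success message suggests, running `python setup.py sdist` inside the top-level directory then produces an installable archive under `dist/`.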
@@ -21,7 +21,6 @@ MORPH_RULES = {
    "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},

    "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
    "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
    "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
    "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
    "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
@@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
    "vm.": [
        {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
    ],
    "siht.": [
        {ORTH: "siht.", LEMMA: "sihteeri"}
    ],
    "srk.": [
        {ORTH: "srk.", LEMMA: "seurakunta"}
    ]
@@ -1,16 +1,12 @@
# cython: profile=True
from __future__ import unicode_literals, print_function

import numpy
import io
import json
import random
import re
import os
from os import path

from libc.string cimport memset

import ujson as json

from .syntax import nonproj
@@ -1,6 +1,5 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from warnings import warn
import pathlib
from contextlib import contextmanager
import shutil

@@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer
from .pipeline import BeamDependencyParser, BeamEntityRecognizer
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown
@@ -2,13 +2,10 @@
# cython: infer_types=True
from __future__ import unicode_literals

from os import path

from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC, LexemeC
from .lexeme cimport Lexeme
from .structs cimport TokenC

from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap

@@ -17,7 +14,7 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t

from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport ID, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
@@ -1,12 +1,8 @@
# cython: infer_types
from __future__ import unicode_literals

from os import path

from libc.string cimport memset

from .lemmatizer import Lemmatizer

try:
    import ujson as json
except ImportError:
@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .vocab cimport Vocab
from .tagger import Tagger

# TODO: The disorganization here is pretty embarrassing. At least it's only
@@ -1,20 +1,16 @@
import json
import pathlib
from collections import defaultdict
from libc.string cimport memset

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t, weight_t
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec

from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse

from .attrs cimport *
@@ -1,13 +1,10 @@
# cython: embedsignature=True
from __future__ import unicode_literals

import re
import pathlib

from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cpython cimport Py_UNICODE_ISSPACE


try:
    import ujson as json
@@ -8,10 +8,8 @@ import os.path
import pathlib
import sys

import six
import textwrap

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

try:
    basestring

@@ -19,6 +17,12 @@ except NameError:
    basestring = str


try:
    raw_input
except NameError:  # Python 3
    raw_input = input


LANGUAGES = {}
_data_path = pathlib.Path(__file__).parent / 'data'

@@ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True):
    return None


def get_raw_input(description, default=False):
    """Get user input via raw_input / input and return input value. Takes a
    description for the prompt, and an optional default value that's displayed
    with the prompt."""

    additional = ' (default: {d})'.format(d=default) if default else ''
    prompt = '    {d}{a}: '.format(d=description, a=additional)
    user_input = raw_input(prompt)
    return user_input


def print_table(data, **kwargs):
    """Print data in table format. Can either take a list of tuples or a
    dictionary, which will be converted to a list of tuples."""
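The `get_raw_input` helper added here is what `generate_meta()` in spacy/cli/package.py uses to prompt for each meta.json field. A minimal usage sketch with hypothetical values (the default handling on empty input happens in the caller, not here):

    # Shows the prompt "    Model name (default: model): " and returns whatever
    # the user types, possibly an empty string.
    name = get_raw_input('Model name', 'model')
    version = get_raw_input('Model version', '0.0.0')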
@@ -44,7 +44,7 @@ $color-red: #d9515d
$color-green: #3ec930
$color-yellow: #f4c025

$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 )
$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 )

$pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat
$pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat
@@ -103,3 +103,38 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.

+h(2, "package") Package
    +tag experimental

p
    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
    |  from an existing model data directory. All data files are copied over,
    |  and the meta data can be entered directly from the command line. While
    |  this feature is still experimental, the required file templates are
    |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
    |  This means you need to be connected to the internet to use this command.

+code(false, "bash").
    python -m spacy package [input_dir] [output_dir] [--force]

+table(["Argument", "Type", "Description"])
    +row
        +cell #[code input_dir]
        +cell positional
        +cell Path to directory containing model data.

    +row
        +cell #[code output_dir]
        +cell positional
        +cell Directory to create package folder in.

    +row
        +cell #[code --force], #[code -f]
        +cell flag
        +cell Force overwriting of existing folder in output directory.

    +row
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
@@ -14,9 +14,12 @@ p
    |  model name.

+infobox("Important note")
    |  Due to improvements in the English lemmatizer in v1.7.0, you need to download the
    |  new English model. The German model is still compatible and will be
    |  recognised and linked automatically.
    |  Due to improvements in the English lemmatizer in v1.7.0, you need to
    |  #[strong download the new English models]. The German model is still
    |  compatible. If you've trained statistical models that use spaCy's
    |  annotations, you should #[strong retrain your models after updating spaCy].
    |  If you don't retrain your models, you may suffer train/test skew, which
    |  might decrease your accuracy.

+aside-code("Quickstart").
    # Install spaCy and download English model

@@ -235,7 +238,11 @@ p
    |  #[+a("/docs/usage/adding-languages") additional languages], you can
    |  create a shortuct link for it by pointing #[code spacy.link] to the
    |  model's data directory. To allow your model to be downloaded and
    |  installed via pip, you'll also need to generate a package for it.
    |  installed via pip, you'll also need to generate a package for it. You can
    |  do this manually, or via the new
    |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
    |  create all required files, and walk you through generating the meta data.


+infobox("Important note")
    |  The model packages are #[strong not suitable] for the public