Commit 9c9cd99144 by Matthew Honnibal, 2017-03-23 11:11:24 +01:00
21 changed files with 183 additions and 444 deletions

View File

@@ -1,130 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config


def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'


def read_gold(file_):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents


def _parse_line(line):
    pieces = line.split()
    id_ = int(pieces[0])
    word = pieces[1]
    pos = pieces[3]
    head_idx = int(pieces[6])
    label = pieces[7]
    return id_, word, pos, head_idx, label


def iter_data(paragraphs, tokenizer, gold_preproc=False):
    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
        assert len(words) == len(heads)
        for words in tokenized:
            sent_ids = ids[:len(words)]
            sent_tags = tags[:len(words)]
            sent_heads = heads[:len(words)]
            sent_labels = labels[:len(words)]
            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
            tokens = tokenizer.tokens_from_list(words)
            yield tokens, sent_tags, sent_heads, sent_labels
            ids = ids[len(words):]
            tags = tags[len(words):]
            heads = heads[len(words):]
            labels = labels[len(words):]


def _map_indices_to_tokens(ids, heads):
    mapped = []
    for head in heads:
        if head not in ids:
            mapped.append(None)
        else:
            mapped.append(ids.index(head))
    return mapped


def evaluate(Language, dev_loc, model_dir):
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except:
                print i, token.orth_, token.tag
                raise
            n_tokens += 1
            if heads[i] is None:
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    print loss, skipped, (loss+skipped + total)
    print pos_corr / n_tokens
    return float(n_corr) / (total + loss)


def main(dev_loc, model_dir):
    print evaluate(English, dev_loc, model_dir)


if __name__ == '__main__':
    plac.call(main)
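
For reference, _parse_line() above assumes the usual CoNLL/MALT column layout. A minimal, purely illustrative sketch of the fields it reads (the token values are made up):

# Illustrative only: one whitespace-separated token line in the CoNLL/MALT
# format consumed by read_gold() and _parse_line(). Columns 0, 1, 3, 6 and 7
# carry the token id, word form, POS tag, head index and dependency label.
line = "1 They they PRP PRP _ 2 nsubj _ _"
pieces = line.split()
print(int(pieces[0]), pieces[1], pieces[3], int(pieces[6]), pieces[7])
# -> 1 They PRP 2 nsubj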

View File

@@ -1,261 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel

import theano
import theano.tensor as T
from theano.printing import Print

import numpy
from collections import OrderedDict, defaultdict

theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX


def L1(L1_reg, *weights):
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    return L2_reg * sum((w ** 2).sum() for w in weights)


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates
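
# rms_prop() above implements the standard RMSProp update: the shared accumulator
# tracks a running average of squared gradients, accu_new = rho * accu + (1 - rho) * grad ** 2,
# and each parameter is moved by eta * grad / sqrt(accu_new + eps), so the effective
# step size adapts per parameter to the recent gradient magnitude.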

def relu(x):
    return x * (x > 0)


def feed_layer(activation, weights, bias, input_):
    return activation(T.dot(input_, weights) + bias)


def init_weights(n_in, n_out):
    rng = numpy.random.RandomState(1235)
    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return [wrapper(weights, name='W'), wrapper(bias, name='b')]


def compile_model(n_classes, n_hidden, n_in, optimizer):
    x = T.vector('x')
    costs = T.ivector('costs')
    loss = T.scalar('loss')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network
    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            relu,
            hidden_W,
            hidden_b,
            x))

    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[
            feed_layer(
                T.nnet.softmax,
                maxent_W,
                maxent_b,
                feed_layer(
                    relu,
                    hidden_W,
                    hidden_b,
                    x
                )
            )[0]
        ]
    )
    return train_model, evaluate_model
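
# compile_model() above builds a single-hidden-layer network over the parser's
# feature vector: a ReLU hidden layer feeding a softmax over the parser's actions.
# The loss is the negative log of the probability mass assigned to zero-cost
# actions (T.eq(costs, 0)); train_model additionally returns the gradient of the
# loss with respect to the input vector x.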

def score_model(scorer, nlp, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu
                 )

    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'aw') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp


def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):
    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)
    scorer = evaluate(nlp, list(read_json_file(dev_loc)))
    print 'TOK', 100-scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las
    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)

View File

@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
-from os import path
-import shutil
-import os
 import random
-import io
 import pathlib
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
 import io

 def read_conllx(loc, n=0):
     with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
                 lines.pop(0)
             tokens = []
             for line in lines:
-                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+                id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                    _2 = line.split('\t')
                 if '-' in id_ or '.' in id_:
                     continue
                 try:
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
         random.shuffle(train_sents)
         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

View File

@@ -5,7 +5,7 @@ import json
 from pathlib import Path
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
-from .cli.info import info
+from .cli import info
 from . import en
 from . import de
@@ -49,7 +49,3 @@ def load(name, **overrides):
     overrides['path'] = model_path
     return cls(**overrides)
-
-
-def info(name, markdown):
-    info(name, markdown)

View File

@@ -1,5 +1,4 @@
 # coding: utf8
-#
 from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals,
@@ -8,12 +7,13 @@ import plac
 from spacy.cli import download as cli_download
 from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
+from spacy.cli import package as cli_package


 class CLI(object):
     """Command-line interface for spaCy"""
-    commands = ('download', 'link', 'info')
+    commands = ('download', 'link', 'info', 'package')

     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -32,8 +32,8 @@ class CLI(object):
     @plac.annotations(
         origin=("package name or local path to model", "positional", None, str),
-        link_name=("Name of shortuct link to create", "positional", None, str),
-        force=("Force overwriting of existing link", "flag", "f", bool)
+        link_name=("name of shortuct link to create", "positional", None, str),
+        force=("force overwriting of existing link", "flag", "f", bool)
     )
     def link(self, origin, link_name, force=False):
         """
@@ -59,6 +59,21 @@ class CLI(object):
         cli_info(model, markdown)

+    @plac.annotations(
+        input_dir=("directory with model data", "positional", None, str),
+        output_dir=("output directory", "positional", None, str),
+        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+    )
+    def package(self, input_dir, output_dir, force=False):
+        """
+        Generate Python package for model data, including meta and required
+        installation files. A new directory will be created in the specified
+        output directory, and model data will be copied over.
+        """
+        cli_package(input_dir, output_dir, force)
+
     def __missing__(self, name):
         print("\n Command %r does not exist\n" % name)

View File

@@ -1,4 +1,4 @@
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy

View File

@@ -1,3 +1,4 @@
 from .download import download
 from .info import info
 from .link import link
+from .package import package

spacy/cli/package.py (new file, 91 lines)
View File

@@ -0,0 +1,91 @@
# coding: utf8
from __future__ import unicode_literals

import json
import shutil
import requests
from pathlib import Path

from .. import about
from .. import util


def package(input_dir, output_dir, force):
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    check_dirs(input_path, output_path)

    template_setup = get_template('setup.py')
    template_manifest = get_template('MANIFEST.in')
    template_init = get_template('en_model_name/__init__.py')

    meta = generate_meta()
    model_name = meta['lang'] + '_' + meta['name']
    model_name_v = model_name + '-' + meta['version']
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    create_dirs(package_path, force)
    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
    create_file(main_path / 'meta.json', json.dumps(meta, indent=2))
    create_file(main_path / 'setup.py', template_setup)
    create_file(main_path / 'MANIFEST.in', template_manifest)
    create_file(package_path / '__init__.py', template_init)

    util.print_msg(
        main_path.as_posix(),
        "To build the package, run `python setup.py sdist` in that directory.",
        title="Successfully created package {p}".format(p=model_name_v))


def check_dirs(input_path, output_path):
    if not input_path.exists():
        util.sys_exit(input_path.as_posix(), title="Model directory not found")
    if not output_path.exists():
        util.sys_exit(output_path.as_posix(), title="Output directory not found")


def create_dirs(package_path, force):
    if package_path.exists():
        if force:
            shutil.rmtree(package_path.as_posix())
        else:
            util.sys_exit(package_path.as_posix(),
                "Please delete the directory and try again.",
                title="Package directory already exists")
    Path.mkdir(package_path, parents=True)


def create_file(file_path, contents):
    file_path.touch()
    file_path.open('w', encoding='utf-8').write(contents)


def generate_meta():
    settings = [('lang', 'Model language', 'en'),
                ('name', 'Model name', 'model'),
                ('version', 'Model version', '0.0.0'),
                ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
                ('description', 'Model description', False),
                ('author', 'Author', False),
                ('email', 'Author email', False),
                ('url', 'Author website', False),
                ('license', 'License', 'CC BY-NC 3.0')]
    util.print_msg("Enter the package settings for your model.", title="Generating meta.json")
    meta = {}
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    return meta
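
# Note on the fallback above: an empty response keeps the default when one is
# set, so accepting every prompt yields roughly {'lang': 'en', 'name': 'model',
# 'version': '0.0.0', 'spacy_version': '>=1.7.0,<2.0.0', 'license': 'CC BY-NC 3.0'},
# plus empty strings for the settings whose default is False.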

def get_template(filepath):
    url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
    r = requests.get(url + filepath)
    if r.status_code != 200:
        util.sys_exit(
            "Couldn't fetch template files from GitHub.",
            title="Server error ({c})".format(c=r.status_code))
    return r.text
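
As a usage sketch (the paths here are hypothetical), the same functionality exposed on the command line can be driven from Python, since package is re-exported from spacy.cli in this commit:

# Hypothetical paths, for illustration only.
from spacy.cli import package

# Prompts for the meta.json settings, copies the model data into a new package
# directory under /tmp/packages, and writes meta.json, setup.py, MANIFEST.in and
# the package-level __init__.py from the downloaded templates.
package('/tmp/en_example_model', '/tmp/packages', force=False)
# The success message then points to `python setup.py sdist` for building the package.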

View File

@@ -21,7 +21,6 @@ MORPH_RULES = {
         "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
         "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
-        "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
         "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
         "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
         "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},

View File

@@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
     "vm.": [
         {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
     ],
-    "siht.": [
-        {ORTH: "siht.", LEMMA: "sihteeri"}
-    ],
     "srk.": [
         {ORTH: "srk.", LEMMA: "seurakunta"}
     ]

View File

@@ -1,16 +1,12 @@
 # cython: profile=True
 from __future__ import unicode_literals, print_function
-import numpy
 import io
 import json
-import random
 import re
 import os
 from os import path
-from libc.string cimport memset
 import ujson as json
 from .syntax import nonproj

View File

@@ -1,6 +1,5 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
-from warnings import warn
 import pathlib
 from contextlib import contextmanager
 import shutil
@@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
-from .pipeline import BeamDependencyParser, BeamEntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown

View File

@@ -2,13 +2,10 @@
 # cython: infer_types=True
 from __future__ import unicode_literals
-from os import path
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC, LexemeC
-from .lexeme cimport Lexeme
+from .structs cimport TokenC
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -17,7 +14,7 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
-from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
+from .attrs cimport ID, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc

View File

@@ -1,12 +1,8 @@
 # cython: infer_types
 from __future__ import unicode_literals
-from os import path
 from libc.string cimport memset
-from .lemmatizer import Lemmatizer
 try:
     import ujson as json
 except ImportError:

View File

@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .vocab cimport Vocab
 from .tagger import Tagger

 # TODO: The disorganization here is pretty embarrassing. At least it's only

View File

@@ -1,20 +1,16 @@
 import json
 import pathlib
 from collections import defaultdict
-from libc.string cimport memset
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t, weight_t
+from thinc.typedefs cimport atom_t
 from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 from .attrs cimport *

View File

@@ -1,13 +1,10 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
-import re
 import pathlib
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
-from cpython cimport Py_UNICODE_ISSPACE
 try:
     import ujson as json

View File

@@ -8,10 +8,8 @@ import os.path
 import pathlib
 import sys
-import six
 import textwrap
-from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

 try:
     basestring
@@ -19,6 +17,12 @@ except NameError:
     basestring = str

+try:
+    raw_input
+except NameError: # Python 3
+    raw_input = input
+
 LANGUAGES = {}
 _data_path = pathlib.Path(__file__).parent / 'data'
@@ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True):
     return None


+def get_raw_input(description, default=False):
+    """Get user input via raw_input / input and return input value. Takes a
+    description for the prompt, and an optional default value that's displayed
+    with the prompt."""
+    additional = ' (default: {d})'.format(d=default) if default else ''
+    prompt = ' {d}{a}: '.format(d=description, a=additional)
+    user_input = raw_input(prompt)
+    return user_input
+
+
 def print_table(data, **kwargs):
     """Print data in table format. Can either take a list of tuples or a
     dictionary, which will be converted to a list of tuples."""
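
A small sketch of how the new helper is used; generate_meta() in the new spacy/cli/package.py calls it once per setting:

# The default is shown in the prompt, e.g. "Model name (default: model): ",
# and whatever the user types is returned unchanged; an empty answer returns ''
# and the caller decides whether to fall back to the default.
from spacy import util
name = util.get_raw_input('Model name', 'model')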

View File

@@ -44,7 +44,7 @@ $color-red: #d9515d
 $color-green: #3ec930
 $color-yellow: #f4c025
-$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 )
+$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 )
 $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat
 $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat

View File

@@ -103,3 +103,38 @@ p
         +cell #[code --help], #[code -h]
         +cell flag
         +cell Show help message and available arguments.
+
++h(2, "package") Package
+    +tag experimental
+
+p
+    | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    | from an existing model data directory. All data files are copied over,
+    | and the meta data can be entered directly from the command line. While
+    | this feature is still experimental, the required file templates are
+    | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
+    | This means you need to be connected to the internet to use this command.
+
++code(false, "bash").
+    python -m spacy package [input_dir] [output_dir] [--force]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code input_dir]
+        +cell positional
+        +cell Path to directory containing model data.
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Directory to create package folder in.
+
+    +row
+        +cell #[code --force], #[code -f]
+        +cell flag
+        +cell Force overwriting of existing folder in output directory.
+
+    +row
+        +cell #[code --help], #[code -h]
+        +cell flag
+        +cell Show help message and available arguments.

View File

@@ -14,9 +14,12 @@ p
     | model name.

 +infobox("Important note")
-    | Due to improvements in the English lemmatizer in v1.7.0, you need to download the
-    | new English model. The German model is still compatible and will be
-    | recognised and linked automatically.
+    | Due to improvements in the English lemmatizer in v1.7.0, you need to
+    | #[strong download the new English models]. The German model is still
+    | compatible. If you've trained statistical models that use spaCy's
+    | annotations, you should #[strong retrain your models after updating spaCy].
+    | If you don't retrain your models, you may suffer train/test skew, which
+    | might decrease your accuracy.

 +aside-code("Quickstart").
     # Install spaCy and download English model
@@ -235,7 +238,11 @@ p
     | #[+a("/docs/usage/adding-languages") additional languages], you can
     | create a shortcut link for it by pointing #[code spacy.link] to the
     | model's data directory. To allow your model to be downloaded and
-    | installed via pip, you'll also need to generate a package for it.
+    | installed via pip, you'll also need to generate a package for it. You can
+    | do this manually, or via the new
+    | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
+    | create all required files, and walk you through generating the meta data.

 +infobox("Important note")
     | The model packages are #[strong not suitable] for the public