diff --git a/bin/parser/conll_parse.py b/bin/parser/conll_parse.py
deleted file mode 100644
index 85a81c432..000000000
--- a/bin/parser/conll_parse.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-import time
-import gzip
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.parser import GreedyParser
-from spacy.syntax.parser import OracleError
-from spacy.syntax.util import Config
-
-
-def is_punct_label(label):
-    return label == 'P' or label.lower() == 'punct'
-
-
-def read_gold(file_):
-    """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        ids = []
-        words = []
-        heads = []
-        labels = []
-        tags = []
-        for i, line in enumerate(sent_str.split('\n')):
-            id_, word, pos_string, head_idx, label = _parse_line(line)
-            words.append(word)
-            if head_idx == -1:
-                head_idx = i
-            ids.append(id_)
-            heads.append(head_idx)
-            labels.append(label)
-            tags.append(pos_string)
-        text = ' '.join(words)
-        sents.append((text, [words], ids, words, tags, heads, labels))
-    return sents
-
-
-def _parse_line(line):
-    pieces = line.split()
-    id_ = int(pieces[0])
-    word = pieces[1]
-    pos = pieces[3]
-    head_idx = int(pieces[6])
-    label = pieces[7]
-    return id_, word, pos, head_idx, label
-
-
-def iter_data(paragraphs, tokenizer, gold_preproc=False):
-    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
-        assert len(words) == len(heads)
-        for words in tokenized:
-            sent_ids = ids[:len(words)]
-            sent_tags = tags[:len(words)]
-            sent_heads = heads[:len(words)]
-            sent_labels = labels[:len(words)]
-            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
-            tokens = tokenizer.tokens_from_list(words)
-            yield tokens, sent_tags, sent_heads, sent_labels
-            ids = ids[len(words):]
-            tags = tags[len(words):]
-            heads = heads[len(words):]
-            labels = labels[len(words):]
-
-
-def _map_indices_to_tokens(ids, heads):
-    mapped = []
-    for head in heads:
-        if head not in ids:
-            mapped.append(None)
-        else:
-            mapped.append(ids.index(head))
-    return mapped
-
-
-
-def evaluate(Language, dev_loc, model_dir):
-    global loss
-    nlp = Language()
-    n_corr = 0
-    pos_corr = 0
-    n_tokens = 0
-    total = 0
-    skipped = 0
-    loss = 0
-    with codecs.open(dev_loc, 'r', 'utf8') as file_:
-        paragraphs = read_gold(file_)
-    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
-        assert len(tokens) == len(labels)
-        nlp.tagger.tag_from_strings(tokens, tag_strs)
-        nlp.parser(tokens)
-        for i, token in enumerate(tokens):
-            try:
-                pos_corr += token.tag_ == tag_strs[i]
-            except:
-                print i, token.orth_, token.tag
-                raise
-            n_tokens += 1
-            if heads[i] is None:
-                skipped += 1
-                continue
-            if is_punct_label(labels[i]):
-                continue
-            n_corr += token.head.i == heads[i]
-            total += 1
-    print loss, skipped, (loss + skipped + total)
-    print pos_corr / n_tokens
-    return float(n_corr) / (total + loss)
-
-
-def main(dev_loc, model_dir):
-    print evaluate(English, dev_loc, model_dir)
-
-
-if __name__ == '__main__':
-    plac.call(main)
diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py
deleted file mode 100755
index 72c9e04f1..000000000
--- a/bin/parser/nn_train.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-
-import plac
-import cProfile
-import pstats
-import re
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-
-from spacy.scorer import Scorer
-
-from spacy.syntax.parser import Parser, get_templates
-from spacy._theano import TheanoModel
-
-import theano
-import theano.tensor as T
-
-from theano.printing import Print
-
-import numpy
-from collections import OrderedDict, defaultdict
-
-
-theano.config.profile = False
-theano.config.floatX = 'float32'
-floatX = theano.config.floatX
-
-
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
-
-
-def L2(L2_reg, *weights):
-    return L2_reg * sum((w ** 2).sum() for w in weights)
-
-
-def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
-    updates = OrderedDict()
-    for param in params:
-        value = param.get_value(borrow=True)
-        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
-                             broadcastable=param.broadcastable)
-
-        grad = T.grad(loss, param)
-        accu_new = rho * accu + (1 - rho) * grad ** 2
-        updates[accu] = accu_new
-        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
-    return updates
-
-
-def relu(x):
-    return x * (x > 0)
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
-
-
-def init_weights(n_in, n_out):
-    rng = numpy.random.RandomState(1235)
-
-    weights = numpy.asarray(
-        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
-        dtype=theano.config.floatX
-    )
-    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [wrapper(weights, name='W'), wrapper(bias, name='b')]
-
-
-def compile_model(n_classes, n_hidden, n_in, optimizer):
-    x = T.vector('x')
-    costs = T.ivector('costs')
-    loss = T.scalar('loss')
-
-    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = init_weights(n_in, n_hidden)
-
-    # Feed the inputs forward through the network
-    p_y_given_x = feed_layer(
-        T.nnet.softmax,
-        maxent_W,
-        maxent_b,
-        feed_layer(
-            relu,
-            hidden_W,
-            hidden_b,
-            x))
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
-
-    train_model = theano.function(
-        name='train_model',
-        inputs=[x, costs],
-        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
-        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
-        on_unused_input='warn'
-    )
-
-    evaluate_model = theano.function(
-        name='evaluate_model',
-        inputs=[x],
-        outputs=[
-            feed_layer(
-                T.nnet.softmax,
-                maxent_W,
-                maxent_b,
-                feed_layer(
-                    relu,
-                    hidden_W,
-                    hidden_b,
-                    x
-                )
-            )[0]
-        ]
-    )
-    return train_model, evaluate_model
-
-
-def score_model(scorer, nlp, annot_tuples, verbose=False):
-    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
-          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
-          seed=0, n_sents=0, verbose=False):
-
-    dep_model_dir = path.join(model_dir, 'deps')
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(dep_model_dir):
-        shutil.rmtree(dep_model_dir)
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(dep_model_dir)
-    os.mkdir(pos_model_dir)
-    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
-                    pos_model_dir)
-
-    Config.write(dep_model_dir, 'config',
-                 seed=seed,
-                 templates=tuple(),
-                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
-                 vector_lengths=(nv_word, nv_tag, nv_label),
-                 hidden_nodes=nv_hidden,
-                 eta=eta,
-                 mu=mu
-                 )
-
-    # Bake-in hyper-parameters
-    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
-    nlp = Language(data_dir=model_dir)
-    n_classes = nlp.parser.model.n_classes
-    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
-    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
-                                   predict, model_loc)
-
-    if n_sents > 0:
-        gold_tuples = gold_tuples[:n_sents]
-    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
-    log_loc = path.join(model_dir, 'job.log')
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for _, sents in gold_tuples:
-            for annot_tuples, ctnt in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-                score_model(scorer, nlp, annot_tuples)
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                assert gold.is_projective
-                loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                                 scorer.tags_acc,
-                                                 scorer.token_acc)
-        print logline
-        with open(log_loc, 'aw') as file_:
-            file_.write(logline + '\n')
-    nlp.parser.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
-    return nlp
-
-
-def evaluate(nlp, gold_tuples, gold_preproc=True):
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        for annot_tuples, brackets in sents:
-            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-            nlp.tagger(tokens)
-            nlp.parser(tokens)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold)
-    return scorer
-
-
-@plac.annotations(
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-
-    nv_word=("Word vector length", "option", "W", int),
-    nv_tag=("Tag vector length", "option", "T", int),
-    nv_label=("Label vector length", "option", "L", int),
-    nv_hidden=("Hidden nodes length", "option", "H", int),
-    eta=("Learning rate", "option", "E", float),
-    mu=("Momentum", "option", "M", float),
-)
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
-         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
-         eta=0.1, mu=0.9, eval_only=False):
-
-
-
-
-    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
-
-    nlp = train(English, gold_train, model_dir,
-                feat_set='embed',
-                eta=eta, mu=mu,
-                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
-                n_sents=n_sents, n_iter=n_iter,
-                verbose=verbose)
-
-    scorer = evaluate(nlp, list(read_json_file(dev_loc)))
-
-    print 'TOK', 100-scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
-
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
-
-
-if __name__ == '__main__':
-    plac.call(main)
diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index c87f40680..afc4491cb 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
-from os import path
-import shutil
-import os
 import random
-import io
 import pathlib
 
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
 import io
 
 
-
 def read_conllx(loc, n=0):
     with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
             lines.pop(0)
             tokens = []
             for line in lines:
-                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+                id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                    _2 = line.split('\t')
                 if '-' in id_ or '.' in id_:
                     continue
                 try:
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
         random.shuffle(train_sents)
         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 70b3363d6..80bd1c539 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,7 +5,7 @@ import json
 from pathlib import Path
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
-from .cli.info import info
+from .cli import info
 from . import en
 from . import de
@@ -49,7 +49,3 @@ def load(name, **overrides):
         overrides['path'] = model_path
     return cls(**overrides)
-
-
-def info(name, markdown):
-    info(name, markdown)
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 9addbccde..cde146cba 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -1,5 +1,4 @@
 # coding: utf8
-# from __future__ import print_function # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals,
 
 
@@ -8,12 +7,13 @@ import plac
 from spacy.cli import download as cli_download
 from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
+from spacy.cli import package as cli_package
 
 
 class CLI(object):
     """Command-line interface for spaCy"""
 
-    commands = ('download', 'link', 'info')
+    commands = ('download', 'link', 'info', 'package')
 
     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -32,8 +32,8 @@ class CLI(object):
 
     @plac.annotations(
         origin=("package name or local path to model", "positional", None, str),
-        link_name=("Name of shortuct link to create", "positional", None, str),
-        force=("Force overwriting of existing link", "flag", "f", bool)
+        link_name=("name of shortuct link to create", "positional", None, str),
+        force=("force overwriting of existing link", "flag", "f", bool)
     )
     def link(self, origin, link_name, force=False):
         """
@@ -59,6 +59,21 @@ class CLI(object):
 
         cli_info(model, markdown)
 
+    @plac.annotations(
+        input_dir=("directory with model data", "positional", None, str),
+        output_dir=("output directory", "positional", None, str),
+        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+    )
+    def package(self, input_dir, output_dir, force=False):
+        """
+        Generate Python package for model data, including meta and required
+        installation files. A new directory will be created in the specified
+        output directory, and model data will be copied over.
+        """
+
+        cli_package(input_dir, output_dir, force)
+
+
     def __missing__(self, name):
         print("\n    Command %r does not exist\n" % name)
diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx
index ceebe2e59..d5d4bf353 100644
--- a/spacy/cfile.pyx
+++ b/spacy/cfile.pyx
@@ -1,4 +1,4 @@
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
 
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 2c45b471a..2383e04b9 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -1,3 +1,4 @@
 from .download import download
 from .info import info
 from .link import link
+from .package import package
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
new file mode 100644
index 000000000..5cab2b4bc
--- /dev/null
+++ b/spacy/cli/package.py
@@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import json
+import shutil
+import requests
+from pathlib import Path
+
+from .. import about
+from .. import util
+
+
+def package(input_dir, output_dir, force):
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+    check_dirs(input_path, output_path)
+
+    template_setup = get_template('setup.py')
+    template_manifest = get_template('MANIFEST.in')
+    template_init = get_template('en_model_name/__init__.py')
+    meta = generate_meta()
+
+    model_name = meta['lang'] + '_' + meta['name']
+    model_name_v = model_name + '-' + meta['version']
+    main_path = output_path / model_name_v
+    package_path = main_path / model_name
+
+    create_dirs(package_path, force)
+    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    create_file(main_path / 'meta.json', json.dumps(meta, indent=2))
+    create_file(main_path / 'setup.py', template_setup)
+    create_file(main_path / 'MANIFEST.in', template_manifest)
+    create_file(package_path / '__init__.py', template_init)
+
+    util.print_msg(
+        main_path.as_posix(),
+        "To build the package, run `python setup.py sdist` in that directory.",
+        title="Successfully created package {p}".format(p=model_name_v))
+
+
+def check_dirs(input_path, output_path):
+    if not input_path.exists():
+        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+    if not output_path.exists():
+        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+
+
+def create_dirs(package_path, force):
+    if package_path.exists():
+        if force:
+            shutil.rmtree(package_path.as_posix())
+        else:
+            util.sys_exit(package_path.as_posix(),
+                "Please delete the directory and try again.",
+                title="Package directory already exists")
+    Path.mkdir(package_path, parents=True)
+
+
+def create_file(file_path, contents):
+    file_path.touch()
+    file_path.open('w').write(contents, encoding='utf-8')
+
+
+def generate_meta():
+    settings = [('lang', 'Model language', 'en'),
+                ('name', 'Model name', 'model'),
+                ('version', 'Model version', '0.0.0'),
+                ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
+                ('description', 'Model description', False),
+                ('author', 'Author', False),
+                ('email', 'Author email', False),
+                ('url', 'Author website', False),
+                ('license', 'License', 'CC BY-NC 3.0')]
+
+    util.print_msg("Enter the package settings for your model.", title="Generating meta.json")
+
+    meta = {}
+    for setting, desc, default in settings:
+        response = util.get_raw_input(desc, default)
+        meta[setting] = default if response == '' and default else response
+    return meta
+
+
+def get_template(filepath):
+    url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
+    r = requests.get(url + filepath)
+    if r.status_code != 200:
+        util.sys_exit(
+            "Couldn't fetch template files from GitHub.",
+            title="Server error ({c})".format(c=r.status_code))
+    return r.text
diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py
index 2b8aae823..51a50736e 100644
--- a/spacy/en/morph_rules.py
+++ b/spacy/en/morph_rules.py
@@ -21,7 +21,6 @@ MORPH_RULES = {
         "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
 
         "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
-        "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
         "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
         "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
         "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
diff --git a/spacy/fi/tokenizer_exceptions.py b/spacy/fi/tokenizer_exceptions.py
index 52ea7428a..09775a2f4 100644
--- a/spacy/fi/tokenizer_exceptions.py
+++ b/spacy/fi/tokenizer_exceptions.py
@@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
     "vm.": [
         {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
     ],
-    "siht.": [
-        {ORTH: "siht.", LEMMA: "sihteeri"}
-    ],
     "srk.": [
         {ORTH: "srk.", LEMMA: "seurakunta"}
     ]
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 358412fab..471018109 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,16 +1,12 @@
 # cython: profile=True
 from __future__ import unicode_literals, print_function
 
-import numpy
 import io
 import json
-import random
 import re
 import os
 from os import path
 
-from libc.string cimport memset
-
 import ujson as json
 
 from .syntax import nonproj
diff --git a/spacy/language.py b/spacy/language.py
index 573bb5a86..4542eae3b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,6 +1,5 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
-from warnings import warn
 import pathlib
 from contextlib import contextmanager
 import shutil
@@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
-from .pipeline import BeamDependencyParser, BeamEntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
 
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 5c52ae9d0..1883ae89a 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -2,13 +2,10 @@
 # cython: infer_types=True
 from __future__ import unicode_literals
 
-from os import path
-
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC, LexemeC
-from .lexeme cimport Lexeme
+from .structs cimport TokenC
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -17,7 +14,7 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
 
-from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
+from .attrs cimport ID, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 26405e988..e98ef1e92 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,8 @@
 # cython: infer_types
 from __future__ import unicode_literals
 
-from os import path
-
 from libc.string cimport memset
 
-from .lemmatizer import Lemmatizer
-
 try:
     import ujson as json
 except ImportError:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 59e1994a9..b2d622329 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .vocab cimport Vocab
 from .tagger import Tagger
 
 # TODO: The disorganization here is pretty embarrassing. At least it's only
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 1f6b587c5..4a2ef082a 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -1,20 +1,16 @@
 import json
 import pathlib
 from collections import defaultdict
-from libc.string cimport memset
 
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t, weight_t
+from thinc.typedefs cimport atom_t
 from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 
-from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 from .attrs cimport *
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5a4eb844a..42f090cde 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,13 +1,10 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
-import re
 import pathlib
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 
-from cpython cimport Py_UNICODE_ISSPACE
-
 try:
     import ujson as json
diff --git a/spacy/util.py b/spacy/util.py
index 49c51b436..893ba87c1 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -8,10 +8,8 @@ import os.path
 import pathlib
 import sys
 
-import six
 import textwrap
 
-from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 try:
     basestring
@@ -19,6 +17,12 @@ except NameError:
     basestring = str
 
 
+try:
+    raw_input
+except NameError:  # Python 3
+    raw_input = input
+
+
 LANGUAGES = {}
 _data_path = pathlib.Path(__file__).parent / 'data'
 
@@ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True):
     return None
 
 
+def get_raw_input(description, default=False):
+    """Get user input via raw_input / input and return input value. Takes a
+    description for the prompt, and an optional default value that's displayed
+    with the prompt."""
+
+    additional = ' (default: {d})'.format(d=default) if default else ''
+    prompt = '    {d}{a}: '.format(d=description, a=additional)
+    user_input = raw_input(prompt)
+    return user_input
+
+
 def print_table(data, **kwargs):
     """Print data in table format. Can either take a list of tuples or a
     dictionary, which will be converted to a list of tuples."""
diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass
index bfef915be..1c38d114a 100644
--- a/website/assets/css/_variables.sass
+++ b/website/assets/css/_variables.sass
@@ -44,7 +44,7 @@ $color-red: #d9515d
 $color-green: #3ec930
 $color-yellow: #f4c025
 
-$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 )
+$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 )
 
 $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat
 $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat
diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade
index 990117542..66be83923 100644
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@@ -103,3 +103,38 @@ p
         +cell #[code --help], #[code -h]
         +cell flag
         +cell Show help message and available arguments.
+
++h(2, "package") Package
+  +tag experimental
+
+p
+    | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    | from an existing model data directory. All data files are copied over,
+    | and the meta data can be entered directly from the command line. While
+    | this feature is still experimental, the required file templates are
+    | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
+    | This means you need to be connected to the internet to use this command.
+
++code(false, "bash").
+    python -m spacy package [input_dir] [output_dir] [--force]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code input_dir]
+        +cell positional
+        +cell Path to directory containing model data.
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Directory to create package folder in.
+
+    +row
+        +cell #[code --force], #[code -f]
+        +cell flag
+        +cell Force overwriting of existing folder in output directory.
+
+    +row
+        +cell #[code --help], #[code -h]
+        +cell flag
+        +cell Show help message and available arguments.
diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade
index d45d8d45e..39c271df4 100644
--- a/website/docs/usage/models.jade
+++ b/website/docs/usage/models.jade
@@ -14,9 +14,12 @@ p
     | model name.
 
 +infobox("Important note")
-    | Due to improvements in the English lemmatizer in v1.7.0, you need to download the
-    | new English model. The German model is still compatible and will be
-    | recognised and linked automatically.
+    | Due to improvements in the English lemmatizer in v1.7.0, you need to
+    | #[strong download the new English models]. The German model is still
+    | compatible. If you've trained statistical models that use spaCy's
+    | annotations, you should #[strong retrain your models after updating spaCy].
+    | If you don't retrain your models, you may suffer train/test skew, which
+    | might decrease your accuracy.
 
 +aside-code("Quickstart").
     # Install spaCy and download English model
@@ -235,7 +238,11 @@ p
     | #[+a("/docs/usage/adding-languages") additional languages], you can
     | create a shortuct link for it by pointing #[code spacy.link] to the
     | model's data directory. To allow your model to be downloaded and
-    | installed via pip, you'll also need to generate a package for it.
+    | installed via pip, you'll also need to generate a package for it. You can
+    | do this manually, or via the new
+    | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
+    | create all required files, and walk you through generating the meta data.
+
 
 +infobox("Important note")
     | The model packages are #[strong not suitable] for the public