Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00

Commit 9c9cd99144: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-import time
-import gzip
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.parser import GreedyParser
-from spacy.syntax.parser import OracleError
-from spacy.syntax.util import Config
-
-
-def is_punct_label(label):
-    return label == 'P' or label.lower() == 'punct'
-
-
-def read_gold(file_):
-    """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        ids = []
-        words = []
-        heads = []
-        labels = []
-        tags = []
-        for i, line in enumerate(sent_str.split('\n')):
-            id_, word, pos_string, head_idx, label = _parse_line(line)
-            words.append(word)
-            if head_idx == -1:
-                head_idx = i
-            ids.append(id_)
-            heads.append(head_idx)
-            labels.append(label)
-            tags.append(pos_string)
-        text = ' '.join(words)
-        sents.append((text, [words], ids, words, tags, heads, labels))
-    return sents
-
-
-def _parse_line(line):
-    pieces = line.split()
-    id_ = int(pieces[0])
-    word = pieces[1]
-    pos = pieces[3]
-    head_idx = int(pieces[6])
-    label = pieces[7]
-    return id_, word, pos, head_idx, label
-
-
-def iter_data(paragraphs, tokenizer, gold_preproc=False):
-    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
-        assert len(words) == len(heads)
-        for words in tokenized:
-            sent_ids = ids[:len(words)]
-            sent_tags = tags[:len(words)]
-            sent_heads = heads[:len(words)]
-            sent_labels = labels[:len(words)]
-            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
-            tokens = tokenizer.tokens_from_list(words)
-            yield tokens, sent_tags, sent_heads, sent_labels
-            ids = ids[len(words):]
-            tags = tags[len(words):]
-            heads = heads[len(words):]
-            labels = labels[len(words):]
-
-
-def _map_indices_to_tokens(ids, heads):
-    mapped = []
-    for head in heads:
-        if head not in ids:
-            mapped.append(None)
-        else:
-            mapped.append(ids.index(head))
-    return mapped
-
-
-def evaluate(Language, dev_loc, model_dir):
-    global loss
-    nlp = Language()
-    n_corr = 0
-    pos_corr = 0
-    n_tokens = 0
-    total = 0
-    skipped = 0
-    loss = 0
-    with codecs.open(dev_loc, 'r', 'utf8') as file_:
-        paragraphs = read_gold(file_)
-    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
-        assert len(tokens) == len(labels)
-        nlp.tagger.tag_from_strings(tokens, tag_strs)
-        nlp.parser(tokens)
-        for i, token in enumerate(tokens):
-            try:
-                pos_corr += token.tag_ == tag_strs[i]
-            except:
-                print i, token.orth_, token.tag
-                raise
-            n_tokens += 1
-            if heads[i] is None:
-                skipped += 1
-                continue
-            if is_punct_label(labels[i]):
-                continue
-            n_corr += token.head.i == heads[i]
-            total += 1
-    print loss, skipped, (loss+skipped + total)
-    print pos_corr / n_tokens
-    return float(n_corr) / (total + loss)
-
-
-def main(dev_loc, model_dir):
-    print evaluate(English, dev_loc, model_dir)
-
-
-if __name__ == '__main__':
-    plac.call(main)
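
Editor's note: the removed script's _parse_line expects one token per line in whitespace-separated CoNLL/MALT-style columns, and only reads the ID, form, POS, head and label fields by position. A minimal sketch of that behaviour; the sample line is invented for illustration:

    def parse_line(line):
        # Whitespace-separated columns; the script used fields 0 (ID), 1 (form),
        # 3 (POS), 6 (head index) and 7 (dependency label).
        pieces = line.split()
        return int(pieces[0]), pieces[1], pieces[3], int(pieces[6]), pieces[7]

    sample = '1  Pierre  _  NNP  _  _  2  nsubj  _  _'   # hypothetical 10-column line
    print(parse_line(sample))  # (1, 'Pierre', 'NNP', 2, 'nsubj')
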
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-
-import plac
-import cProfile
-import pstats
-import re
-
-import spacy.util
-from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-
-from spacy.scorer import Scorer
-
-from spacy.syntax.parser import Parser, get_templates
-from spacy._theano import TheanoModel
-
-import theano
-import theano.tensor as T
-
-from theano.printing import Print
-
-import numpy
-from collections import OrderedDict, defaultdict
-
-
-theano.config.profile = False
-theano.config.floatX = 'float32'
-floatX = theano.config.floatX
-
-
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
-
-
-def L2(L2_reg, *weights):
-    return L2_reg * sum((w ** 2).sum() for w in weights)
-
-
-def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
-    updates = OrderedDict()
-    for param in params:
-        value = param.get_value(borrow=True)
-        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
-                             broadcastable=param.broadcastable)
-
-        grad = T.grad(loss, param)
-        accu_new = rho * accu + (1 - rho) * grad ** 2
-        updates[accu] = accu_new
-        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
-    return updates
-
-
-def relu(x):
-    return x * (x > 0)
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
-
-
-def init_weights(n_in, n_out):
-    rng = numpy.random.RandomState(1235)
-
-    weights = numpy.asarray(
-        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
-        dtype=theano.config.floatX
-    )
-    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [wrapper(weights, name='W'), wrapper(bias, name='b')]
-
-
-def compile_model(n_classes, n_hidden, n_in, optimizer):
-    x = T.vector('x')
-    costs = T.ivector('costs')
-    loss = T.scalar('loss')
-
-    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = init_weights(n_in, n_hidden)
-
-    # Feed the inputs forward through the network
-    p_y_given_x = feed_layer(
-        T.nnet.softmax,
-        maxent_W,
-        maxent_b,
-        feed_layer(
-            relu,
-            hidden_W,
-            hidden_b,
-            x))
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
-
-    train_model = theano.function(
-        name='train_model',
-        inputs=[x, costs],
-        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
-        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
-        on_unused_input='warn'
-    )
-
-    evaluate_model = theano.function(
-        name='evaluate_model',
-        inputs=[x],
-        outputs=[
-            feed_layer(
-                T.nnet.softmax,
-                maxent_W,
-                maxent_b,
-                feed_layer(
-                    relu,
-                    hidden_W,
-                    hidden_b,
-                    x
-                )
-            )[0]
-        ]
-    )
-    return train_model, evaluate_model
-
-
-def score_model(scorer, nlp, annot_tuples, verbose=False):
-    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
-          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
-          seed=0, n_sents=0, verbose=False):
-
-    dep_model_dir = path.join(model_dir, 'deps')
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(dep_model_dir):
-        shutil.rmtree(dep_model_dir)
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(dep_model_dir)
-    os.mkdir(pos_model_dir)
-    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
-
-    Config.write(dep_model_dir, 'config',
-                 seed=seed,
-                 templates=tuple(),
-                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
-                 vector_lengths=(nv_word, nv_tag, nv_label),
-                 hidden_nodes=nv_hidden,
-                 eta=eta,
-                 mu=mu
-                 )
-
-    # Bake-in hyper-parameters
-    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
-    nlp = Language(data_dir=model_dir)
-    n_classes = nlp.parser.model.n_classes
-    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
-    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
-                                   predict, model_loc)
-
-    if n_sents > 0:
-        gold_tuples = gold_tuples[:n_sents]
-    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
-    log_loc = path.join(model_dir, 'job.log')
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for _, sents in gold_tuples:
-            for annot_tuples, ctnt in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-                score_model(scorer, nlp, annot_tuples)
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                assert gold.is_projective
-                loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                                 scorer.tags_acc,
-                                                 scorer.token_acc)
-        print logline
-        with open(log_loc, 'aw') as file_:
-            file_.write(logline + '\n')
-    nlp.parser.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
-    return nlp
-
-
-def evaluate(nlp, gold_tuples, gold_preproc=True):
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        for annot_tuples, brackets in sents:
-            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-            nlp.tagger(tokens)
-            nlp.parser(tokens)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold)
-    return scorer
-
-
-@plac.annotations(
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-
-    nv_word=("Word vector length", "option", "W", int),
-    nv_tag=("Tag vector length", "option", "T", int),
-    nv_label=("Label vector length", "option", "L", int),
-    nv_hidden=("Hidden nodes length", "option", "H", int),
-    eta=("Learning rate", "option", "E", float),
-    mu=("Momentum", "option", "M", float),
-)
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
-         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
-         eta=0.1, mu=0.9, eval_only=False):
-
-    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
-
-    nlp = train(English, gold_train, model_dir,
-                feat_set='embed',
-                eta=eta, mu=mu,
-                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
-                n_sents=n_sents, n_iter=n_iter,
-                verbose=verbose)
-
-    scorer = evaluate(nlp, list(read_json_file(dev_loc)))
-
-    print 'TOK', 100-scorer.token_acc
-    print 'POS', scorer.tags_acc
-    print 'UAS', scorer.uas
-    print 'LAS', scorer.las
-
-    print 'NER P', scorer.ents_p
-    print 'NER R', scorer.ents_r
-    print 'NER F', scorer.ents_f
-
-
-if __name__ == '__main__':
-    plac.call(main)
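
Editor's note: compile_model in the removed script wires up one hidden layer with a ReLU activation followed by a softmax output over the parser's classes. A rough NumPy sketch of that forward pass, with illustrative shapes that are not part of the original script:

    import numpy as np

    def forward(x, hidden_W, hidden_b, maxent_W, maxent_b):
        hidden = np.maximum(0.0, x.dot(hidden_W) + hidden_b)   # feed_layer(relu, ...)
        scores = hidden.dot(maxent_W) + maxent_b               # feed_layer(T.nnet.softmax, ...)
        exps = np.exp(scores - scores.max())
        return exps / exps.sum()

    rng = np.random.RandomState(0)
    p = forward(rng.randn(4), rng.randn(4, 3), np.zeros(3), rng.randn(3, 2), np.zeros(2))
    print(p, p.sum())   # class probabilities summing to 1.0
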
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
-from os import path
-import shutil
-import os
 import random
-import io
 import pathlib
 
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
 import io
 
 
-
 def read_conllx(loc, n=0):
     with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
             lines.pop(0)
         tokens = []
         for line in lines:
-            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+            id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                _2 = line.split('\t')
             if '-' in id_ or '.' in id_:
                 continue
             try:
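
Editor's note: the change above switches the token-line parsing from splitting on any whitespace to splitting on tabs, matching the CoNLL-X/CoNLL-U convention of ten tab-separated columns, where a form field may itself contain spaces. A small illustration with an invented line:

    line = '1\tNew York\tNew York\tPROPN\tNNP\t_\t0\troot\t_\t_'
    print(len(line.split()), len(line.split('\t')))   # 12 10
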
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
         random.shuffle(train_sents)
         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
@@ -5,7 +5,7 @@ import json
 from pathlib import Path
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
-from .cli.info import info
+from .cli import info
 
 from . import en
 from . import de
@@ -49,7 +49,3 @@ def load(name, **overrides):
     overrides['path'] = model_path
 
     return cls(**overrides)
-
-
-def info(name, markdown):
-    info(name, markdown)
@@ -1,5 +1,4 @@
 # coding: utf8
-#
 from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals,
@@ -8,12 +7,13 @@ import plac
 from spacy.cli import download as cli_download
 from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
+from spacy.cli import package as cli_package
 
 
 class CLI(object):
     """Command-line interface for spaCy"""
 
-    commands = ('download', 'link', 'info')
+    commands = ('download', 'link', 'info', 'package')
 
     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -32,8 +32,8 @@ class CLI(object):
 
     @plac.annotations(
         origin=("package name or local path to model", "positional", None, str),
-        link_name=("Name of shortuct link to create", "positional", None, str),
-        force=("Force overwriting of existing link", "flag", "f", bool)
+        link_name=("name of shortuct link to create", "positional", None, str),
+        force=("force overwriting of existing link", "flag", "f", bool)
     )
     def link(self, origin, link_name, force=False):
         """
@@ -59,6 +59,21 @@ class CLI(object):
         cli_info(model, markdown)
 
 
+    @plac.annotations(
+        input_dir=("directory with model data", "positional", None, str),
+        output_dir=("output directory", "positional", None, str),
+        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+    )
+    def package(self, input_dir, output_dir, force=False):
+        """
+        Generate Python package for model data, including meta and required
+        installation files. A new directory will be created in the specified
+        output directory, and model data will be copied over.
+        """
+
+        cli_package(input_dir, output_dir, force)
+
+
     def __missing__(self, name):
         print("\n Command %r does not exist\n" % name)
 
@@ -1,4 +1,4 @@
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
 
@@ -1,3 +1,4 @@
 from .download import download
 from .info import info
 from .link import link
+from .package import package
spacy/cli/package.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import json
+import shutil
+import requests
+from pathlib import Path
+
+from .. import about
+from .. import util
+
+
+def package(input_dir, output_dir, force):
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+    check_dirs(input_path, output_path)
+
+    template_setup = get_template('setup.py')
+    template_manifest = get_template('MANIFEST.in')
+    template_init = get_template('en_model_name/__init__.py')
+    meta = generate_meta()
+
+    model_name = meta['lang'] + '_' + meta['name']
+    model_name_v = model_name + '-' + meta['version']
+    main_path = output_path / model_name_v
+    package_path = main_path / model_name
+
+    create_dirs(package_path, force)
+    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    create_file(main_path / 'meta.json', json.dumps(meta, indent=2))
+    create_file(main_path / 'setup.py', template_setup)
+    create_file(main_path / 'MANIFEST.in', template_manifest)
+    create_file(package_path / '__init__.py', template_init)
+
+    util.print_msg(
+        main_path.as_posix(),
+        "To build the package, run `python setup.py sdist` in that directory.",
+        title="Successfully created package {p}".format(p=model_name_v))
+
+
+def check_dirs(input_path, output_path):
+    if not input_path.exists():
+        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+    if not output_path.exists():
+        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+
+
+def create_dirs(package_path, force):
+    if package_path.exists():
+        if force:
+            shutil.rmtree(package_path.as_posix())
+        else:
+            util.sys_exit(package_path.as_posix(),
+                "Please delete the directory and try again.",
+                title="Package directory already exists")
+    Path.mkdir(package_path, parents=True)
+
+
+def create_file(file_path, contents):
+    file_path.touch()
+    file_path.open('w').write(contents, encoding='utf-8')
+
+
+def generate_meta():
+    settings = [('lang', 'Model language', 'en'),
+                ('name', 'Model name', 'model'),
+                ('version', 'Model version', '0.0.0'),
+                ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'),
+                ('description', 'Model description', False),
+                ('author', 'Author', False),
+                ('email', 'Author email', False),
+                ('url', 'Author website', False),
+                ('license', 'License', 'CC BY-NC 3.0')]
+
+    util.print_msg("Enter the package settings for your model.", title="Generating meta.json")
+
+    meta = {}
+    for setting, desc, default in settings:
+        response = util.get_raw_input(desc, default)
+        meta[setting] = default if response == '' and default else response
+    return meta
+
+
+def get_template(filepath):
+    url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
+    r = requests.get(url + filepath)
+    if r.status_code != 200:
+        util.sys_exit(
+            "Couldn't fetch template files from GitHub.",
+            title="Server error ({c})".format(c=r.status_code))
+    return r.text
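
Editor's note: together with the CLI changes above, the new command can also be driven from Python. A hypothetical invocation; the paths are placeholders:

    from spacy.cli import package

    # Prompts for the meta.json fields (lang, name, version, ...) and copies
    # the model data into an installable package directory.
    package('/path/to/model', '/tmp/packages', force=False)

Based on the code above, the result is a directory named {lang}_{name}-{version} containing meta.json, setup.py and MANIFEST.in, with an inner {lang}_{name}/ package holding __init__.py and a copy of the model data; running `python setup.py sdist` inside it then produces the distributable archive.
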
@@ -21,7 +21,6 @@ MORPH_RULES = {
     "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
 
     "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
-    "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
     "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
     "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
     "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
@@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = {
     "vm.": [
         {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
     ],
-    "siht.": [
-        {ORTH: "siht.", LEMMA: "sihteeri"}
-    ],
     "srk.": [
         {ORTH: "srk.", LEMMA: "seurakunta"}
     ]
@@ -1,16 +1,12 @@
 # cython: profile=True
 from __future__ import unicode_literals, print_function
 
-import numpy
 import io
 import json
-import random
 import re
 import os
 from os import path
 
-from libc.string cimport memset
-
 import ujson as json
 
 from .syntax import nonproj
@@ -1,6 +1,5 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
-from warnings import warn
 import pathlib
 from contextlib import contextmanager
 import shutil
@@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
-from .pipeline import BeamDependencyParser, BeamEntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
 
@@ -2,13 +2,10 @@
 # cython: infer_types=True
 from __future__ import unicode_literals
 
-from os import path
-
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC, LexemeC
-from .lexeme cimport Lexeme
+from .structs cimport TokenC
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
from .attrs cimport ID, ENT_TYPE
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from .tokens.doc cimport get_token_attr
|
from .tokens.doc cimport get_token_attr
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
|
|
@@ -1,12 +1,8 @@
 # cython: infer_types
 from __future__ import unicode_literals
 
-from os import path
-
 from libc.string cimport memset
 
-from .lemmatizer import Lemmatizer
-
 try:
     import ujson as json
 except ImportError:
@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .vocab cimport Vocab
 from .tagger import Tagger
 
 # TODO: The disorganization here is pretty embarrassing. At least it's only
@@ -1,20 +1,16 @@
 import json
 import pathlib
 from collections import defaultdict
-from libc.string cimport memset
 
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t, weight_t
+from thinc.typedefs cimport atom_t
 from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 
-from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 
 from .attrs cimport *
@@ -1,13 +1,10 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
-import re
 import pathlib
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
-from cpython cimport Py_UNICODE_ISSPACE
-
 
 try:
     import ujson as json
@@ -8,10 +8,8 @@ import os.path
 import pathlib
 import sys
 
-import six
 import textwrap
 
-from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 try:
     basestring
@@ -19,6 +17,12 @@ except NameError:
     basestring = str
 
 
+try:
+    raw_input
+except NameError: # Python 3
+    raw_input = input
+
+
 LANGUAGES = {}
 _data_path = pathlib.Path(__file__).parent / 'data'
 
@@ -161,6 +165,17 @@ def parse_package_meta(package_path, package, require=True):
         return None
 
 
+def get_raw_input(description, default=False):
+    """Get user input via raw_input / input and return input value. Takes a
+    description for the prompt, and an optional default value that's displayed
+    with the prompt."""
+
+    additional = ' (default: {d})'.format(d=default) if default else ''
+    prompt = ' {d}{a}: '.format(d=description, a=additional)
+    user_input = raw_input(prompt)
+    return user_input
+
+
 def print_table(data, **kwargs):
     """Print data in table format. Can either take a list of tuples or a
     dictionary, which will be converted to a list of tuples."""
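
Editor's note: a small illustration of the prompt the helper above produces; the field name and default are just examples, mirroring how generate_meta() in spacy/cli/package.py calls it:

    description, default = 'Model version', '0.0.0'
    additional = ' (default: {d})'.format(d=default) if default else ''
    print(' {d}{a}: '.format(d=description, a=additional))
    # -> " Model version (default: 0.0.0): "; generate_meta() keeps the default
    #    when the reply is empty.
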
@@ -44,7 +44,7 @@ $color-red: #d9515d
 $color-green: #3ec930
 $color-yellow: #f4c025
 
-$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 )
+$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 )
 
 $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat
 $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat
@@ -103,3 +103,38 @@ p
     +cell #[code --help], #[code -h]
     +cell flag
     +cell Show help message and available arguments.
+
++h(2, "package") Package
+  +tag experimental
+
+p
+  | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+  | from an existing model data directory. All data files are copied over,
+  | and the meta data can be entered directly from the command line. While
+  | this feature is still experimental, the required file templates are
+  | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
+  | This means you need to be connected to the internet to use this command.
+
++code(false, "bash").
+    python -m spacy package [input_dir] [output_dir] [--force]
+
++table(["Argument", "Type", "Description"])
+  +row
+    +cell #[code input_dir]
+    +cell positional
+    +cell Path to directory containing model data.
+
+  +row
+    +cell #[code output_dir]
+    +cell positional
+    +cell Directory to create package folder in.
+
+  +row
+    +cell #[code --force], #[code -f]
+    +cell flag
+    +cell Force overwriting of existing folder in output directory.
+
+  +row
+    +cell #[code --help], #[code -h]
+    +cell flag
+    +cell Show help message and available arguments.
@@ -14,9 +14,12 @@ p
  | model name.
 
 +infobox("Important note")
-  | Due to improvements in the English lemmatizer in v1.7.0, you need to download the
-  | new English model. The German model is still compatible and will be
-  | recognised and linked automatically.
+  | Due to improvements in the English lemmatizer in v1.7.0, you need to
+  | #[strong download the new English models]. The German model is still
+  | compatible. If you've trained statistical models that use spaCy's
+  | annotations, you should #[strong retrain your models after updating spaCy].
+  | If you don't retrain your models, you may suffer train/test skew, which
+  | might decrease your accuracy.
 
 +aside-code("Quickstart").
     # Install spaCy and download English model
@@ -235,7 +238,11 @@ p
  | #[+a("/docs/usage/adding-languages") additional languages], you can
  | create a shortuct link for it by pointing #[code spacy.link] to the
  | model's data directory. To allow your model to be downloaded and
- | installed via pip, you'll also need to generate a package for it.
+ | installed via pip, you'll also need to generate a package for it. You can
+ | do this manually, or via the new
+ | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
+ | create all required files, and walk you through generating the meta data.
+
 
 +infobox("Important note")
  | The model packages are #[strong not suitable] for the public