	Merge branch 'master' into develop
Commit: ad74245be9
@@ -10,7 +10,7 @@ open-source software, released under the MIT license.
 
 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
 
-💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
 .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
     :target: https://travis-ci.org/explosion/spaCy
@@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests:
 =========== ============== ===========
 Version     Date           Description
 =========== ============== ===========
+`v1.8.0`_   ``2017-04-16`` Better NER training, saving and loading
 `v1.7.5`_   ``2017-04-07`` Bug fixes and new CLI commands
 `v1.7.3`_   ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes
 `v1.7.2`_   ``2017-03-20`` Small fixes to beam parser and model linking
@@ -350,6 +351,7 @@ Version     Date           Description
 `v0.93`_    ``2015-09-22`` Bug fixes to word vectors
 =========== ============== ===========
 
+.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0
 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5
 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3
 .. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2

@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
 
 
 def iter_texts_from_json_bz2(loc):
-    '''
+    """
     Iterator of unicode strings, one per document (here, a comment).
 
     Expects a path to a BZ2 file, which should be newline-delimited JSON. The
     document text should be in a string field titled 'body'.
 
     This is the data format of the Reddit comments corpus.
-    '''
+    """
     with bz2.BZ2File(loc) as file_:
         for i, line in enumerate(file_):
             yield ujson.loads(line)['body']
@@ -80,7 +81,7 @@ def is_sent_begin(word):
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
     if not path.exists(out_dir):
         path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])

@@ -1,22 +1,45 @@
+#!/usr/bin/env python
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
+# coding: utf8
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
+from pathlib import Path
 
 import spacy
 from spacy.pipeline import EntityRecognizer
 from spacy.gold import GoldParse
 from spacy.tagger import Tagger
 
-
-try:
-    unicode
-except:
-    unicode = str
-
 
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab.
+    # Add new words to vocab
     for raw_text, _ in train_data:
         doc = nlp.make_doc(raw_text)
         for word in doc:
@@ -30,11 +53,14 @@ def train_ner(nlp, train_data, output_dir):
             nlp.tagger(doc)
             loss = nlp.entity.update(doc, gold)
     nlp.end_training()
-    nlp.save_to_directory(output_dir)
+    if output_dir:
+        nlp.save_to_directory(output_dir)
 
 
 def main(model_name, output_directory=None):
     nlp = spacy.load(model_name)
+    if output_directory is not None:
+        output_directory = Path(output_directory)
 
     train_data = [
         (
@@ -55,18 +81,18 @@ def main(model_name, output_directory=None):
         )
     ]
     nlp.entity.add_label('ANIMAL')
-    if output_directory is not None:
-        output_directory = pathlib.Path(output_directory)
     ner = train_ner(nlp, train_data, output_directory)
 
+    # Test that the entity is recognized
     doc = nlp('Do you like horses?')
     for ent in doc.ents:
         print(ent.label_, ent.text)
-    nlp2 = spacy.load('en', path=output_directory)
-    nlp2.entity.add_label('ANIMAL')
-    doc2 = nlp2('Do you like horses?')
-    for ent in doc2.ents:
-        print(ent.label_, ent.text)
+    if output_directory:
+        nlp2 = spacy.load('en', path=output_directory)
+        nlp2.entity.add_label('ANIMAL')
+        doc2 = nlp2('Do you like horses?')
+        for ent in doc2.ents:
+            print(ent.label_, ent.text)
 
 
 if __name__ == '__main__':

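The training sentences themselves are elided from this hunk. As a hypothetical illustration of the format the script's train_data list uses, each entry pairs a raw text with character-offset annotations for the new label, which train_ner() then turns into a GoldParse:

    # Hypothetical entry; the real script provides four such sentences.
    train_data = [
        (
            'Horses are too tall and they pretend to care about your feelings',
            [(0, 6, 'ANIMAL')],   # (start_char, end_char, label)
        ),
    ]
    # Inside train_ner(), each pair becomes something like:
    # doc = nlp.make_doc(raw_text)
    # gold = GoldParse(doc, entities=entity_offsets)
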
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0

|  | @ -1,39 +1,40 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from pathlib import Path | from . import util | ||||||
| 
 |  | ||||||
| from .util import set_lang_class, get_lang_class, parse_package_meta |  | ||||||
| from .deprecated import resolve_model_name | from .deprecated import resolve_model_name | ||||||
| from .cli import info | from .cli import info | ||||||
| 
 | 
 | ||||||
| from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he | from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| set_lang_class(en.English.lang, en.English) | _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, | ||||||
| set_lang_class(de.German.lang, de.German) |              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, | ||||||
| set_lang_class(es.Spanish.lang, es.Spanish) |              fi.Finnish, bn.Bengali, he.Hebrew) | ||||||
| set_lang_class(pt.Portuguese.lang, pt.Portuguese) | 
 | ||||||
| set_lang_class(fr.French.lang, fr.French) | 
 | ||||||
| set_lang_class(it.Italian.lang, it.Italian) | for _lang in _languages: | ||||||
| set_lang_class(hu.Hungarian.lang, hu.Hungarian) |     util.set_lang_class(_lang.lang, _lang) | ||||||
| set_lang_class(zh.Chinese.lang, zh.Chinese) |  | ||||||
| set_lang_class(nl.Dutch.lang, nl.Dutch) |  | ||||||
| set_lang_class(sv.Swedish.lang, sv.Swedish) |  | ||||||
| set_lang_class(fi.Finnish.lang, fi.Finnish) |  | ||||||
| set_lang_class(bn.Bengali.lang, bn.Bengali) |  | ||||||
| set_lang_class(he.Hebrew.lang, he.Hebrew) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load(name, **overrides): | def load(name, **overrides): | ||||||
|     data_path = overrides.get('path', util.get_data_path()) |     if overrides.get('path') in (None, False, True): | ||||||
|     model_name = resolve_model_name(name) |         data_path = util.get_data_path() | ||||||
|     meta = parse_package_meta(data_path, model_name, require=False) |         model_name = resolve_model_name(name) | ||||||
|  |         model_path = data_path / model_name | ||||||
|  |         if not model_path.exists(): | ||||||
|  |             lang_name = util.get_lang_class(name).lang | ||||||
|  |             model_path = None | ||||||
|  |             util.print_msg( | ||||||
|  |                 "Only loading the '{}' tokenizer.".format(lang_name), | ||||||
|  |                 title="Warning: no model found for '{}'".format(name)) | ||||||
|  |     else: | ||||||
|  |         model_path = util.ensure_path(overrides['path']) | ||||||
|  |         data_path = model_path.parent | ||||||
|  |         model_name = '' | ||||||
|  |     meta = util.parse_package_meta(data_path, model_name, require=False) | ||||||
|     lang = meta['lang'] if meta and 'lang' in meta else name |     lang = meta['lang'] if meta and 'lang' in meta else name | ||||||
|     cls = get_lang_class(lang) |     cls = util.get_lang_class(lang) | ||||||
|     overrides['meta'] = meta |     overrides['meta'] = meta | ||||||
|     model_path = Path(data_path / model_name) |     overrides['path'] = model_path | ||||||
|     if model_path.exists(): |  | ||||||
|         overrides['path'] = model_path |  | ||||||
| 
 |  | ||||||
|     return cls(**overrides) |     return cls(**overrides) | ||||||
|  |  | ||||||
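With this rewrite, spacy.load() resolves the model path up front and always passes it through the overrides. A usage sketch (the directory path is hypothetical):

    import spacy

    # By shortcut link or package name; falls back to the bare tokenizer
    # with a warning when no model data is installed.
    nlp = spacy.load('en')

    # From an explicit data directory; its meta.json, if present,
    # decides which Language class is constructed.
    nlp = spacy.load('en', path='/home/me/data/en_example_model')
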
@@ -63,15 +63,16 @@ class CLI(object):
     @plac.annotations(
         input_dir=("directory with model data", "positional", None, str),
         output_dir=("output parent directory", "positional", None, str),
+        meta=("path to meta.json", "option", "m", str),
         force=("force overwriting of existing folder in output directory", "flag", "f", bool)
     )
-    def package(self, input_dir, output_dir, force=False):
+    def package(self, input_dir, output_dir, meta=None, force=False):
         """
         Generate Python package for model data, including meta and required
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
-        cli_package(input_dir, output_dir, force)
+        cli_package(input_dir, output_dir, meta, force)
 
 
     @plac.annotations(

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy'
-__version__ = '1.7.5'
+__version__ = '1.8.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'

@@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False):
 
 def symlink(model_path, link_name, force):
     model_path = Path(model_path)
-    if not Path(model_path).exists():
+    if not model_path.exists():
         util.sys_exit(
             "The data should be located in {p}".format(p=model_path),
             title="Can't locate model data")
@@ -48,12 +48,16 @@ def symlink(model_path, link_name, force):
     except:
         # This is quite dirty, but just making sure other errors are caught so
         # users at least see a proper message.
-        util.sys_exit(
-            "Creating a symlink in spacy/data failed. You can still import "
-            "the model as a Python package and call its load() method, or "
-            "create the symlink manually:",
+        util.print_msg(
+            "Creating a symlink in spacy/data failed. Make sure you have the "
+            "required permissions and try re-running the command as admin, or "
+            "use a virtualenv to install spaCy in a user directory, instead of "
+            "doing a system installation.",
+            "You can still import the model as a Python package and call its "
+            "load() method, or create the symlink manually:",
             "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
             title="Error: Couldn't link model to '{l}'".format(l=link_name))
+        raise
 
     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),

@@ -9,16 +9,24 @@ from ..compat import unicode_, json_dumps
 from .. import util
 
 
-def package(input_dir, output_dir, force):
+def package(input_dir, output_dir, meta_path, force):
     input_path = Path(input_dir)
     output_path = Path(output_dir)
-    check_dirs(input_path, output_path)
+    meta_path = util.ensure_path(meta_path)
+    check_dirs(input_path, output_path, meta_path)
 
     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('en_model_name/__init__.py')
-    meta = generate_meta()
 
+    meta_path = meta_path or input_path / 'meta.json'
+    if meta_path.is_file():
+        util.print_msg(unicode_(meta_path), title="Reading meta.json from file")
+        meta = util.read_json(meta_path)
+    else:
+        meta = generate_meta()
+
+    validate_meta(meta, ['lang', 'name', 'version'])
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
     main_path = output_path / model_name_v
@@ -37,20 +45,23 @@ def package(input_dir, output_dir, force):
         title="Successfully created package {p}".format(p=model_name_v))
 
 
-def check_dirs(input_path, output_path):
+def check_dirs(input_path, output_path, meta_path):
     if not input_path.exists():
         util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found")
     if not output_path.exists():
         util.sys_exit(unicode_(output_path), title="Output directory not found")
+    if meta_path and not meta_path.exists():
+        util.sys_exit(unicode_(meta_path), title="meta.json not found")
 
 
 def create_dirs(package_path, force):
     if package_path.exists():
         if force:
-            shutil.rmtree(unicode_(package_path.as_posix))
+            shutil.rmtree(unicode_(package_path))
         else:
-            util.sys_exit(unicode_(package_path.as_posix),
-                "Please delete the directory and try again.",
+            util.sys_exit(unicode_(package_path),
+                "Please delete the directory and try again, or use the --force "
+                "flag to overwrite existing directories.",
                 title="Package directory already exists")
     Path.mkdir(package_path, parents=True)
 
@@ -80,6 +91,14 @@ def generate_meta():
     return meta
 
 
+def validate_meta(meta, keys):
+    for key in keys:
+        if key not in meta or meta[key] == '':
+            util.sys_exit(
+                "This setting is required to build your package.",
+                title='No "{k}" setting found in meta.json'.format(k=key))
+
+
 def get_template(filepath):
     url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
     r = requests.get(url + filepath)

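A sketch of calling the updated entry point directly, assuming the function lives at spacy.cli.package as the relative imports above suggest (all paths hypothetical):

    from spacy.cli.package import package

    # Reads meta.json from the given path instead of prompting for the
    # meta data interactively; the --force flag maps to force=True.
    package('/home/me/data/en_example_model', '/home/me/my_models',
            meta_path='/home/me/data/meta.json', force=True)
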
@@ -2,8 +2,8 @@
 from __future__ import unicode_literals, division, print_function
 
 import json
-from pathlib import Path
 
+from ..util import ensure_path
 from ..scorer import Scorer
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
@@ -12,9 +12,9 @@ from .. import util
 
 def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
           parser_L1):
-    output_path = Path(output_dir)
-    train_path = Path(train_data)
-    dev_path = Path(dev_data)
+    output_path = ensure_path(output_dir)
+    train_path = ensure_path(train_data)
+    dev_path = ensure_path(dev_data)
     check_dirs(output_path, train_path, dev_path)
 
     lang = util.get_lang_class(language)
@@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner
 
 
 def train_config(config):
-    config_path = Path(config)
+    config_path = ensure_path(config)
     if not config_path.is_file():
         util.sys_exit(config_path.as_posix(), title="Config file not found")
     config = json.load(config_path)
@@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_cfg,
                 entity_cfg, n_iter):
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
 
-    with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
+    with Language.train(output_path, train_data,
+                        pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer:
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)

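All three path arguments now go through the same small helper. A sketch of ensure_path()'s behaviour as used here, inferred from its pass-through branch shown later in the util.py hunk:

    from spacy.util import ensure_path

    output_path = ensure_path('/tmp/model-output')  # str becomes a pathlib.Path
    output_path = ensure_path(output_path)          # a Path passes through unchanged
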
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, user may have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.

@@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function
 import io
 import re
 import ujson
-from pathlib import Path
 
 from .syntax import nonproj
+from .util import ensure_path
 
 
 def tags_to_entities(tags):
@@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def read_json_file(loc, docs_filter=None):
-    loc = Path(loc)
+    loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename)
     else:
-        with io.open(loc, 'r', encoding='utf8') as file_:
+        with loc.open('r', encoding='utf8') as file_:
             docs = ujson.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):

@@ -204,15 +204,18 @@ class Language(object):
     @classmethod
     @contextmanager
     def train(cls, path, gold_tuples, **configs):
-        if parser_cfg['pseudoprojective']:
+        parser_cfg = configs.get('deps', {})
+        if parser_cfg.get('pseudoprojective'):
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
 
         for subdir in ('deps', 'ner', 'pos'):
             if subdir not in configs:
                 configs[subdir] = {}
-        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        if parser_cfg:
+            configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        if 'ner' in configs:
+            configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
 
         cls.setup_directory(path, **configs)
 
@@ -236,8 +239,7 @@ class Language(object):
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
         self.end_training()
-        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
-                               pos=self.tagger.cfg)
+        self.save_to_directory(path)
 
     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:

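Language.train() now reads its component configs from keyword arguments, matching the pos=/deps=/ner= call added to the train command above. A minimal sketch of driving the trainer, assuming output_path, gold_tuples and n_iter are already defined and using the English subclass:

    from spacy.en import English

    with English.train(output_path, gold_tuples,
                       pos={}, deps={'pseudoprojective': True}, ner={}) as trainer:
        for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
            for doc, gold in epoch:
                trainer.update(doc, gold)
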
@@ -40,7 +40,7 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse
 
 
-USE_FTRL = False
+USE_FTRL = True
 DEBUG = False
 def set_debug(val):
     global DEBUG

@@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
     assert lemmatizer.noun(text) == set(lemmas)
 
 
+@pytest.mark.xfail
 @pytest.mark.models
 def test_tagger_lemmatizer_base_forms(lemmatizer):
     if lemmatizer is None:

@@ -3,9 +3,8 @@ from __future__ import unicode_literals
 
 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util
 
-from os import path
 import pytest
 
 
@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 
 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100

@@ -192,6 +192,8 @@ cdef class Token:
     property lemma:
         def __get__(self):
             return self.c.lemma
+        def __set__(self, int lemma):
+            self.c.lemma = lemma
 
     property pos:
         def __get__(self):
@@ -570,6 +572,8 @@ cdef class Token:
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
+        def __set__(self, unicode lemma_):
+            self.c.lemma = self.vocab.strings[lemma_]
 
     property pos_:
         def __get__(self):

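The two setters make token lemmas writable from Python, with the string variant storing the hash via the vocab's string store. A quick sketch of what this enables (assumes an installed English model):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I saw geese near the lake')
    doc[2].lemma_ = u'goose'    # assign the string form
    assert doc[2].lemma == nlp.vocab.strings[u'goose']   # hash kept in sync
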
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import io
 import ujson
 import re
 from pathlib import Path
@@ -21,9 +20,11 @@ def set_lang_class(name, cls):
 
 
 def get_lang_class(name):
+    if name in LANGUAGES:
+        return LANGUAGES[name]
     lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
     if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % lang)
+        raise RuntimeError('Language not supported: %s' % name)
     return LANGUAGES[lang]
 
 
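The early exit lets exact language codes win before the fallback that splits a name on its first non-alphanumeric character, so full model names still resolve. A sketch of the three paths:

    from spacy.util import get_lang_class

    get_lang_class('en')               # exact hit in LANGUAGES
    get_lang_class('en_core_web_md')   # falls back to the 'en' prefix
    get_lang_class('qq')               # raises RuntimeError, naming the input
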
@@ -46,15 +47,6 @@ def ensure_path(path):
         return path
 
 
-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:
@@ -103,22 +95,28 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop
 
 
-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
 
 
+def read_json(location):
+    with location.open('r', encoding='utf8') as f:
+        return ujson.load(f)
+
+
 def parse_package_meta(package_path, package, require=True):
+    """
+    Check if a meta.json exists in a package and return its contents as a
+    dictionary. If require is set to True, raise an error if no meta.json found.
+    """
+    # TODO: Allow passing in full model path and only require one argument
+    # instead of path and package name. This lets us avoid passing in an awkward
+    # empty string in spacy.load() if user supplies full model path.
     location = package_path / package / 'meta.json'
     if location.is_file():
-        with location.open('r', encoding='utf8') as f:
-            meta = ujson.load(f)
-            return meta
+        return read_json(location)
     elif require:
         raise IOError("Could not read meta.json from %s" % location)
     else:
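A usage sketch of the helper pair above (the model name is hypothetical):

    from spacy import util

    # Returns the meta.json contents as a dict; with require=False a
    # falsy value comes back instead of an IOError when the file is
    # missing, which is how the new spacy.load() calls it.
    meta = util.parse_package_meta(util.get_data_path(), 'en_core_web_md',
                                   require=False)
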
@@ -126,10 +124,11 @@ def parse_package_meta(package_path, package, require=True):
 
 
 def get_raw_input(description, default=False):
-    """Get user input via raw_input / input and return input value. Takes a
+    """
+    Get user input via raw_input / input and return input value. Takes a
     description for the prompt, and an optional default value that's displayed
-    with the prompt."""
-
+    with the prompt.
+    """
     additional = ' (default: {d})'.format(d=default) if default else ''
     prompt = '    {d}{a}: '.format(d=description, a=additional)
     user_input = input_(prompt)
@@ -137,9 +136,10 @@ def get_raw_input(description, default=False):
 
 
 def print_table(data, **kwargs):
-    """Print data in table format. Can either take a list of tuples or a
-    dictionary, which will be converted to a list of tuples."""
-
+    """
+    Print data in table format. Can either take a list of tuples or a
+    dictionary, which will be converted to a list of tuples.
+    """
     if type(data) == dict:
         data = list(data.items())
 
@@ -155,10 +155,11 @@ def print_table(data, **kwargs):
 
 
 def print_markdown(data, **kwargs):
-    """Print listed data in GitHub-flavoured Markdown format so it can be
+    """
+    Print listed data in GitHub-flavoured Markdown format so it can be
     copy-pasted into issues. Can either take a list of tuples or a dictionary,
-    which will be converted to a list of tuples."""
-
+    which will be converted to a list of tuples.
+    """
     def excl_value(value):
         # don't print value if it contains absolute path of directory (i.e.
         # personal info). Other conditions can be included here if necessary.
@@ -175,16 +176,16 @@ def print_markdown(data, **kwargs):
 
     if 'title' in kwargs and kwargs['title']:
         print(tpl_title.format(msg=kwargs['title']))
-
     print(tpl_msg.format(msg=markdown))
 
 
 def print_msg(*text, **kwargs):
-    """Print formatted message. Each positional argument is rendered as newline-
+    """
+    Print formatted message. Each positional argument is rendered as newline-
     separated paragraph. If kwarg 'title' exists, title is printed above the text
     and highlighted (using ANSI escape sequences manually to avoid unnecessary
-    dependency)."""
-
+    dependency).
+    """
     message = '\n\n'.join([_wrap_text(t) for t in text])
     tpl_msg = '\n{msg}\n'
     tpl_title = '\n\033[93m{msg}\033[0m'
@@ -196,9 +197,10 @@ def print_msg(*text, **kwargs):
 
 
 def _wrap_text(text):
-    """Wrap text at given width using textwrap module. Indent should consist of
-    spaces. Its length is deducted from wrap width to ensure exact wrapping."""
-
+    """
+    Wrap text at given width using textwrap module. Indent should consist of
+    spaces. Its length is deducted from wrap width to ensure exact wrapping.
+    """
     wrap_max = 80
     indent = '    '
     wrap_width = wrap_max - len(indent)
@@ -208,10 +210,11 @@ def _wrap_text(text):
 
 
 def sys_exit(*messages, **kwargs):
-    """Performs SystemExit. For modules used from the command line, like
+    """
+    Performs SystemExit. For modules used from the command line, like
     download and link. To print message, use the same arguments as for
-    print_msg()."""
-
+    print_msg().
+    """
     if messages:
         print_msg(*messages, **kwargs)
     sys.exit(0)

@@ -12,7 +12,7 @@
         "COMPANY_URL": "https://explosion.ai",
         "DEMOS_URL": "https://demos.explosion.ai",
 
-        "SPACY_VERSION": "1.7",
+        "SPACY_VERSION": "1.8",
         "LATEST_NEWS": {
             "url": "https://survey.spacy.io/",
             "title": "Take the spaCy user survey and help us improve the library!"

@@ -20,8 +20,10 @@
             "Word vectors": "word-vectors-similarities",
             "Deep learning": "deep-learning",
             "Custom tokenization": "customizing-tokenizer",
+            "Adding languages": "adding-languages",
             "Training": "training",
-            "Adding languages": "adding-languages"
+            "Training NER": "training-ner",
+            "Saving & loading": "saving-loading"
         },
         "Examples": {
             "Tutorials": "tutorials",
@@ -101,11 +103,21 @@
 
     "customizing-tokenizer": {
         "title": "Customizing the tokenizer",
-        "next": "training"
+        "next": "adding-languages"
     },
 
     "training": {
-        "title": "Training the tagger, parser and entity recognizer"
+        "title": "Training spaCy's statistical models",
+        "next": "saving-loading"
+    },
+
+    "training-ner": {
+        "title": "Training the Named Entity Recognizer",
+        "next": "saving-loading"
+    },
+
+    "saving-loading": {
+        "title": "Saving and loading models"
     },
 
     "pos-tagging": {
@@ -356,6 +368,18 @@
         },
 
         "code": {
+            "Training a new entity type": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
+            "Training an NER system from scratch": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
             "Information extraction": {
                 "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
                 "author": "Matthew Honnibal",

@@ -63,14 +63,16 @@ p
             tag_map = TAG_MAP
             stop_words = STOP_WORDS
 
-p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the #[code set_lang_class()] function, so that you can use #[code spacy.load()].
+p
+    |  Additionally, the new #[code Language] class needs to be added to the
+    |  list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py].
+    |  The languages are then registered using the #[code set_lang_class()] function.
 
 +code("spacy/__init__.py").
     from . import en
     from . import xx
 
-    set_lang_class(en.English.lang, en.English)
-    set_lang_class(xx.Xxxxx.lang, xx.Xxxxx)
+    _languages = (en.English, ..., xx.Xxxxx)
 
 p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]:

@@ -248,15 +248,17 @@ p
     +tag experimental
 
 p
-    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
-    |  from an existing model data directory. All data files are copied over,
-    |  and the meta data can be entered directly from the command line. While
-    |  this feature is still experimental, the required file templates are
-    |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
-    |  This means you need to be connected to the internet to use this command.
+    |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
+    |  from an existing model data directory. All data files are copied over.
+    |  If the path to a meta.json is supplied, or a meta.json is found in the
+    |  input directory, this file is used. Otherwise, the data can be entered
+    |  directly from the command line. While this feature is still experimental,
+    |  the required file templates are downloaded from
+    |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
+    |  you need to be connected to the internet to use this command.
 
 +code(false, "bash").
-    python -m spacy package [input_dir] [output_dir] [--force]
+    python -m spacy package [input_dir] [output_dir] [--meta] [--force]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -269,6 +271,11 @@ p
         +cell positional
         +cell Directory to create package folder in.
 
+    +row
+        +cell #[code meta]
+        +cell option
+        +cell Path to meta.json file (optional).
+
     +row
         +cell #[code --force], #[code -f]
         +cell flag

@@ -137,7 +137,7 @@ p
         return word.ent_type != 0
 
     def count_parent_verb_by_person(docs):
-        counts = defaultdict(defaultdict(int))
+        counts = defaultdict(lambda: defaultdict(int))
         for doc in docs:
             for ent in doc.ents:
                 if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:

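The one-line fix matters because defaultdict needs a zero-argument callable as its factory, and defaultdict(int) is a dict instance, not a callable. A sketch of the difference:

    from collections import defaultdict

    # Broken: raises TypeError ('first argument must be callable'),
    # because the inner defaultdict(int) is evaluated immediately.
    # counts = defaultdict(defaultdict(int))

    # Fixed: the lambda is invoked to build a fresh inner counter
    # for every missing outer key.
    counts = defaultdict(lambda: defaultdict(int))
    counts['pretend']['they'] += 1
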
@@ -235,62 +235,13 @@ p
 
 p
     |  If you've trained your own model, for example for
-    |  #[+a("/docs/usage/adding-languages") additional languages], you can
-    |  create a shortcut link for it by pointing #[code spacy.link] to the
-    |  model's data directory. To allow your model to be downloaded and
-    |  installed via pip, you'll also need to generate a package for it. You can
-    |  do this manually, or via the new
-    |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    |  create all required files, and walk you through generating the meta data.
+    |  #[+a("/docs/usage/adding-languages") additional languages] or
+    |  #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    |  state using the #[code Language.save_to_directory()] method. To make the
+    |  model more convenient to deploy, we recommend wrapping it as a Python
+    |  package.
 
-
-+infobox("Important note")
-    |  The model packages are #[strong not suitable] for the public
-    |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    |  designed for binary data and files over 50 MB. However, if your company
-    |  is running an internal installation of pypi, publishing your models on
-    |  there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in                   # to include meta.json
-        ├── meta.json                     # model meta data
-        ├── setup.py                      # setup file for pip installation
-        └── en_core_web_md                # model directory
-            ├── __init__.py               # init for pip installation
-            └── en_core_web_md-1.2.0      # model data
-
-p
-    |  You can find templates for all files in our
-    |  #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
-    |  Unless you want to customise installation and loading, the only file
-    |  you'll need to modify is #[code meta.json], which includes the model's
-    |  meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    |  Keep in mind that the directories need to be named according to the
-    |  naming conventions. The #[code lang] setting is also used to create the
-    |  respective #[code Language] class in spaCy, which will later be returned
-    |  by the model's #[code load()] method.
-
-p
-    |  To generate the package, run the following command from within the
-    |  directory. This will create a #[code .tar.gz] archive in a directory
-    |  #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    |  For more information and a detailed guide on how to package your model,
+    |  see the documentation on
+    |  #[+a("/docs/usage/saving-loading") saving and loading models].

website/docs/usage/saving-loading.jade (new file, 108 lines)
@@ -0,0 +1,108 @@
|  | include ../../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you'll usually want to save its state, and load | ||||||
|  |     |  it back later. You can do this with the #[code Language.save_to_directory()] | ||||||
|  |     |  method: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.save_to_directory('/home/me/data/en_example_model') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The directory will be created if it doesn't exist, and the whole pipeline | ||||||
|  |     |  will be written out. To make the model more convenient to deploy, we | ||||||
|  |     |  recommend wrapping it as a Python package. | ||||||
|  | 
 | ||||||
|  | +h(2, "generating") Generating a model package | ||||||
|  | 
 | ||||||
|  | +infobox("Important note") | ||||||
|  |     |  The model packages are #[strong not suitable] for the public | ||||||
|  |     |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | ||||||
|  |     |  designed for binary data and files over 50 MB. However, if your company | ||||||
|  |     |  is running an internal installation of pypi, publishing your models on | ||||||
|  |     |  there can be a convenient solution to share them with your team. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy comes with a handy CLI command that will create all required | ||||||
|  |     |  files and walk you through generating the meta data. You can also create | ||||||
|  |     |  the #[code meta.json] manually and place it in the model data directory, | ||||||
|  |     |  or supply a path to it using the #[code --meta] flag. For more info on | ||||||
|  |     |  this, see the #[+a("/docs/usage/cli/#package") #[code package] command] documentation. | ||||||
|  | 
 | ||||||
|  | +aside-code("meta.json", "json"). | ||||||
|  |     { | ||||||
|  |         "name": "example_model", | ||||||
|  |         "lang": "en", | ||||||
|  |         "version": "1.0.0", | ||||||
|  |         "spacy_version": ">=1.7.0,<2.0.0", | ||||||
|  |         "description": "Example model for spaCy", | ||||||
|  |         "author": "You", | ||||||
|  |         "email": "you@example.com", | ||||||
|  |         "license": "CC BY-SA 3.0" | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_example_model /home/me/my_models | ||||||
|  | 
 | ||||||
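|  | p | ||||||
|  |     |  If you've already created a #[code meta.json], you can supply its path | ||||||
|  |     |  via the #[code --meta] flag instead. For example, a sketch with a | ||||||
|  |     |  hypothetical path: | ||||||
|  | 
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_example_model /home/me/my_models --meta /home/me/data/meta.json | ||||||
|  | 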
|  | p This command will create a model package directory that should look like this: | ||||||
|  | 
 | ||||||
|  | +code("Directory structure", "yaml"). | ||||||
|  |     └── / | ||||||
|  |         ├── MANIFEST.in                   # to include meta.json | ||||||
|  |         ├── meta.json                     # model meta data | ||||||
|  |         ├── setup.py                      # setup file for pip installation | ||||||
|  |         └── en_example_model              # model directory | ||||||
|  |             ├── __init__.py               # init for pip installation | ||||||
|  |             └── en_example_model-1.0.0    # model data | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You can also find templates for all files in our | ||||||
|  |     |  #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. | ||||||
|  |     |  If you're creating the package manually, keep in mind that the directories | ||||||
|  |     |  need to be named according to the naming conventions of | ||||||
|  |     |  #[code [language]_[type]] and #[code [language]_[type]-[version]]. The | ||||||
|  |     |  #[code lang] setting in the meta.json is also used to create the | ||||||
|  |     |  respective #[code Language] class in spaCy, which will later be returned | ||||||
|  |     |  by the model's #[code load()] method. | ||||||
|  | 
 | ||||||
|  | +h(2, "building") Building a model package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To build the package, run the following command from within the | ||||||
|  |     |  directory. This will create a #[code .tar.gz] archive in a directory | ||||||
|  |     |  #[code /dist]. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python setup.py sdist | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  For more information on building Python packages, see the | ||||||
|  |     |  #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(2, "loading") Loading a model package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Model packages can be installed by pointing pip to the model's | ||||||
|  |     |  #[code .tar.gz] archive: | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     pip install /path/to/en_example_model-1.0.0.tar.gz | ||||||
|  | 
 | ||||||
|  | p You'll then be able to load the model as follows: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import en_example_model | ||||||
|  |     nlp = en_example_model.load() | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To load the model via #[code spacy.load()], you can also | ||||||
|  |     |  create a #[+a("/docs/usage/models#usage") shortcut link] that maps the | ||||||
|  |     |  package name to a custom model name of your choice: | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy link en_example_model example | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import spacy | ||||||
|  |     nlp = spacy.load('example') | ||||||
website/docs/usage/training-ner.jade (new file, 174 lines)
|  | @ -0,0 +1,174 @@ | ||||||
|  | include ../../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All #[+a("/docs/usage/models") spaCy models] support online learning, so | ||||||
|  |     |  you can update a pre-trained model with new examples. You can even add | ||||||
|  |     |  new classes to an existing model, to recognise a new entity type, | ||||||
|  |     |  part-of-speech, or syntactic relation. Updating an existing model is | ||||||
|  |     |  particularly useful as a "quick and dirty solution", if you have only a | ||||||
|  |     |  few corrections or annotations. | ||||||
|  | 
 | ||||||
|  | +h(2, "improving-accuracy") Improving accuracy on existing entity types | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To update the model, you first need to create an instance of | ||||||
|  |     |  #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels | ||||||
|  |     |  you want to learn. You will then pass this instance to the | ||||||
|  |     |  #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]] | ||||||
|  |     |  method. For example: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import spacy | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  | 
|  |     nlp = spacy.load('en') | ||||||
|  |     doc = nlp.make_doc(u'Facebook released React in 2014') | ||||||
|  |     gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) | ||||||
|  |     nlp.tagger(doc)  # apply the tagger first, as in the full example below | ||||||
|  |     nlp.entity.update(doc, gold) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You'll usually need to provide many examples to meaningfully improve the | ||||||
|  |     |  system — a few hundred is a good start, although more is better. You | ||||||
|  |     |  should avoid iterating over the same few examples multiple times, or the | ||||||
|  |     |  model is likely to "forget" how to annotate other examples. If you | ||||||
|  |     |  iterate over the same few examples, you're effectively changing the loss | ||||||
|  |     |  function. The optimizer will find a way to minimize the loss on your | ||||||
|  |     |  examples, without regard for the consequences on the examples it's no | ||||||
|  |     |  longer paying attention to. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  One way to avoid this "catastrophic forgetting" problem is to "remind" | ||||||
|  |     |  the model of other examples by augmenting your annotations with sentences | ||||||
|  |     |  annotated with entities automatically recognised by the original model. | ||||||
|  |     |  Ultimately, this is an empirical process: you'll need to | ||||||
|  |     |  #[strong experiment on your own data] to find a solution that works best | ||||||
|  |     |  for you. | ||||||
|  | 
 | ||||||
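|  | p | ||||||
|  |     |  A minimal sketch of this idea, assuming spaCy v1.x and a hypothetical | ||||||
|  |     |  list of unlabelled #[code extra_texts]: | ||||||
|  | 
|  | +code. | ||||||
|  |     def make_revision_data(nlp, extra_texts): | ||||||
|  |         # hypothetical helper: annotate unlabelled text with the original | ||||||
|  |         # model's own predictions, to "remind" it of other entity types | ||||||
|  |         revision_data = [] | ||||||
|  |         for text in extra_texts: | ||||||
|  |             doc = nlp(text) | ||||||
|  |             entities = [(ent.start_char, ent.end_char, ent.label_) | ||||||
|  |                         for ent in doc.ents] | ||||||
|  |             revision_data.append((text, entities)) | ||||||
|  |         return revision_data | ||||||
|  | 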
|  | +h(2, "adding") Adding a new entity type | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You can add new entity types to an existing model. Let's say we want to | ||||||
|  |     |  recognise the category #[code TECHNOLOGY]. The new category will include | ||||||
|  |     |  programming languages, frameworks and platforms. First, we need to | ||||||
|  |     |  register the new entity type: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.entity.add_label('TECHNOLOGY') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Next, iterate over your examples, calling #[code entity.update()]. As | ||||||
|  |     |  above, we want to avoid iterating over only a small number of sentences. | ||||||
|  |     |  A useful compromise is to run the model over a number of plain-text | ||||||
|  |     |  sentences, and pass the entities to #[code GoldParse], as "true" | ||||||
|  |     |  annotations. This encourages the optimizer to find a solution that | ||||||
|  |     |  predicts the new category with minimal difference from the previous | ||||||
|  |     |  output. | ||||||
|  | 
 | ||||||
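|  | p | ||||||
|  |     |  A short sketch of this update loop, assuming #[code train_data] holds | ||||||
|  |     |  #[code (text, entity_offsets)] pairs for the new category and | ||||||
|  |     |  #[code revision_data] holds pairs predicted by the original model, as | ||||||
|  |     |  sketched above: | ||||||
|  | 
|  | +code. | ||||||
|  |     import random | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  | 
|  |     # mix the new annotations with the model's own predictions | ||||||
|  |     examples = train_data + revision_data | ||||||
|  |     for itn in range(20): | ||||||
|  |         random.shuffle(examples) | ||||||
|  |         for raw_text, entity_offsets in examples: | ||||||
|  |             doc = nlp.make_doc(raw_text) | ||||||
|  |             gold = GoldParse(doc, entities=entity_offsets) | ||||||
|  |             nlp.tagger(doc) | ||||||
|  |             nlp.entity.update(doc, gold) | ||||||
|  | 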
|  | +h(2, "saving-loading") Saving and loading | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you'll usually want to save its state, and load | ||||||
|  |     |  it back later. You can do this with the #[code Language.save_to_directory()] | ||||||
|  |     |  method: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.save_to_directory('/home/me/data/en_technology') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To make the model more convenient to deploy, we recommend wrapping it as | ||||||
|  |     |  a Python package, so that you can install it via pip and load it as a | ||||||
|  |     |  module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] | ||||||
|  |     |  to create all required files and directories. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_technology /home/me/my_models | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To build the package and create a #[code .tar.gz] archive, run | ||||||
|  |     |  #[code python setup.py sdist] from within its directory. | ||||||
|  | 
 | ||||||
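|  | p | ||||||
|  |     |  For example, assuming the package was written to the hypothetical | ||||||
|  |     |  directory below: | ||||||
|  | 
|  | +code(false, "bash"). | ||||||
|  |     cd /home/me/my_models/en_technology-1.0.0 | ||||||
|  |     python setup.py sdist | ||||||
|  | 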
|  | +infobox("Saving and loading models") | ||||||
|  |     |  For more information and a detailed guide on how to package your model, | ||||||
|  |     |  see the documentation on | ||||||
|  |     |  #[+a("/docs/usage/saving-loading") saving and loading models]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After you've generated and installed the package, you'll be able to | ||||||
|  |     |  load the model as follows: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import en_technology | ||||||
|  |     nlp = en_technology.load() | ||||||
|  | 
 | ||||||
|  | +h(2, "example") Example: Adding and training an #[code ANIMAL] entity | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This script shows how to add a new entity type to an existing pre-trained | ||||||
|  |     |  NER model. To keep the example short and simple, only four sentences are | ||||||
|  |     |  provided as examples. In practice, you'll need many more — | ||||||
|  |     |  #[strong a few hundred] would be a good start. You will also likely need | ||||||
|  |     |  to mix in #[strong examples of other entity types], which might be | ||||||
|  |     |  obtained by running the entity recognizer over unlabelled sentences, and | ||||||
|  |     |  adding their annotations to the training set. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  For the full, runnable script of this example, see | ||||||
|  |     |  #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py]. | ||||||
|  | 
 | ||||||
|  | +code("Training the entity recognizer"). | ||||||
|  |     import spacy | ||||||
|  |     from spacy.pipeline import EntityRecognizer | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  |     from spacy.tagger import Tagger | ||||||
|  |     import random | ||||||
|  | 
 | ||||||
|  |     model_name = 'en' | ||||||
|  |     entity_label = 'ANIMAL' | ||||||
|  |     output_directory = '/path/to/model' | ||||||
|  |     train_data = [ | ||||||
|  |         ("Horses are too tall and they pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("horses are too tall and they pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("horses pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("they pretend to care about your feelings, those horses", | ||||||
|  |         [(48, 54, 'ANIMAL')]) | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load(model_name) | ||||||
|  |     nlp.entity.add_label(entity_label) | ||||||
|  |     ner = train_ner(nlp, train_data, output_directory) | ||||||
|  | 
 | ||||||
|  |     def train_ner(nlp, train_data, output_dir): | ||||||
|  |         # Add new words to vocab | ||||||
|  |         for raw_text, _ in train_data: | ||||||
|  |             doc = nlp.make_doc(raw_text) | ||||||
|  |             for word in doc: | ||||||
|  |                 _ = nlp.vocab[word.orth] | ||||||
|  | 
 | ||||||
|  |         for itn in range(20): | ||||||
|  |             random.shuffle(train_data) | ||||||
|  |             for raw_text, entity_offsets in train_data: | ||||||
|  |                 gold = GoldParse(doc, entities=entity_offsets) | ||||||
|  |                 doc = nlp.make_doc(raw_text) | ||||||
|  |                 nlp.tagger(doc) | ||||||
|  |                 loss = nlp.entity.update(doc, gold) | ||||||
|  |         nlp.end_training() | ||||||
|  |         nlp.save_to_directory(output_dir) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The actual training is performed by looping over the examples, and | ||||||
|  |     |  calling #[code nlp.entity.update()]. The #[code update()] method steps | ||||||
|  |     |  through the words of the input. At each word, it makes a prediction. It | ||||||
|  |     |  then consults the annotations provided on the #[code GoldParse] instance, | ||||||
|  |     |  to see whether it was right. If it was wrong, it adjusts its weights so | ||||||
|  |     |  that the correct action will score higher next time. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you can | ||||||
|  |     |  #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping | ||||||
|  |     |  models as Python packages, for ease of deployment. | ||||||
|  | @ -1,13 +1,10 @@ | ||||||
| include ../../_includes/_mixins | include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  This tutorial describes how to train new statistical models for spaCy's |     |  This workflow describes how to train new statistical models for spaCy's | ||||||
|     |  part-of-speech tagger, named entity recognizer and dependency parser. |     |  part-of-speech tagger, named entity recognizer and dependency parser. | ||||||
| 
 |     |  Once the model is trained, you can then | ||||||
| p |     |  #[+a("/docs/usage/saving-loading") save and load] it. | ||||||
|     |  I'll start with some quick code examples, that describe how to train |  | ||||||
|     |  each model. I'll then provide a bit of background about the algorithms, |  | ||||||
|     |  and explain how the data and feature templates work. |  | ||||||
| 
 | 
 | ||||||
| +h(2, "train-pos-tagger") Training the part-of-speech tagger | +h(2, "train-pos-tagger") Training the part-of-speech tagger | ||||||
| 
 | 
 | ||||||
|  | @ -48,7 +45,21 @@ p | ||||||
| p | p | ||||||
|     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example |     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example | ||||||
| 
 | 
 | ||||||
| +h(2, "train-entity") Training the dependency parser | +h(2, "extend-entity") Extending the named entity recognizer | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All #[+a("/docs/usage/models") spaCy models] support online learning, so | ||||||
|  |     |  you can update a pre-trained model with new examples. You can even add | ||||||
|  |     |  new classes to an existing model, to recognise a new entity type, | ||||||
|  |     |  part-of-speech, or syntactic relation. Updating an existing model is | ||||||
|  |     |  particularly useful as a "quick and dirty solution", if you have only a | ||||||
|  |     |  few corrections or annotations. | ||||||
|  | 
 | ||||||
|  | p.o-inline-list | ||||||
|  |     +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example | ||||||
|  |     +button("/docs/usage/training-ner", false, "secondary") Usage Workflow | ||||||
|  | 
 | ||||||
|  | +h(2, "train-dependency") Training the dependency parser | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     from spacy.vocab import Vocab |     from spacy.vocab import Vocab | ||||||
|  | @ -67,7 +78,7 @@ p | ||||||
| p | p | ||||||
|     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example |     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example | ||||||
| 
 | 
 | ||||||
| +h(2, 'feature-templates') Customizing the feature extraction | +h(2, "feature-templates") Customizing the feature extraction | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy currently uses linear models for the tagger, parser and entity |     |  spaCy currently uses linear models for the tagger, parser and entity | ||||||
|  |  | ||||||