Commit 0605b95f2e: Merge branch 'master' of https://github.com/explosion/spaCy
README.rst
@@ -10,7 +10,7 @@ open-source software, released under the MIT license.

 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.

-💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

 .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
     :target: https://travis-ci.org/explosion/spaCy
@@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests:
 =========== ============== ===========
 Version     Date           Description
 =========== ============== ===========
+`v1.8.0`_   ``2017-04-16`` Better NER training, saving and loading
 `v1.7.5`_   ``2017-04-07`` Bug fixes and new CLI commands
 `v1.7.3`_   ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes
 `v1.7.2`_   ``2017-03-20`` Small fixes to beam parser and model linking
@@ -350,6 +351,7 @@ Version     Date           Description
 `v0.93`_    ``2015-09-22`` Bug fixes to word vectors
 =========== ============== ===========

+.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0
 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5
 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3
 .. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2
@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):


 def iter_texts_from_json_bz2(loc):
-    '''
+    """
     Iterator of unicode strings, one per document (here, a comment).

     Expects a path to a BZ2 file, which should be new-line delimited JSON. The
     document text should be in a string field titled 'body'.

     This is the data format of the Reddit comments corpus.
-    '''
+    """
    with bz2.BZ2File(loc) as file_:
         for i, line in enumerate(file_):
             yield ujson.loads(line)['body']
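For reference, each input line is a standalone JSON object; a made-up example of the expected shape (field values invented, only 'body' is read by this function):

    {"body": "This is a comment.", "author": "someone", "subreddit": "explainlikeimfive"}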
@@ -80,7 +81,7 @@ def is_sent_begin(word):

 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
     if not path.exists(out_dir):
         path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
examples/training/train_new_entity_type.py
@@ -1,3 +1,32 @@
+#!/usr/bin/env python
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
+# coding: utf8
 from __future__ import unicode_literals, print_function

 import random
requirements.txt
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0
spacy/__init__.py
@@ -1,39 +1,40 @@
 # coding: utf8
 from __future__ import unicode_literals

 from pathlib import Path

-from .util import set_lang_class, get_lang_class, parse_package_meta
+from . import util
 from .deprecated import resolve_model_name
 from .cli import info

 from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


-set_lang_class(en.English.lang, en.English)
-set_lang_class(de.German.lang, de.German)
-set_lang_class(es.Spanish.lang, es.Spanish)
-set_lang_class(pt.Portuguese.lang, pt.Portuguese)
-set_lang_class(fr.French.lang, fr.French)
-set_lang_class(it.Italian.lang, it.Italian)
-set_lang_class(hu.Hungarian.lang, hu.Hungarian)
-set_lang_class(zh.Chinese.lang, zh.Chinese)
-set_lang_class(nl.Dutch.lang, nl.Dutch)
-set_lang_class(sv.Swedish.lang, sv.Swedish)
-set_lang_class(fi.Finnish.lang, fi.Finnish)
-set_lang_class(bn.Bengali.lang, bn.Bengali)
-set_lang_class(he.Hebrew.lang, he.Hebrew)
+_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
+             it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
+             fi.Finnish, bn.Bengali, he.Hebrew)


+for _lang in _languages:
+    util.set_lang_class(_lang.lang, _lang)


 def load(name, **overrides):
-    data_path = overrides.get('path', util.get_data_path())
-    model_name = resolve_model_name(name)
-    meta = parse_package_meta(data_path, model_name, require=False)
+    if overrides.get('path') in (None, False, True):
+        data_path = util.get_data_path()
+        model_name = resolve_model_name(name)
+        model_path = data_path / model_name
+        if not model_path.exists():
+            lang_name = util.get_lang_class(name).lang
+            model_path = None
+            util.print_msg(
+                "Only loading the '{}' tokenizer.".format(lang_name),
+                title="Warning: no model found for '{}'".format(name))
+    else:
+        model_path = util.ensure_path(overrides['path'])
+        data_path = model_path.parent
+        model_name = ''
+    meta = util.parse_package_meta(data_path, model_name, require=False)
     lang = meta['lang'] if meta and 'lang' in meta else name
-    cls = get_lang_class(lang)
+    cls = util.get_lang_class(lang)
     overrides['meta'] = meta
-    model_path = Path(data_path / model_name)
-    if model_path.exists():
-        overrides['path'] = model_path
+    overrides['path'] = model_path

     return cls(**overrides)
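For context, a sketch (not part of the commit) of the two call styles the rewritten load() now distinguishes; the model name and directory below are hypothetical:

    import spacy

    # No usable 'path' override: resolve the shortcut name against spacy/data,
    # warning and falling back to the bare tokenizer if no model data is found.
    nlp = spacy.load('en')

    # Explicit 'path' override: the meta.json found in that directory decides
    # which Language class gets constructed.
    nlp = spacy.load('en', path='/home/me/data/en_example_model')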
spacy/about.py
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '1.7.5'
+__version__ = '1.8.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
spacy/cli/info.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import platform
 from pathlib import Path

+from ..compat import unicode_
 from .. import about
 from .. import util
@@ -13,10 +14,10 @@ def info(model=None, markdown=False):
         data = util.parse_package_meta(util.get_data_path(), model, require=True)
         model_path = Path(__file__).parent / util.get_data_path() / model
         if model_path.resolve() != model_path:
-            data['link'] = str(model_path)
-            data['source'] = str(model_path.resolve())
+            data['link'] = unicode_(model_path)
+            data['source'] = unicode_(model_path.resolve())
         else:
-            data['source'] = str(model_path)
+            data['source'] = unicode_(model_path)
         print_info(data, "model " + model, markdown)
     else:
         data = get_spacy_data()
@@ -34,7 +35,7 @@ def print_info(data, title, markdown):
 def get_spacy_data():
     return {
         'spaCy version': about.__version__,
-        'Location': str(Path(__file__).parent.parent),
+        'Location': unicode_(Path(__file__).parent.parent),
         'Platform': platform.platform(),
         'Python version': platform.python_version(),
         'Installed models': ', '.join(list_models())
spacy/cli/link.py
@@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False):

 def symlink(model_path, link_name, force):
     model_path = Path(model_path)
-    if not Path(model_path).exists():
+    if not model_path.exists():
         util.sys_exit(
             "The data should be located in {p}".format(p=model_path),
             title="Can't locate model data")
@@ -48,7 +48,7 @@ def symlink(model_path, link_name, force):
     except:
         # This is quite dirty, but just making sure other errors are caught so
         # users at least see a proper message.
-        util.sys_exit(
+        util.print_msg(
             "Creating a symlink in spacy/data failed. Make sure you have the "
             "required permissions and try re-running the command as admin, or "
             "use a virtualenv to install spaCy in a user directory, instead of "
@@ -57,6 +57,7 @@ def symlink(model_path, link_name, force):
             "load() method, or create the symlink manually:",
             "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
             title="Error: Couldn't link model to '{l}'".format(l=link_name))
+        raise

     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
spacy/cli/train.py
@@ -2,8 +2,8 @@
 from __future__ import unicode_literals, division, print_function

 import json
-from pathlib import Path

+from ..util import ensure_path
 from ..scorer import Scorer
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
@@ -12,9 +12,9 @@ from .. import util

 def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
           parser_L1):
-    output_path = Path(output_dir)
-    train_path = Path(train_data)
-    dev_path = Path(dev_data)
+    output_path = ensure_path(output_dir)
+    train_path = ensure_path(train_data)
+    dev_path = ensure_path(dev_data)
     check_dirs(output_path, train_path, dev_path)

     lang = util.get_lang_class(language)
@@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,


 def train_config(config):
-    config_path = Path(config)
+    config_path = ensure_path(config)
     if not config_path.is_file():
         util.sys_exit(config_path.as_posix(), title="Config file not found")
     config = json.load(config_path)
@@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
                 entity_cfg, n_iter):
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

-    with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
+    with Language.train(output_path, train_data,
+                        pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer:
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
spacy/deprecated.py
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, user may have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
spacy/gold.pyx
@@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function
 import io
 import re
 import ujson
-from pathlib import Path

 from .syntax import nonproj
+from .util import ensure_path


 def tags_to_entities(tags):
@@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words):


 def read_json_file(loc, docs_filter=None):
-    loc = Path(loc)
+    loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename)
     else:
-        with io.open(loc, 'r', encoding='utf8') as file_:
+        with loc.open('r', encoding='utf8') as file_:
             docs = ujson.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
spacy/language.py
@@ -2,7 +2,6 @@
 from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import shutil
-import ujson

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -15,7 +14,7 @@ from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
-from .compat import unicode_
+from .compat import json_dumps
 from .attrs import IS_STOP
 from . import attrs
 from . import orth
@@ -188,15 +187,16 @@ class Language(object):

     @classmethod
     def setup_directory(cls, path, **configs):
+        """
+        Initialise a model directory.
+        """
         for name, config in configs.items():
             directory = path / name
             if directory.exists():
                 shutil.rmtree(str(directory))
             directory.mkdir()
             with (directory / 'config.json').open('wb') as file_:
-                data = ujson.dumps(config, indent=2)
-                if isinstance(data, unicode_):
-                    data = data.encode('utf8')
+                data = json_dumps(config)
                 file_.write(data)
         if not (path / 'vocab').exists():
             (path / 'vocab').mkdir()
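The `json_dumps` helper replaces the inline encode dance deleted above; a minimal sketch of what it plausibly does, assuming it mirrors those three lines (the real helper lives in the compat module):

    import ujson

    def json_dumps(data):
        # Sketch only: serialize with ujson and hand back UTF-8 bytes, since
        # the config file above is opened in binary ('wb') mode.
        serialized = ujson.dumps(data, indent=2)
        if not isinstance(serialized, bytes):
            serialized = serialized.encode('utf8')
        return serialized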
@@ -204,14 +204,17 @@ class Language(object):
     @classmethod
     @contextmanager
     def train(cls, path, gold_tuples, **configs):
-        if parser_cfg['pseudoprojective']:
+        parser_cfg = configs.get('deps', {})
+        if parser_cfg.get('pseudoprojective'):
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

         for subdir in ('deps', 'ner', 'pos'):
             if subdir not in configs:
                 configs[subdir] = {}
-        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        if parser_cfg:
+            configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        if 'ner' in configs:
+            configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

         cls.setup_directory(path, **configs)
@@ -236,10 +239,18 @@ class Language(object):
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
         self.end_training()
-        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
-                               pos=self.tagger.cfg)
+        self.save_to_directory(path)

     def __init__(self, **overrides):
+        """
+        Create or load the pipeline.
+
+        Arguments:
+            **overrides: Keyword arguments indicating which defaults to override.
+
+        Returns:
+            Language: The newly constructed object.
+        """
         if 'data_dir' in overrides and 'path' not in overrides:
             raise ValueError("The argument 'data_dir' has been renamed to 'path'")
         path = util.ensure_path(overrides.get('path', True))
@@ -293,7 +304,7 @@ class Language(object):
         and can contain arbitrary whitespace.  Alignment into the original string
         is preserved.

-        Args:
+        Arguments:
             text (unicode): The text to be processed.

         Returns:
 | 
			
		|||
            yield doc
 | 
			
		||||
 | 
			
		||||
    def save_to_directory(self, path):
 | 
			
		||||
        """
 | 
			
		||||
        Save the Vocab, StringStore and pipeline to a directory.
 | 
			
		||||
 | 
			
		||||
        Arguments:
 | 
			
		||||
            path (string or pathlib path): Path to save the model.
 | 
			
		||||
        """
 | 
			
		||||
        configs = {
 | 
			
		||||
            'pos': self.tagger.cfg if self.tagger else {},
 | 
			
		||||
            'deps': self.parser.cfg if self.parser else {},
 | 
			
		||||
            'ner': self.entity.cfg if self.entity else {},
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        path = util.ensure_path(path)
 | 
			
		||||
        self.setup_directory(path, **configs)
 | 
			
		||||
 | 
			
		||||
        strings_loc = path / 'vocab' / 'strings.json'
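With the configs now collected internally, saving collapses to a single call, exactly as the usage docs added in this commit show (the path is hypothetical):

    nlp.save_to_directory('/home/me/data/en_example_model')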
spacy/tests/tagger/test_lemmatizer.py
@@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
     assert lemmatizer.noun(text) == set(lemmas)


+@pytest.mark.xfail
 @pytest.mark.models
 def test_tagger_lemmatizer_base_forms(lemmatizer):
     if lemmatizer is None:
spacy/tests/tokenizer/test_tokenizer.py
@@ -3,9 +3,8 @@

 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util

-from os import path
 import pytest
@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n

 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
spacy/util.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import io
 import ujson
 import re
 from pathlib import Path
@@ -21,9 +20,11 @@ def set_lang_class(name, cls):


 def get_lang_class(name):
+    if name in LANGUAGES:
+        return LANGUAGES[name]
     lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
     if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % lang)
+        raise RuntimeError('Language not supported: %s' % name)
     return LANGUAGES[lang]
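A sketch of the resulting lookup behaviour (the model names below are illustrative):

    get_lang_class('en')              # exact match: returns the registered English class
    get_lang_class('en_core_web_md')  # split on the first non-alphanumeric char: resolves to 'en'
    get_lang_class('xx_no_such')      # RuntimeError now reports the full name 'xx_no_such', not just 'xx'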
@@ -46,15 +47,6 @@ def ensure_path(path):
         return path


-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:
@@ -103,10 +95,6 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop


-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
@@ -119,6 +107,13 @@ def read_json(location):


 def parse_package_meta(package_path, package, require=True):
+    """
+    Check if a meta.json exists in a package and return its contents as a
+    dictionary. If require is set to True, raise an error if no meta.json found.
+    """
+    # TODO: Allow passing in full model path and only require one argument
+    # instead of path and package name. This lets us avoid passing in an awkward
+    # empty string in spacy.load() if user supplies full model path.
     location = package_path / package / 'meta.json'
     if location.is_file():
         return read_json(location)
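Per the docstring, a hypothetical call (the package name is invented):

    meta = parse_package_meta(get_data_path(), 'en_core_web_md', require=False)
    # contents of <data path>/en_core_web_md/meta.json as a dict if the file
    # exists; with require=False, a missing meta.json is not an error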
website/_harp.json
@@ -12,7 +12,7 @@
         "COMPANY_URL": "https://explosion.ai",
         "DEMOS_URL": "https://demos.explosion.ai",

-        "SPACY_VERSION": "1.7",
+        "SPACY_VERSION": "1.8",
         "LATEST_NEWS": {
             "url": "https://survey.spacy.io/",
             "title": "Take the spaCy user survey and help us improve the library!"
website/docs/api/language.jade
@@ -55,14 +55,14 @@ p Create or load the pipeline.

 +table(["Name", "Type", "Description"])
     +row
-        +cell #[code **kwrags]
+        +cell #[code **overrides]
         +cell -
         +cell Keyword arguments indicating which defaults to override.

     +footrow
         +cell return
-        +cell #[code Language]
+        +cell #[code self]
         +cell The newly constructed object.

 +h(2, "call") Language.__call__
     +tag method
@@ -136,3 +136,19 @@ p
         +cell yield
         +cell #[code Doc]
         +cell Containers for accessing the linguistic annotations.
+
++h(2, "save_to_directory") Language.save_to_directory
+    +tag method
+
+p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell string or pathlib path
+        +cell Path to save the model.
+
+    +footrow
+        +cell return
+        +cell #[code None]
+        +cell -
website/docs/usage/_data.json
@@ -20,8 +20,10 @@
             "Word vectors": "word-vectors-similarities",
             "Deep learning": "deep-learning",
             "Custom tokenization": "customizing-tokenizer",
-            "Training": "training",
-            "Adding languages": "adding-languages"
+            "Adding languages": "adding-languages",
+            "Training": "training",
+            "Training NER": "training-ner",
+            "Saving & loading": "saving-loading"
         },
         "Examples": {
             "Tutorials": "tutorials",
@@ -101,11 +103,21 @@
     "customizing-tokenizer": {
         "title": "Customizing the tokenizer",
-        "next": "training"
+        "next": "adding-languages"
     },

     "training": {
-        "title": "Training the tagger, parser and entity recognizer"
+        "title": "Training spaCy's statistical models",
+        "next": "saving-loading"
     },

+    "training-ner": {
+        "title": "Training the Named Entity Recognizer",
+        "next": "saving-loading"
+    },
+
+    "saving-loading": {
+        "title": "Saving and loading models"
+    },
+
     "pos-tagging": {
@@ -356,6 +368,18 @@
         },

         "code": {
+            "Training a new entity type": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
+            "Training an NER system from scratch": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
             "Information extraction": {
                 "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
                 "author": "Matthew Honnibal",
website/docs/usage/adding-languages.jade
@@ -63,14 +63,16 @@ p
             tag_map = TAG_MAP
             stop_words = STOP_WORDS

-p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the #[code set_lang_class()] function, so that you can use #[code spacy.load()].
+p
+    |  Additionally, the new #[code Language] class needs to be added to the
+    |  list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py].
+    |  The languages are then registered using the #[code set_lang_class()] function.

 +code("spacy/__init__.py").
     from . import en
     from . import xx

-    set_lang_class(en.English.lang, en.English)
-    set_lang_class(xx.Xxxxx.lang, xx.Xxxxx)
+    _languages = (en.English, ..., xx.Xxxxx)

 p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]:
website/docs/usage/cli.jade
@@ -248,7 +248,7 @@ p
     +tag experimental

 p
-    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
     |  from an existing model data directory. All data files are copied over.
     |  If the path to a meta.json is supplied, or a meta.json is found in the
     |  input directory, this file is used. Otherwise, the data can be entered
@@ -137,7 +137,7 @@ p
         return word.ent_type != 0

     def count_parent_verb_by_person(docs):
-        counts = defaultdict(defaultdict(int))
+        counts = defaultdict(lambda: defaultdict(int))
         for doc in docs:
             for ent in doc.ents:
                 if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
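The fix matters because defaultdict expects a zero-argument callable as its default factory; a standalone illustration:

    from collections import defaultdict

    # defaultdict(int) is an *instance*, not a callable, so passing it as a
    # default factory raises: TypeError: first argument must be callable or None
    # counts = defaultdict(defaultdict(int))

    # Wrapping it in a lambda defers construction until a key is missing:
    counts = defaultdict(lambda: defaultdict(int))
    counts['PERSON']['VBD'] += 1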
website/docs/usage/models.jade
@@ -235,62 +235,13 @@ p

 p
     |  If you've trained your own model, for example for
-    |  #[+a("/docs/usage/adding-languages") additional languages], you can
-    |  create a shortuct link for it by pointing #[code spacy.link] to the
-    |  model's data directory. To allow your model to be downloaded and
-    |  installed via pip, you'll also need to generate a package for it. You can
-    |  do this manually, or via the new
-    |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    |  create all required files, and walk you through generating the meta data.
+    |  #[+a("/docs/usage/adding-languages") additional languages] or
+    |  #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    |  state using the #[code Language.save_to_directory()] method. To make the
+    |  model more convenient to deploy, we recommend wrapping it as a Python
+    |  package.

-+infobox("Important note")
-    |  The model packages are #[strong not suitable] for the public
-    |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    |  designed for binary data and files over 50 MB. However, if your company
-    |  is running an internal installation of pypi, publishing your models on
-    |  there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in                   # to include meta.json
-        ├── meta.json                     # model meta data
-        ├── setup.py                      # setup file for pip installation
-        └── en_core_web_md                # model directory
-            ├── __init__.py               # init for pip installation
-            └── en_core_web_md-1.2.0      # model data
-
-p
-    |  You can find templates for all files in our
-    |  #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
-    |  Unless you want to customise installation and loading, the only file
-    |  you'll need to modify is #[code meta.json], which includes the model's
-    |  meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    |  Keep in mind that the directories need to be named according to the
-    |  naming conventions. The #[code lang] setting is also used to create the
-    |  respective #[code Language] class in spaCy, which will later be returned
-    |  by the model's #[code load()] method.
-
-p
-    |  To generate the package, run the following command from within the
-    |  directory. This will create a #[code .tar.gz] archive in a directory
-    |  #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    |  For more information and a detailed guide on how to package your model,
+    |  see the documentation on
+    |  #[+a("/docs/usage/saving-loading") saving and loading models].
website/docs/usage/saving-loading.jade (new file, 109 lines)
@@ -0,0 +1,109 @@
+include ../../_includes/_mixins
+
+p
+    |  After training your model, you'll usually want to save its state, and load
+    |  it back later. You can do this with the
+    |  #[+api("language#save_to_directory") #[code Language.save_to_directory()]]
+    |  method:
+
++code.
+    nlp.save_to_directory('/home/me/data/en_example_model')
+
+p
+    |  The directory will be created if it doesn't exist, and the whole pipeline
+    |  will be written out. To make the model more convenient to deploy, we
+    |  recommend wrapping it as a Python package.
+
++h(2, "generating") Generating a model package
+
++infobox("Important note")
+    |  The model packages are #[strong not suitable] for the public
+    |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
+    |  designed for binary data and files over 50 MB. However, if your company
+    |  is running an internal installation of pypi, publishing your models on
+    |  there can be a convenient solution to share them with your team.
+
+p
+    |  spaCy comes with a handy CLI command that will create all required files,
+    |  and walk you through generating the meta data. You can also create the
+    |  meta.json manually and place it in the model data directory, or supply a
+    |  path to it using the #[code --meta] flag. For more info on this, see the
+    |  #[+a("/docs/usage/cli/#package") #[code package] command] documentation.
+
++aside-code("meta.json", "json").
+    {
+        "name": "example_model",
+        "lang": "en",
+        "version": "1.0.0",
+        "spacy_version": ">=1.7.0,<2.0.0",
+        "description": "Example model for spaCy",
+        "author": "You",
+        "email": "you@example.com",
+        "license": "CC BY-SA 3.0"
+    }
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_example_model /home/me/my_models
+
+p This command will create a model package directory that should look like this:
+
++code("Directory structure", "yaml").
+    └── /
+        ├── MANIFEST.in                   # to include meta.json
+        ├── meta.json                     # model meta data
+        ├── setup.py                      # setup file for pip installation
+        └── en_example_model              # model directory
+            ├── __init__.py               # init for pip installation
+            └── en_example_model-1.0.0    # model data
+
+p
+    |  You can also find templates for all files in our
+    |  #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
+    |  If you're creating the package manually, keep in mind that the directories
+    |  need to be named according to the naming conventions of
+    |  #[code [language]_[type]] and #[code [language]_[type]-[version]]. The
+    |  #[code lang] setting in the meta.json is also used to create the
+    |  respective #[code Language] class in spaCy, which will later be returned
+    |  by the model's #[code load()] method.
+
++h(2, "building") Building a model package
+
+p
+    |  To build the package, run the following command from within the
+    |  directory. This will create a #[code .tar.gz] archive in a directory
+    |  #[code /dist].
+
++code(false, "bash").
+    python setup.py sdist
+
+p
+    |  For more information on building Python packages, see the
+    |  #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
+
+
++h(2, "loading") Loading a model package
+
+p
+    |  Model packages can be installed by pointing pip to the model's
+    |  #[code .tar.gz] archive:
+
++code(false, "bash").
+    pip install /path/to/en_example_model-1.0.0.tar.gz
+
+p You'll then be able to load the model as follows:
+
++code.
+    import en_example_model
+    nlp = en_example_model.load()
+
+p
+    |  To load the model via #[code spacy.load()], you can also
+    |  create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
+    |  package name to a custom model name of your choice:
+
++code(false, "bash").
+    python -m spacy link en_example_model example
+
++code.
+    import spacy
+    nlp = spacy.load('example')
website/docs/usage/training-ner.jade (new file, 174 lines)
@@ -0,0 +1,174 @@
+include ../../_includes/_mixins
+
+p
+    |  All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    |  you can update a pre-trained model with new examples. You can even add
+    |  new classes to an existing model, to recognise a new entity type,
+    |  part-of-speech, or syntactic relation. Updating an existing model is
+    |  particularly useful as a "quick and dirty solution", if you have only a
+    |  few corrections or annotations.
+
++h(2, "improving-accuracy") Improving accuracy on existing entity types
+
+p
+    |  To update the model, you first need to create an instance of
+    |  #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
+    |  you want to learn. You will then pass this instance to the
+    |  #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
+    |  method. For example:
+
++code.
+    import spacy
+    from spacy.gold import GoldParse
+
+    nlp = spacy.load('en')
+    doc = nlp.make_doc(u'Facebook released React in 2014')
+    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
+    nlp.entity.update(doc, gold)
+
+p
+    |  You'll usually need to provide many examples to meaningfully improve the
+    |  system — a few hundred is a good start, although more is better. You
+    |  should avoid iterating over the same few examples multiple times, or the
+    |  model is likely to "forget" how to annotate other examples. If you
+    |  iterate over the same few examples, you're effectively changing the loss
+    |  function. The optimizer will find a way to minimize the loss on your
+    |  examples, without regard for the consequences on the examples it's no
+    |  longer paying attention to.
+
+p
+    |  One way to avoid this "catastrophic forgetting" problem is to "remind"
+    |  the model of other examples by augmenting your annotations with sentences
+    |  annotated with entities automatically recognised by the original model.
+    |  Ultimately, this is an empirical process: you'll need to
+    |  #[strong experiment on your own data] to find a solution that works best
+    |  for you.
+
++h(2, "adding") Adding a new entity type
+
+p
+    |  You can add new entity types to an existing model. Let's say we want to
+    |  recognise the category #[code TECHNOLOGY]. The new category will include
+    |  programming languages, frameworks and platforms. First, we need to
+    |  register the new entity type:
+
++code.
+    nlp.entity.add_label('TECHNOLOGY')
+
+p
+    |  Next, iterate over your examples, calling #[code entity.update()]. As
+    |  above, we want to avoid iterating over only a small number of sentences.
+    |  A useful compromise is to run the model over a number of plain-text
+    |  sentences, and pass the entities to #[code GoldParse], as "true"
+    |  annotations. This encourages the optimizer to find a solution that
+    |  predicts the new category with minimal difference from the previous
+    |  output.
+
++h(2, "saving-loading") Saving and loading
+
+p
+    |  After training your model, you'll usually want to save its state, and load
+    |  it back later. You can do this with the #[code Language.save_to_directory()]
+    |  method:
+
++code.
+    nlp.save_to_directory('/home/me/data/en_technology')
+
+p
+    |  To make the model more convenient to deploy, we recommend wrapping it as
+    |  a Python package, so that you can install it via pip and load it as a
+    |  module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
+    |  to create all required files and directories.
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_technology /home/me/my_models
+
+p
+    |  To build the package and create a #[code .tar.gz] archive, run
+    |  #[code python setup.py sdist] from within its directory.
+
++infobox("Saving and loading models")
+    |  For more information and a detailed guide on how to package your model,
+    |  see the documentation on
+    |  #[+a("/docs/usage/saving-loading") saving and loading models].
+
+p
+    |  After you've generated and installed the package, you'll be able to
+    |  load the model as follows:
+
++code.
+    import en_technology
+    nlp = en_technology.load()
+
++h(2, "example") Example: Adding and training an #[code ANIMAL] entity
+
+p
+    |  This script shows how to add a new entity type to an existing pre-trained
+    |  NER model. To keep the example short and simple, only four sentences are
+    |  provided as examples. In practice, you'll need many more —
+    |  #[strong a few hundred] would be a good start. You will also likely need
+    |  to mix in #[strong examples of other entity types], which might be
+    |  obtained by running the entity recognizer over unlabelled sentences, and
+    |  adding their annotations to the training set.
+
+p
+    |  For the full, runnable script of this example, see
+    |  #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].
+
++code("Training the entity recognizer").
+    import spacy
+    from spacy.pipeline import EntityRecognizer
+    from spacy.gold import GoldParse
+    from spacy.tagger import Tagger
+    import random
+
+    model_name = 'en'
+    entity_label = 'ANIMAL'
+    output_directory = '/path/to/model'
+    train_data = [
+        ("Horses are too tall and they pretend to care about your feelings",
+        [(0, 6, 'ANIMAL')]),
+        ("horses are too tall and they pretend to care about your feelings",
+        [(0, 6, 'ANIMAL')]),
+        ("horses pretend to care about your feelings",
+        [(0, 6, 'ANIMAL')]),
+        ("they pretend to care about your feelings, those horses",
+        [(48, 54, 'ANIMAL')])
+    ]
+
+    nlp = spacy.load(model_name)
+    nlp.entity.add_label(entity_label)
+    ner = train_ner(nlp, train_data, output_directory)
+
+    def train_ner(nlp, train_data, output_dir):
+        # Add new words to vocab
+        for raw_text, _ in train_data:
+            doc = nlp.make_doc(raw_text)
+            for word in doc:
+                _ = nlp.vocab[word.orth]
+
+        for itn in range(20):
+            random.shuffle(train_data)
+            for raw_text, entity_offsets in train_data:
+                doc = nlp.make_doc(raw_text)
+                gold = GoldParse(doc, entities=entity_offsets)
+                nlp.tagger(doc)
+                loss = nlp.entity.update(doc, gold)
+        nlp.end_training()
+        nlp.save_to_directory(output_dir)
+
+p
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example
+
+p
+    |  The actual training is performed by looping over the examples, and
+    |  calling #[code nlp.entity.update()]. The #[code update()] method steps
+    |  through the words of the input. At each word, it makes a prediction. It
+    |  then consults the annotations provided on the #[code GoldParse] instance,
+    |  to see whether it was right. If it was wrong, it adjusts its weights so
+    |  that the correct action will score higher next time.
+
+p
+    |  After training your model, you can
+    |  #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
+    |  models as Python packages, for ease of deployment.
website/docs/usage/training.jade
@@ -1,13 +1,10 @@
 include ../../_includes/_mixins

 p
-    |  This tutorial describes how to train new statistical models for spaCy's
+    |  This workflow describes how to train new statistical models for spaCy's
     |  part-of-speech tagger, named entity recognizer and dependency parser.
-
-p
-    |  I'll start with some quick code examples, that describe how to train
-    |  each model. I'll then provide a bit of background about the algorithms,
-    |  and explain how the data and feature templates work.
+    |  Once the model is trained, you can then
+    |  #[+a("/docs/usage/saving-loading") save and load] it.

 +h(2, "train-pos-tagger") Training the part-of-speech tagger
@@ -48,7 +45,21 @@ p
 p
     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example

-+h(2, "train-entity") Training the dependency parser
++h(2, "extend-entity") Extending the named entity recognizer
+
+p
+    |  All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    |  you can update a pre-trained model with new examples. You can even add
+    |  new classes to an existing model, to recognise a new entity type,
+    |  part-of-speech, or syntactic relation. Updating an existing model is
+    |  particularly useful as a "quick and dirty solution", if you have only a
+    |  few corrections or annotations.
+
+p.o-inline-list
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
+    +button("/docs/usage/training-ner", false, "secondary") Usage Workflow
+
++h(2, "train-dependency") Training the dependency parser

 +code.
     from spacy.vocab import Vocab
						 | 
				
			
			@ -67,7 +78,7 @@ p
 | 
			
		|||
p
 | 
			
		||||
    +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
 | 
			
		||||
 | 
			
		||||
+h(2, 'feature-templates') Customizing the feature extraction
 | 
			
		||||
+h(2, "feature-templates") Customizing the feature extraction
 | 
			
		||||
 | 
			
		||||
p
 | 
			
		||||
    |  spaCy currently uses linear models for the tagger, parser and entity
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||