diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index cbe2963d3..ef4070153 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -1,3 +1,32 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/training-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
 from __future__ import unicode_literals, print_function

 import random
diff --git a/requirements.txt b/requirements.txt
index f1f26171b..6212ab3cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0
diff --git a/spacy/__init__.py b/spacy/__init__.py
index bc668121f..06e9374ea 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,39 +1,39 @@
 # coding: utf8
 from __future__ import unicode_literals

-from pathlib import Path
-
-from .util import set_lang_class, get_lang_class, parse_package_meta
+from . import util
 from .deprecated import resolve_model_name
 from .cli import info

 from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he

-set_lang_class(en.English.lang, en.English)
-set_lang_class(de.German.lang, de.German)
-set_lang_class(es.Spanish.lang, es.Spanish)
-set_lang_class(pt.Portuguese.lang, pt.Portuguese)
-set_lang_class(fr.French.lang, fr.French)
-set_lang_class(it.Italian.lang, it.Italian)
-set_lang_class(hu.Hungarian.lang, hu.Hungarian)
-set_lang_class(zh.Chinese.lang, zh.Chinese)
-set_lang_class(nl.Dutch.lang, nl.Dutch)
-set_lang_class(sv.Swedish.lang, sv.Swedish)
-set_lang_class(fi.Finnish.lang, fi.Finnish)
-set_lang_class(bn.Bengali.lang, bn.Bengali)
-set_lang_class(he.Hebrew.lang, he.Hebrew)
+_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
+              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
+              fi.Finnish, bn.Bengali, he.Hebrew)
+
+
+for _lang in _languages:
+    util.set_lang_class(_lang.lang, _lang)


 def load(name, **overrides):
-    data_path = overrides.get('path', util.get_data_path())
-    model_name = resolve_model_name(name)
-    meta = parse_package_meta(data_path, model_name, require=False)
+    if overrides.get('path') in (None, False, True):
+        data_path = util.get_data_path()
+        model_name = resolve_model_name(name)
+        model_path = data_path / model_name
+        if not model_path.exists():
+            model_path = None
+            util.print_msg(
+                "Only loading the '{}' tokenizer.".format(name),
+                title="Warning: no model found for '{}'".format(name))
+    else:
+        model_path = util.ensure_path(overrides['path'])
+        data_path = model_path.parent
+        model_name = ''  # no package name to resolve when an explicit path is given
+    meta = util.parse_package_meta(data_path, model_name, require=False)
     lang = meta['lang'] if meta and 'lang' in meta else name
-    cls = get_lang_class(lang)
+    cls = util.get_lang_class(lang)
     overrides['meta'] = meta
-    model_path = Path(data_path / model_name)
-    if model_path.exists():
-        overrides['path'] = model_path
-
+    overrides['path'] = model_path
     return cls(**overrides)
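A quick sketch of how the revised `load()` behaves from the caller's side (paths are hypothetical; assumes a spaCy 1.7.x install with the `en` model):

    import spacy

    # Loading by name resolves a shortcut link in spaCy's data directory; if no
    # model data is found, only the tokenizer is loaded and a warning is printed.
    nlp = spacy.load('en')

    # Passing an explicit path skips name resolution entirely; the directory is
    # used as the model's data path (hypothetical location).
    nlp = spacy.load('en', path='/home/me/data/en_example_model')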
diff --git a/spacy/deprecated.py b/spacy/deprecated.py
index f481a2502..65053089a 100644
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, the user may have upgraded from an older version and still have old models installed.
     Check if old model directory exists and if so, return that instead and
     create shortcut link. If English model is found and no shortcut exists,
     raise error and tell user to install new model.
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 22afa1f43..da79b43a8 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -3,9 +3,8 @@ from __future__ import unicode_literals

 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util

-from os import path
 import pytest
@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n

 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
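The same pathlib pattern works anywhere `utf8open()` was used before; a minimal sketch (file name hypothetical):

    from pathlib import Path

    # Path.open() takes an explicit encoding argument, which is what makes the
    # removed utf8open() helper redundant.
    loc = Path(__file__).parent / 'sun.txt'
    with loc.open('r', encoding='utf8') as file_:
        text = file_.read()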
diff --git a/spacy/util.py b/spacy/util.py
index f8af8baa3..573489682 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import io
 import ujson
 import re
 from pathlib import Path
@@ -46,15 +45,6 @@ def ensure_path(path):
     return path


-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:
@@ -103,10 +93,6 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop


-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json
index c8c85af1d..2ffbf9d68 100644
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@@ -20,8 +20,10 @@
         "Word vectors": "word-vectors-similarities",
         "Deep learning": "deep-learning",
         "Custom tokenization": "customizing-tokenizer",
+        "Adding languages": "adding-languages",
         "Training": "training",
-        "Adding languages": "adding-languages"
+        "Training NER": "training-ner",
+        "Saving & loading": "saving-loading"
     },
     "Examples": {
         "Tutorials": "tutorials",
@@ -101,11 +103,21 @@

     "customizing-tokenizer": {
         "title": "Customizing the tokenizer",
-        "next": "training"
+        "next": "adding-languages"
     },

     "training": {
-        "title": "Training the tagger, parser and entity recognizer"
+        "title": "Training spaCy's statistical models",
+        "next": "saving-loading"
+    },
+
+    "training-ner": {
+        "title": "Training the Named Entity Recognizer",
+        "next": "saving-loading"
+    },
+
+    "saving-loading": {
+        "title": "Saving and loading models"
     },

     "pos-tagging": {
@@ -356,6 +368,18 @@
     },

     "code": {
+        "Training a new entity type": {
+            "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+            "author": "Matthew Honnibal",
+            "tags": ["ner", "training"]
+        },
+
+        "Training an NER system from scratch": {
+            "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+            "author": "Matthew Honnibal",
+            "tags": ["ner", "training"]
+        },
+
         "Information extraction": {
             "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
             "author": "Matthew Honnibal",
diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade
index 5ad8a214d..e4d762615 100644
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@@ -248,7 +248,7 @@ p
 +tag experimental

 p
-    | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
     | from an existing model data directory. All data files are copied over.
     | If the path to a meta.json is supplied, or a meta.json is found in the
     | input directory, this file is used. Otherwise, the data can be entered
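Since `util.ensure_path()` now backs both `load()` and the tests, a sketch of its expected contract (a behavioural check, not part of the patch):

    from pathlib import Path
    from spacy import util

    # ensure_path() wraps plain strings in a pathlib.Path and passes through
    # values that are already Path objects.
    assert isinstance(util.ensure_path('/tmp/models'), Path)
    assert util.ensure_path(Path('/tmp/models')) == Path('/tmp/models')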
diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade
index 39c271df4..9d50dcbc0 100644
--- a/website/docs/usage/models.jade
+++ b/website/docs/usage/models.jade
@@ -235,62 +235,13 @@ p

 p
     | If you've trained your own model, for example for
-    | #[+a("/docs/usage/adding-languages") additional languages], you can
-    | create a shortuct link for it by pointing #[code spacy.link] to the
-    | model's data directory. To allow your model to be downloaded and
-    | installed via pip, you'll also need to generate a package for it. You can
-    | do this manually, or via the new
-    | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    | create all required files, and walk you through generating the meta data.
+    | #[+a("/docs/usage/adding-languages") additional languages] or
+    | #[+a("/docs/usage/training-ner") custom named entities], you can save its
+    | state using the #[code Language.save_to_directory()] method. To make the
+    | model more convenient to deploy, we recommend wrapping it as a Python
+    | package.

-
-+infobox("Important note")
-    | The model packages are #[strong not suitable] for the public
-    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    | designed for binary data and files over 50 MB. However, if your company
-    | is running an internal installation of pypi, publishing your models on
-    | there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in                   # to include meta.json
-        ├── meta.json                     # model meta data
-        ├── setup.py                      # setup file for pip installation
-        └── en_core_web_md                # model directory
-            ├── __init__.py               # init for pip installation
-            └── en_core_web_md-1.2.0      # model data
-
-p
-    | You can find templates for all files in our
-    | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
-    | Unless you want to customise installation and loading, the only file
-    | you'll need to modify is #[code meta.json], which includes the model's
-    | meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    | Keep in mind that the directories need to be named according to the
-    | naming conventions. The #[code lang] setting is also used to create the
-    | respective #[code Language] class in spaCy, which will later be returned
-    | by the model's #[code load()] method.
-
-p
-    | To generate the package, run the following command from within the
-    | directory. This will create a #[code .tar.gz] archive in a directory
-    | #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading") saving and loading models].
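The save-and-reload round trip the new docs describe, as a minimal sketch (spaCy 1.7.x; the output path is hypothetical):

    import spacy

    nlp = spacy.load('en')
    # ... update the pipeline here, e.g. by training the entity recognizer ...
    nlp.save_to_directory('/home/me/data/en_example_model')

    # The saved state can be loaded back by passing the directory as a path override.
    nlp = spacy.load('en', path='/home/me/data/en_example_model')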
diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade
new file mode 100644
index 000000000..063c5dc50
--- /dev/null
+++ b/website/docs/usage/saving-loading.jade
@@ -0,0 +1,108 @@
+include ../../_includes/_mixins
+
+p
+    | After training your model, you'll usually want to save its state, and load
+    | it back later. You can do this with the #[code Language.save_to_directory()]
+    | method:
+
++code.
+    nlp.save_to_directory('/home/me/data/en_example_model')
+
+p
+    | The directory will be created if it doesn't exist, and the whole pipeline
+    | will be written out. To make the model more convenient to deploy, we
+    | recommend wrapping it as a Python package.
+
++h(2, "generating") Generating a model package
+
++infobox("Important note")
+    | The model packages are #[strong not suitable] for the public
+    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
+    | designed for binary data and files over 50 MB. However, if your company
+    | is running an internal installation of pypi, publishing your models on
+    | there can be a convenient solution to share them with your team.
+
+p
+    | spaCy comes with a handy CLI command that will create all required files,
+    | and walk you through generating the meta data. You can also create the
+    | meta.json manually and place it in the model data directory, or supply a
+    | path to it using the #[code --meta] flag. For more info on this, see the
+    | #[+a("/docs/usage/cli#package") #[code package] command] documentation.
+
++aside-code("meta.json", "json").
+    {
+        "name": "example_model",
+        "lang": "en",
+        "version": "1.0.0",
+        "spacy_version": ">=1.7.0,<2.0.0",
+        "description": "Example model for spaCy",
+        "author": "You",
+        "email": "you@example.com",
+        "license": "CC BY-SA 3.0"
+    }
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_example_model /home/me/my_models
+
+p This command will create a model package directory that should look like this:
+
++code("Directory structure", "yaml").
+    └── /
+        ├── MANIFEST.in                   # to include meta.json
+        ├── meta.json                     # model meta data
+        ├── setup.py                      # setup file for pip installation
+        └── en_example_model              # model directory
+            ├── __init__.py               # init for pip installation
+            └── en_example_model-1.0.0    # model data
+
+p
+    | You can also find templates for all files in our
+    | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
+    | If you're creating the package manually, keep in mind that the directories
+    | need to be named according to the naming conventions of
+    | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The
+    | #[code lang] setting in the meta.json is also used to create the
+    | respective #[code Language] class in spaCy, which will later be returned
+    | by the model's #[code load()] method.
+
++h(2, "building") Building a model package
+
+p
+    | To build the package, run the following command from within the
+    | directory. This will create a #[code .tar.gz] archive in a directory
+    | #[code /dist].
+
++code(false, "bash").
+    python setup.py sdist
+
+p
+    | For more information on building Python packages, see the
+    | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
+
+
++h(2, "loading") Loading a model package
+
+p
+    | Model packages can be installed by pointing pip to the model's
+    | #[code .tar.gz] archive:
+
++code(false, "bash").
+    pip install /path/to/en_example_model-1.0.0.tar.gz
+
+p You'll then be able to load the model as follows:
+
++code.
+    import en_example_model
+    nlp = en_example_model.load()
+
+p
+    | To load the model via #[code spacy.load()], you can also
+    | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
+    | package name to a custom model name of your choice:
+
++code(false, "bash").
+    python -m spacy link en_example_model example
+
++code.
+    import spacy
+    nlp = spacy.load('example')
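If you'd rather generate the meta.json programmatically than through the interactive prompt, a sketch (all values are placeholders; the model directory is hypothetical):

    import json
    from pathlib import Path

    # The same fields the interactive prompt asks for.
    meta = {
        "name": "example_model",
        "lang": "en",
        "version": "1.0.0",
        "spacy_version": ">=1.7.0,<2.0.0",
        "description": "Example model for spaCy",
        "author": "You",
        "email": "you@example.com",
        "license": "CC BY-SA 3.0"
    }

    # Place the file in the model data directory, where the package command looks for it.
    model_dir = Path('/home/me/data/en_example_model')
    with (model_dir / 'meta.json').open('w', encoding='utf8') as file_:
        file_.write(json.dumps(meta, indent=4))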
diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade
new file mode 100644
index 000000000..78eb4905e
--- /dev/null
+++ b/website/docs/usage/training-ner.jade
@@ -0,0 +1,174 @@
+include ../../_includes/_mixins
+
+p
+    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    | you can update a pre-trained model with new examples. You can even add
+    | new classes to an existing model, to recognise a new entity type,
+    | part-of-speech, or syntactic relation. Updating an existing model is
+    | particularly useful as a "quick and dirty solution" if you have only a
+    | few corrections or annotations.
+
++h(2, "improving-accuracy") Improving accuracy on existing entity types
+
+p
+    | To update the model, you first need to create an instance of
+    | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
+    | you want to learn. You will then pass this instance to the
+    | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
+    | method. For example:
+
++code.
+    import spacy
+    from spacy.gold import GoldParse
+
+    nlp = spacy.load('en')
+    doc = nlp.make_doc(u'Facebook released React in 2014')
+    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
+    nlp.entity.update(doc, gold)
+
+p
+    | You'll usually need to provide many examples to meaningfully improve the
+    | system — a few hundred is a good start, although more is better. You
+    | should avoid iterating over the same few examples multiple times, or the
+    | model is likely to "forget" how to annotate other examples. If you
+    | iterate over the same few examples, you're effectively changing the loss
+    | function. The optimizer will find a way to minimize the loss on your
+    | examples, without regard for the consequences on the examples it's no
+    | longer paying attention to.
+
+p
+    | One way to avoid this "catastrophic forgetting" problem is to "remind"
+    | the model of other examples by augmenting your annotations with sentences
+    | annotated with entities automatically recognised by the original model.
+    | Ultimately, this is an empirical process: you'll need to
+    | #[strong experiment on your own data] to find a solution that works best
+    | for you.
+
++h(2, "adding") Adding a new entity type
+
+p
+    | You can add new entity types to an existing model. Let's say we want to
+    | recognise the category #[code TECHNOLOGY]. The new category will include
+    | programming languages, frameworks and platforms. First, we need to
+    | register the new entity type:
+
++code.
+    nlp.entity.add_label('TECHNOLOGY')
+
+p
+    | Next, iterate over your examples, calling #[code entity.update()]. As
+    | above, we want to avoid iterating over only a small number of sentences.
+    | A useful compromise is to run the model over a number of plain-text
+    | sentences, and pass the entities to #[code GoldParse], as "true"
+    | annotations. This encourages the optimizer to find a solution that
+    | predicts the new category with minimal difference from the previous
+    | output. A sketch of such a loop is shown below.
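+p
+    | For illustration, here's a minimal sketch of such an update loop, mixing
+    | hand-labelled #[code TECHNOLOGY] examples with "reminder" annotations
+    | produced by the original model. The sentences and labels below are
+    | hypothetical, and #[code nlp] is the pipeline loaded above:
+
++code.
+    from spacy.gold import GoldParse
+
+    new_examples = [(u'Firefox is written in Rust',
+                     [(0, 7, 'TECHNOLOGY'), (22, 26, 'TECHNOLOGY')])]
+    revision_texts = [u'Facebook was founded in 2004']
+
+    # Let the original model annotate the revision texts, so the optimizer
+    # is reminded of the entity types it already knows.
+    revisions = []
+    for text in revision_texts:
+        doc = nlp(text)
+        offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
+        revisions.append((text, offsets))
+
+    for text, entity_offsets in revisions + new_examples:
+        doc = nlp.make_doc(text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        nlp.tagger(doc)
+        nlp.entity.update(doc, gold)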
++h(2, "saving-loading") Saving and loading
+
+p
+    | After training your model, you'll usually want to save its state, and load
+    | it back later. You can do this with the #[code Language.save_to_directory()]
+    | method:
+
++code.
+    nlp.save_to_directory('/home/me/data/en_technology')
+
+p
+    | To make the model more convenient to deploy, we recommend wrapping it as
+    | a Python package, so that you can install it via pip and load it as a
+    | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
+    | to create all required files and directories.
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_technology /home/me/my_models
+
+p
+    | To build the package and create a #[code .tar.gz] archive, run
+    | #[code python setup.py sdist] from within its directory.
+
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading") saving and loading models].
+
+p
+    | After you've generated and installed the package, you'll be able to
+    | load the model as follows:
+
++code.
+    import en_technology
+    nlp = en_technology.load()
+
++h(2, "example") Example: Adding and training an #[code ANIMAL] entity
+
+p
+    | This script shows how to add a new entity type to an existing pre-trained
+    | NER model. To keep the example short and simple, only four sentences are
+    | provided as examples. In practice, you'll need many more —
+    | #[strong a few hundred] would be a good start. You will also likely need
+    | to mix in #[strong examples of other entity types], which might be
+    | obtained by running the entity recognizer over unlabelled sentences, and
+    | adding their annotations to the training set.
+
+p
+    | For the full, runnable script of this example, see
+    | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].
+
++code("Training the entity recognizer").
+    import random
+
+    import spacy
+    from spacy.gold import GoldParse
+
+    def train_ner(nlp, train_data, output_dir):
+        # Add new words to vocab
+        for raw_text, _ in train_data:
+            doc = nlp.make_doc(raw_text)
+            for word in doc:
+                _ = nlp.vocab[word.orth]
+
+        for itn in range(20):
+            random.shuffle(train_data)
+            for raw_text, entity_offsets in train_data:
+                # Create the doc first, so the gold annotations refer to it
+                doc = nlp.make_doc(raw_text)
+                gold = GoldParse(doc, entities=entity_offsets)
+                nlp.tagger(doc)
+                loss = nlp.entity.update(doc, gold)
+        nlp.end_training()
+        nlp.save_to_directory(output_dir)
+
+    model_name = 'en'
+    entity_label = 'ANIMAL'
+    output_directory = '/path/to/model'
+    train_data = [
+        ("Horses are too tall and they pretend to care about your feelings",
+         [(0, 6, 'ANIMAL')]),
+        ("horses are too tall and they pretend to care about your feelings",
+         [(0, 6, 'ANIMAL')]),
+        ("horses pretend to care about your feelings",
+         [(0, 6, 'ANIMAL')]),
+        ("they pretend to care about your feelings, those horses",
+         [(48, 54, 'ANIMAL')])
+    ]
+
+    nlp = spacy.load(model_name)
+    nlp.entity.add_label(entity_label)
+    train_ner(nlp, train_data, output_directory)
+
+p
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example
+
+p
+    | The actual training is performed by looping over the examples, and
+    | calling #[code nlp.entity.update()]. The #[code update()] method steps
+    | through the words of the input. At each word, it makes a prediction. It
+    | then consults the annotations provided on the #[code GoldParse] instance,
+    | to see whether it was right. If it was wrong, it adjusts its weights so
+    | that the correct action will score higher next time.
+
+p
+    | After training your model, you can
+    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
+    | wrapping models as Python packages, for ease of deployment.
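+p
+    | Before training on annotations like these, it's worth checking that the
+    | character offsets actually line up with the intended entity strings,
+    | since off-by-one errors produce silently bad training data. A small
+    | sketch of such a check (a hypothetical helper, using the
+    | #[code train_data] above):
+
++code.
+    def validate_offsets(train_data, expected=u'horses'):
+        # Every annotated span in this example should contain the word "horses".
+        for raw_text, entity_offsets in train_data:
+            for start, end, label in entity_offsets:
+                span_text = raw_text[start:end]
+                assert span_text.lower() == expected, (span_text, start, end)
+
+    validate_offsets(train_data)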
diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade
index 39f524829..8a5c111bd 100644
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@@ -1,13 +1,10 @@
 include ../../_includes/_mixins

 p
-    | This tutorial describes how to train new statistical models for spaCy's
+    | This workflow describes how to train new statistical models for spaCy's
     | part-of-speech tagger, named entity recognizer and dependency parser.
-
-p
-    | I'll start with some quick code examples, that describe how to train
-    | each model. I'll then provide a bit of background about the algorithms,
-    | and explain how the data and feature templates work.
+    | Once the model is trained, you can then
+    | #[+a("/docs/usage/saving-loading") save and load] it.

 +h(2, "train-pos-tagger") Training the part-of-speech tagger

@@ -48,7 +45,21 @@ p
 p
     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example

-+h(2, "train-entity") Training the dependency parser
++h(2, "extend-entity") Extending the named entity recognizer
+
+p
+    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    | you can update a pre-trained model with new examples. You can even add
+    | new classes to an existing model, to recognise a new entity type,
+    | part-of-speech, or syntactic relation. Updating an existing model is
+    | particularly useful as a "quick and dirty solution" if you have only a
+    | few corrections or annotations.
+
+p.o-inline-list
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
+    +button("/docs/usage/training-ner", false, "secondary") Usage Workflow
+
++h(2, "train-dependency") Training the dependency parser

 +code.
     from spacy.vocab import Vocab
@@ -67,7 +78,7 @@ p
 p
     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example

-+h(2, 'feature-templates') Customizing the feature extraction
++h(2, "feature-templates") Customizing the feature extraction

 p
     | spaCy currently uses linear models for the tagger, parser and entity