commit fa89613444: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -1,3 +1,32 @@
+#!/usr/bin/env python
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
 import random
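The loop this docstring describes appears in full later in this commit (in the new training-ner workflow page); as a compact sketch against the same spaCy 1.x API, with an illustrative label, training pair and output path:

    # Sketch of the docstring's update loop; label, data and path are illustrative.
    import random

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')           # register the new entity type
    train_data = [('Horses are too tall', [(0, 6, 'ANIMAL')])]

    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)                  # entity model expects tagged input
            nlp.entity.update(doc, gold)     # predict, check gold, adjust weights
    nlp.end_training()
    nlp.save_to_directory('/path/to/model')  # then wrap as a package for deployment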
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0
@@ -1,39 +1,38 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 from pathlib import Path
 
-from .util import set_lang_class, get_lang_class, parse_package_meta
+from . import util
 from .deprecated import resolve_model_name
 from .cli import info
 
 from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he
 
 
-set_lang_class(en.English.lang, en.English)
-set_lang_class(de.German.lang, de.German)
-set_lang_class(es.Spanish.lang, es.Spanish)
-set_lang_class(pt.Portuguese.lang, pt.Portuguese)
-set_lang_class(fr.French.lang, fr.French)
-set_lang_class(it.Italian.lang, it.Italian)
-set_lang_class(hu.Hungarian.lang, hu.Hungarian)
-set_lang_class(zh.Chinese.lang, zh.Chinese)
-set_lang_class(nl.Dutch.lang, nl.Dutch)
-set_lang_class(sv.Swedish.lang, sv.Swedish)
-set_lang_class(fi.Finnish.lang, fi.Finnish)
-set_lang_class(bn.Bengali.lang, bn.Bengali)
-set_lang_class(he.Hebrew.lang, he.Hebrew)
+_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
+              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
+              fi.Finnish, bn.Bengali, he.Hebrew)
+
+for _lang in _languages:
+    util.set_lang_class(_lang.lang, _lang)
 
 
 def load(name, **overrides):
-    data_path = overrides.get('path', util.get_data_path())
-    model_name = resolve_model_name(name)
-    meta = parse_package_meta(data_path, model_name, require=False)
+    if overrides.get('path') in (None, False, True):
+        data_path = util.get_data_path()
+        model_name = resolve_model_name(name)
+        model_path = data_path / model_name
+        if not model_path.exists():
+            model_path = None
+            util.print_msg(
+                "Only loading the '{}' tokenizer.".format(name),
+                title="Warning: no model found for '{}'".format(name))
+    else:
+        model_path = util.ensure_path(overrides['path'])
+        data_path = model_path.parent
+    meta = util.parse_package_meta(data_path, model_name, require=False)
     lang = meta['lang'] if meta and 'lang' in meta else name
-    cls = get_lang_class(lang)
+    cls = util.get_lang_class(lang)
     overrides['meta'] = meta
-    model_path = Path(data_path / model_name)
-    if model_path.exists():
-        overrides['path'] = model_path
-
+    overrides['path'] = model_path
     return cls(**overrides)
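As a quick illustration of the two branches of the refactored load() above (the 'en' shortcut and the example directory are assumptions for illustration):

    import spacy

    # No path override: resolve 'en' in the data directory via resolve_model_name().
    nlp = spacy.load('en')

    # Explicit path override: the directory is used as the model path, and its
    # parent becomes the data path for reading the package meta.
    nlp_local = spacy.load('en', path='/home/me/data/en_example_model')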
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, user may have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
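The sketch below is a hypothetical illustration of the control flow this docstring describes, not spaCy's actual implementation; the directory scan and the error message are assumptions:

    # Hypothetical sketch only: resolve a shortcut name to a model directory.
    from pathlib import Path

    def resolve_model_name_sketch(name, data_path):
        data_path = Path(data_path)
        if (data_path / name).exists():      # shortcut link already set up
            return name
        old_models = sorted(p.name for p in data_path.iterdir()
                            if p.is_dir() and p.name.startswith(name + '_'))
        if old_models:                       # old-style install: link and reuse it
            # ... create a shortcut link from `name` to old_models[-1] here ...
            return old_models[-1]
        raise RuntimeError("No model found for '%s'. Please install one." % name)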
@@ -3,9 +3,8 @@ from __future__ import unicode_literals
 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util
 
-from os import path
 import pytest
 
@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import io
 import ujson
 import re
 from pathlib import Path
 
@@ -46,15 +45,6 @@ def ensure_path(path):
     return path
 
 
-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:
 
@@ -103,10 +93,6 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop
 
 
-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
@@ -20,8 +20,10 @@
     "Word vectors": "word-vectors-similarities",
     "Deep learning": "deep-learning",
     "Custom tokenization": "customizing-tokenizer",
+    "Adding languages": "adding-languages",
     "Training": "training",
-    "Adding languages": "adding-languages"
+    "Training NER": "training-ner",
+    "Saving & loading": "saving-loading"
 },
 
 "Examples": {
     "Tutorials": "tutorials",
@@ -101,11 +103,21 @@
 "customizing-tokenizer": {
     "title": "Customizing the tokenizer",
-    "next": "training"
+    "next": "adding-languages"
 },
 
 "training": {
-    "title": "Training the tagger, parser and entity recognizer"
+    "title": "Training spaCy's statistical models",
+    "next": "saving-loading"
 },
 
+"training-ner": {
+    "title": "Training the Named Entity Recognizer",
+    "next": "saving-loading"
+},
+
+"saving-loading": {
+    "title": "Saving and loading models"
+},
+
 "pos-tagging": {
@@ -356,6 +368,18 @@
 },
 
+"code": {
+    "Training a new entity type": {
+        "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+        "author": "Matthew Honnibal",
+        "tags": ["ner", "training"]
+    },
+
+    "Training an NER system from scratch": {
+        "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+        "author": "Matthew Honnibal",
+        "tags": ["ner", "training"]
+    },
+
     "Information extraction": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
         "author": "Matthew Honnibal",
@@ -248,7 +248,7 @@ p
     +tag experimental
 
 p
-    | Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
     | from an existing model data directory. All data files are copied over.
     | If the path to a meta.json is supplied, or a meta.json is found in the
     | input directory, this file is used. Otherwise, the data can be entered
@@ -235,62 +235,13 @@ p
 p
     | If you've trained your own model, for example for
-    | #[+a("/docs/usage/adding-languages") additional languages], you can
-    | create a shortuct link for it by pointing #[code spacy.link] to the
-    | model's data directory. To allow your model to be downloaded and
-    | installed via pip, you'll also need to generate a package for it. You can
-    | do this manually, or via the new
-    | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    | create all required files, and walk you through generating the meta data.
+    | #[+a("/docs/usage/adding-languages") additional languages] or
+    | #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    | state using the #[code Language.save_to_directory()] method. To make the
+    | model more convenient to deploy, we recommend wrapping it as a Python
+    | package.
 
-+infobox("Important note")
-    | The model packages are #[strong not suitable] for the public
-    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    | designed for binary data and files over 50 MB. However, if your company
-    | is running an internal installation of pypi, publishing your models on
-    | there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in                   # to include meta.json
-        ├── meta.json                     # model meta data
-        ├── setup.py                      # setup file for pip installation
-        └── en_core_web_md                # model directory
-            ├── __init__.py               # init for pip installation
-            └── en_core_web_md-1.2.0      # model data
-
-p
-    | You can find templates for all files in our
-    | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
-    | Unless you want to customise installation and loading, the only file
-    | you'll need to modify is #[code meta.json], which includes the model's
-    | meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    | Keep in mind that the directories need to be named according to the
-    | naming conventions. The #[code lang] setting is also used to create the
-    | respective #[code Language] class in spaCy, which will later be returned
-    | by the model's #[code load()] method.
-
-p
-    | To generate the package, run the following command from within the
-    | directory. This will create a #[code .tar.gz] archive in a directory
-    | #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading") saving and loading models].
website/docs/usage/saving-loading.jade (new file, 108 lines)
@@ -0,0 +1,108 @@
include ../../_includes/_mixins

p
    | After training your model, you'll usually want to save its state, and load
    | it back later. You can do this with the #[code Language.save_to_directory()]
    | method:

+code.
    nlp.save_to_directory('/home/me/data/en_example_model')

p
    | The directory will be created if it doesn't exist, and the whole pipeline
    | will be written out. To make the model more convenient to deploy, we
    | recommend wrapping it as a Python package.

+h(2, "generating") Generating a model package

+infobox("Important note")
    | The model packages are #[strong not suitable] for the public
    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
    | designed for binary data and files over 50 MB. However, if your company
    | is running an internal installation of pypi, publishing your models on
    | there can be a convenient solution to share them with your team.

p
    | spaCy comes with a handy CLI command that will create all required files,
    | and walk you through generating the meta data. You can also create the
    | meta.json manually and place it in the model data directory, or supply a
    | path to it using the #[code --meta] flag. For more info on this, see the
    | #[+a("/docs/usage/cli/#package") #[code package] command] documentation.

+aside-code("meta.json", "json").
    {
        "name": "example_model",
        "lang": "en",
        "version": "1.0.0",
        "spacy_version": ">=1.7.0,<2.0.0",
        "description": "Example model for spaCy",
        "author": "You",
        "email": "you@example.com",
        "license": "CC BY-SA 3.0"
    }

+code(false, "bash").
    python -m spacy package /home/me/data/en_example_model /home/me/my_models

p This command will create a model package directory that should look like this:

+code("Directory structure", "yaml").
    └── /
        ├── MANIFEST.in                   # to include meta.json
        ├── meta.json                     # model meta data
        ├── setup.py                      # setup file for pip installation
        └── en_example_model              # model directory
            ├── __init__.py               # init for pip installation
            └── en_example_model-1.0.0    # model data

p
    | You can also find templates for all files in our
    | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
    | If you're creating the package manually, keep in mind that the directories
    | need to be named according to the naming conventions of
    | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The
    | #[code lang] setting in the meta.json is also used to create the
    | respective #[code Language] class in spaCy, which will later be returned
    | by the model's #[code load()] method.

+h(2, "building") Building a model package

p
    | To build the package, run the following command from within the
    | directory. This will create a #[code .tar.gz] archive in a directory
    | #[code /dist].

+code(false, "bash").
    python setup.py sdist

p
    | For more information on building Python packages, see the
    | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].

+h(2, "loading") Loading a model package

p
    | Model packages can be installed by pointing pip to the model's
    | #[code .tar.gz] archive:

+code(false, "bash").
    pip install /path/to/en_example_model-1.0.0.tar.gz

p You'll then be able to load the model as follows:

+code.
    import en_example_model
    nlp = en_example_model.load()

p
    | To load the model via #[code spacy.load()], you can also
    | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
    | package name to a custom model name of your choice:

+code(false, "bash").
    python -m spacy link en_example_model example

+code.
    import spacy
    nlp = spacy.load('example')
website/docs/usage/training-ner.jade (new file, 174 lines)
@@ -0,0 +1,174 @@
include ../../_includes/_mixins

p
    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
    | you can update a pre-trained model with new examples. You can even add
    | new classes to an existing model, to recognise a new entity type,
    | part-of-speech, or syntactic relation. Updating an existing model is
    | particularly useful as a "quick and dirty solution", if you have only a
    | few corrections or annotations.

+h(2, "improving-accuracy") Improving accuracy on existing entity types

p
    | To update the model, you first need to create an instance of
    | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
    | you want to learn. You will then pass this instance to the
    | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
    | method. For example:

+code.
    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    doc = nlp.make_doc(u'Facebook released React in 2014')
    gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
    nlp.entity.update(doc, gold)

p
    | You'll usually need to provide many examples to meaningfully improve the
    | system — a few hundred is a good start, although more is better. You
    | should avoid iterating over the same few examples multiple times, or the
    | model is likely to "forget" how to annotate other examples. If you
    | iterate over the same few examples, you're effectively changing the loss
    | function. The optimizer will find a way to minimize the loss on your
    | examples, without regard for the consequences on the examples it's no
    | longer paying attention to.

p
    | One way to avoid this "catastrophic forgetting" problem is to "remind"
    | the model of other examples by augmenting your annotations with sentences
    | annotated with entities automatically recognised by the original model.
    | Ultimately, this is an empirical process: you'll need to
    | #[strong experiment on your own data] to find a solution that works best
    | for you.

+h(2, "adding") Adding a new entity type

p
    | You can add new entity types to an existing model. Let's say we want to
    | recognise the category #[code TECHNOLOGY]. The new category will include
    | programming languages, frameworks and platforms. First, we need to
    | register the new entity type:

+code.
    nlp.entity.add_label('TECHNOLOGY')

p
    | Next, iterate over your examples, calling #[code entity.update()]. As
    | above, we want to avoid iterating over only a small number of sentences.
    | A useful compromise is to run the model over a number of plain-text
    | sentences, and pass the entities to #[code GoldParse], as "true"
    | annotations. This encourages the optimizer to find a solution that
    | predicts the new category with minimal difference from the previous
    | output.
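p
    | As a rough sketch of that compromise (the unlabelled sentence below and
    | the offset conversion are illustrative assumptions, using the same
    | spaCy 1.x API as the examples on this page):

+code.
    # Sketch: keep the current model's entities as "true" annotations
    texts = [u'Google was founded in 1998.']   # illustrative unlabelled text
    augmented = []
    for text in texts:
        doc = nlp(text)                        # current model's predictions
        offsets = [(ent.start_char, ent.end_char, ent.label_)
                   for ent in doc.ents]
        augmented.append((text, offsets))
    # Shuffle these pairs in with your TECHNOLOGY annotations before calling
    # nlp.entity.update() on each example.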
+h(2, "saving-loading") Saving and loading

p
    | After training your model, you'll usually want to save its state, and load
    | it back later. You can do this with the #[code Language.save_to_directory()]
    | method:

+code.
    nlp.save_to_directory('/home/me/data/en_technology')

p
    | To make the model more convenient to deploy, we recommend wrapping it as
    | a Python package, so that you can install it via pip and load it as a
    | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
    | to create all required files and directories.

+code(false, "bash").
    python -m spacy package /home/me/data/en_technology /home/me/my_models

p
    | To build the package and create a #[code .tar.gz] archive, run
    | #[code python setup.py sdist] from within its directory.

+infobox("Saving and loading models")
    | For more information and a detailed guide on how to package your model,
    | see the documentation on
    | #[+a("/docs/usage/saving-loading") saving and loading models].

p
    | After you've generated and installed the package, you'll be able to
    | load the model as follows:

+code.
    import en_technology
    nlp = en_technology.load()

+h(2, "example") Example: Adding and training an #[code ANIMAL] entity

p
    | This script shows how to add a new entity type to an existing pre-trained
    | NER model. To keep the example short and simple, only four sentences are
    | provided as examples. In practice, you'll need many more —
    | #[strong a few hundred] would be a good start. You will also likely need
    | to mix in #[strong examples of other entity types], which might be
    | obtained by running the entity recognizer over unlabelled sentences, and
    | adding their annotations to the training set.

p
    | For the full, runnable script of this example, see
    | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].

+code("Training the entity recognizer").
    import spacy
    from spacy.pipeline import EntityRecognizer
    from spacy.gold import GoldParse
    from spacy.tagger import Tagger
    import random

    def train_ner(nlp, train_data, output_dir):
        # Add new words to vocab
        for raw_text, _ in train_data:
            doc = nlp.make_doc(raw_text)
            for word in doc:
                _ = nlp.vocab[word.orth]

        for itn in range(20):
            random.shuffle(train_data)
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                nlp.tagger(doc)
                loss = nlp.entity.update(doc, gold)
        nlp.end_training()
        nlp.save_to_directory(output_dir)

    model_name = 'en'
    entity_label = 'ANIMAL'
    output_directory = '/path/to/model'
    train_data = [
        ("Horses are too tall and they pretend to care about your feelings",
         [(0, 6, 'ANIMAL')]),
        ("horses are too tall and they pretend to care about your feelings",
         [(0, 6, 'ANIMAL')]),
        ("horses pretend to care about your feelings",
         [(0, 6, 'ANIMAL')]),
        ("they pretend to care about your feelings, those horses",
         [(48, 54, 'ANIMAL')])
    ]

    nlp = spacy.load(model_name)
    nlp.entity.add_label(entity_label)
    ner = train_ner(nlp, train_data, output_directory)

p
    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example

p
    | The actual training is performed by looping over the examples, and
    | calling #[code nlp.entity.update()]. The #[code update()] method steps
    | through the words of the input. At each word, it makes a prediction. It
    | then consults the annotations provided on the #[code GoldParse] instance,
    | to see whether it was right. If it was wrong, it adjusts its weights so
    | that the correct action will score higher next time.

p
    | After training your model, you can
    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
    | models as Python packages, for ease of deployment.
@@ -1,13 +1,10 @@
 include ../../_includes/_mixins
 
 p
-    | This tutorial describes how to train new statistical models for spaCy's
+    | This workflow describes how to train new statistical models for spaCy's
     | part-of-speech tagger, named entity recognizer and dependency parser.
 
 p
-    | I'll start with some quick code examples, that describe how to train
-    | each model. I'll then provide a bit of background about the algorithms,
-    | and explain how the data and feature templates work.
+    | Once the model is trained, you can then
+    | #[+a("/docs/usage/saving-loading") save and load] it.
 
 +h(2, "train-pos-tagger") Training the part-of-speech tagger
@@ -48,7 +45,21 @@ p
 p
     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example
 
-+h(2, "train-entity") Training the dependency parser
++h(2, "extend-entity") Extending the named entity recognizer
+
+p
+    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    | you can update a pre-trained model with new examples. You can even add
+    | new classes to an existing model, to recognise a new entity type,
+    | part-of-speech, or syntactic relation. Updating an existing model is
+    | particularly useful as a "quick and dirty solution", if you have only a
+    | few corrections or annotations.
+
+p.o-inline-list
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
+    +button("/docs/usage/training-ner", false, "secondary") Usage Workflow
+
++h(2, "train-dependency") Training the dependency parser
 
 +code.
     from spacy.vocab import Vocab
@@ -67,7 +78,7 @@ p
 p
     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
 
-+h(2, 'feature-templates') Customizing the feature extraction
++h(2, "feature-templates") Customizing the feature extraction
 
 p
     | spaCy currently uses linear models for the tagger, parser and entity