spaCy/examples/training/train_ner.py
Sofie Van Landeghem fcbf899b08
Feature/example only (#5707)
* remove _convert_examples

* fix test_gold, raise TypeError if tuples are used instead of Example's

* throwing proper errors when the wrong type of objects are passed

* fix deprectated format in tests

* fix deprectated format in parser tests

* fix tests for NEL, morph, senter, tagger, textcat

* update regression tests with new Example format

* use make_doc

* more fixes to nlp.update calls

* few more small fixes for rehearse and evaluate

* only import ml_datasets if really necessary
2020-07-06 13:02:36 +02:00

119 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function
import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.gold import Example
from spacy.util import minibatch, compounding
# training data
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
@plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),
)
def main(model=None, output_dir=None, n_iter=100):
"""Load the model, set up the pipeline and train the entity recognizer."""
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if "simple_ner" not in nlp.pipe_names:
ner = nlp.create_pipe("simple_ner")
nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
ner = nlp.get_pipe("simple_ner")
# add labels and create Example objects
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
print("Add label", ent[2])
ner.add_label(ent[2])
with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module="spacy")
# reset and initialize the weights randomly but only if we're
# training a new model
if model is None:
nlp.begin_training()
print(
"Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))
)
for itn in range(n_iter):
random.shuffle(train_examples)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
batch,
drop=0.0, # dropout - make it harder to memorise data
losses=losses,
)
print("Losses", losses)
# test the trained model
for text, _ in TRAIN_DATA:
doc = nlp(text)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
# save model to output directory
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
doc = nlp2(text)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == "__main__":
plac.call(main)
# Expected output:
# Entities [('Shaka Khan', 'PERSON')]
# Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
# ('Khan', 'PERSON', 1), ('?', '', 2)]
# Entities [('London', 'LOC'), ('Berlin', 'LOC')]
# Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
# ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]