spaCy/examples/training/train_ner_standalone.py

213 lines
6.8 KiB
Python
Raw Normal View History

2017-04-15 16:43:14 +03:00
#!/usr/bin/env python
2017-03-19 17:01:38 +03:00
'''Example of training a named entity recognition system from scratch using spaCy
This example is written to be self-contained and reasonably transparent.
To achieve that, it duplicates some of spaCy's internal functionality.
Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simple Pipeline class, so that it's easier to see how the pieces
interact.
Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
Developed for: spaCy 1.7.1
2017-09-15 11:36:46 +03:00
Last tested for: spaCy 2.0.0a13
2017-03-19 17:01:38 +03:00
'''
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
import random
import json
2017-09-15 11:36:46 +03:00
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import tqdm
2017-03-19 17:01:38 +03:00
from spacy.vocab import Vocab
2017-09-15 11:36:46 +03:00
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
2017-03-19 17:01:38 +03:00
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
2017-09-15 11:36:46 +03:00
from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
2017-03-19 17:01:38 +03:00
from spacy.scorer import Scorer
2017-09-15 11:36:46 +03:00
import spacy.util
2017-03-19 17:01:38 +03:00
# Python 2/3 compatibility shim: when the name `unicode` is not defined
# (the NameError branch), alias it to `str` so that the isinstance() check
# in Pipeline.make_doc works on either interpreter.
try:
    unicode
except NameError:
    unicode = str
2017-09-15 11:36:46 +03:00
# Module-level side effect, executed at import time.
# NOTE(review): presumably switches on spaCy's environment/debug logging --
# confirm against spacy.util.set_env_log.
spacy.util.set_env_log(True)
2017-03-19 17:01:38 +03:00
def init_vocab():
    """Create a new Vocab with getters for LOWER, NORM, PREFIX and SUFFIX.

    Each getter receives a string and returns the attribute value derived
    from it.
    """
    def _lowercase(text):
        return text.lower()

    def _first_char(text):
        return text[0]

    def _last_three(text):
        return text[-3:]

    getters = {
        LOWER: _lowercase,
        # NORM is computed exactly like LOWER in this example.
        NORM: _lowercase,
        PREFIX: _first_char,
        SUFFIX: _last_three,
    }
    return Vocab(lex_attr_getters=getters)
class Pipeline(object):
    """Small, explicit replacement for spaCy's built-in Language class.

    Holds a vocab, a tokenizer, a tensorizer and an entity recogniser, and
    exposes just enough methods (call, update, evaluate, to_disk, from_disk)
    to train and run a named entity recogniser.
    """

    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
        """Store the given components, building a default for any left as None."""
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            # NOTE(review): the empty dict and the three None arguments are
            # presumably "no special-case rules, no prefix/suffix/infix
            # patterns" -- confirm against the Tokenizer signature.
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if tensorizer is None:
            tensorizer = TokenVectorEncoder(vocab)
        if entity is None:
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tensorizer = tensorizer
        self.entity = entity
        # Components applied, in this order, by __call__ and begin_training.
        self.pipeline = [self.tensorizer, self.entity]

    def begin_training(self):
        """Call begin_training([]) on every component and return an optimizer.

        The optimizer is an Adam instance built on NumpyOps with 0.001 as
        its second argument.
        """
        for model in self.pipeline:
            model.begin_training([])
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer

    def __call__(self, input_):
        """Build a Doc from `input_` and run every pipeline component on it."""
        doc = self.make_doc(input_)
        for process in self.pipeline:
            process(doc)
        return doc

    def make_doc(self, input_):
        """Turn `input_` into a Doc.

        Bytes are decoded as UTF-8 first. Text is passed to the tokenizer;
        anything else is used as the `words` argument of Doc.
        """
        if isinstance(input_, bytes):
            input_ = input_.decode('utf8')
        if isinstance(input_, unicode):
            return self.tokenizer(input_)
        else:
            return Doc(self.vocab, words=input_)

    def make_gold(self, input_, annotations):
        """Return a GoldParse built from a fresh Doc of `input_`.

        `annotations` is passed to GoldParse as its `entities` argument.
        """
        doc = self.make_doc(input_)
        gold = GoldParse(doc, entities=annotations)
        return gold

    def update(self, inputs, annots, sgd, losses=None, drop=0.):
        """Run one training step on a batch and return the `losses` dict.

        inputs: sequence of inputs accepted by make_doc.
        annots: entity annotations, parallel to `inputs`.
        sgd: optimizer, forwarded to the component updates.
        losses: optional dict forwarded to the entity recogniser's update;
            a new dict is created when omitted.
        drop: dropout value forwarded to both component updates.
        """
        if losses is None:
            losses = {}
        docs = [self.make_doc(input_) for input_ in inputs]
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]
        # The tensorizer update returns its output together with a callback.
        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
                                       sgd=sgd, losses=losses)
        # Feed what the entity update returned (presumably the gradient with
        # respect to `tensors`) back through the tensorizer's callback.
        bp_tensors(d_tensors, sgd=sgd)
        return losses

    def evaluate(self, examples):
        """Score the pipeline on (input, annotation) pairs.

        Returns the `scores` attribute of a Scorer that has seen every
        predicted Doc together with its gold parse.
        """
        scorer = Scorer()
        for input_, annot in examples:
            gold = self.make_gold(input_, annot)
            doc = self(input_)
            scorer.score(doc, gold)
        return scorer.scores

    def to_disk(self, path):
        """Save vocab, tensorizer and entity recogniser below directory `path`.

        The directory is created when missing, including any missing parent
        directories. Raises IOError if `path` exists but is not a directory.
        """
        path = Path(path)
        if not path.exists():
            # parents=True so that a nested output location such as
            # "models/ner/run1" works without preparing it by hand.
            path.mkdir(parents=True)
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        self.vocab.to_disk(path / 'vocab')
        self.tensorizer.to_disk(path / 'tensorizer')
        self.entity.to_disk(path / 'ner')

    def from_disk(self, path):
        """Load vocab, tensorizer and entity recogniser from directory `path`.

        Returns self, so the call can be chained. Raises IOError if `path`
        does not exist or is not a directory.
        """
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
        self.entity = self.entity.from_disk(path / 'ner')
        # The attributes above are rebound to whatever from_disk returned;
        # rebuild the list so __call__ cannot keep using stale components.
        self.pipeline = [self.tensorizer, self.entity]
        return self
2017-03-19 17:01:38 +03:00
2017-09-15 11:36:46 +03:00
def train(nlp, train_examples, dev_examples, nr_epoch=5):
    """Train `nlp` for `nr_epoch` epochs, reporting dev scores after each one.

    nlp: object providing begin_training(), update() and evaluate().
    train_examples: list of (input, annotation) pairs. It is shuffled IN
        PLACE at the start of every epoch.
    dev_examples: (input, annotation) pairs handed to nlp.evaluate(). It is
        evaluated several times, so it must be re-iterable (e.g. a list).
    nr_epoch: number of passes over the training data.
    """
    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    # Defined up front so the final report also works when nr_epoch is 0.
    losses = {}
    for i in range(nr_epoch):
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        # .get(): the 'ner' key is absent when no update ran in this epoch.
        report_scores(i, losses.get('ner', 0.0), scores)
    # Final summary line. The previous version of this call referenced
    # undefined names (`channels`, `loss`) and passed one argument too many.
    scores = nlp.evaluate(dev_examples)
    report_scores(nr_epoch, losses.get('ner', 0.0), scores)
def report_scores(i, loss, scores):
    """Print one result row: iteration, loss, precision, recall, F-measure.

    i: iteration number, printed as an integer.
    loss: loss value, truncated to an integer for display.
    scores: mapping with the keys 'ents_p', 'ents_r' and 'ents_f'; each
        value is printed with two decimals.
    """
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    # Five fields, matching the "Iter Loss P R F" header printed by train();
    # the iteration number used to be accepted but left out of the row.
    print('%d %d %s %s %s' % (i, int(loss), precision, recall, f_measure))
2017-03-19 17:01:38 +03:00
def read_examples(path):
    """Yield one (words, biluo_tags) pair per sentence of a tab-separated file.

    Sentences are separated by blank lines. Lines starting with '#' at the
    top of a sentence are dropped. Of every remaining non-blank line, the
    second tab-separated column is taken as the word and the third as its
    IOB tag; the tags of a sentence are converted with iob_to_biluo.
    """
    path = Path(path)
    # Decode explicitly instead of relying on the platform default.
    # NOTE(review): UTF-8 is assumed for the GermEval 2014 data -- confirm.
    with path.open(encoding='utf8') as file_:
        sents = file_.read().strip().split('\n\n')
    # Everything was read above, so the file is already closed while the
    # generator is being consumed.
    for sent in sents:
        sent = sent.strip()
        if not sent:
            continue
        tokens = sent.split('\n')
        while tokens and tokens[0].startswith('#'):
            tokens.pop(0)
        words = []
        iob = []
        for token in tokens:
            if token.strip():
                pieces = token.split('\t')
                words.append(pieces[1])
                iob.append(pieces[2])
        yield words, iob_to_biluo(iob)
2017-09-15 11:36:46 +03:00
def get_labels(examples):
    """Return the sorted list of distinct entity labels found in `examples`.

    examples: iterable of (words, tags) pairs. A tag contributes a label
        only when it has the form '<prefix>-<label>' with a non-empty
        label, e.g. 'B-PER' gives 'PER'. Tags without a hyphen (such as
        'O') and a bare '-' are ignored.
    """
    labels = set()
    for _, tags in examples:
        for tag in tags:
            # Split on the FIRST hyphen only, so a label that itself
            # contains a hyphen is kept whole.
            _, sep, label = tag.partition('-')
            if sep and label:
                labels.add(label)
    return sorted(labels)
2017-03-19 17:01:38 +03:00
@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
    """Train an entity recogniser on `train_loc` and save it to `model_dir`.

    The development data in `dev_loc` is used for the scores reported
    during training; `nr_epoch` is forwarded to train().
    """
    print(model_dir, train_loc, dev_loc)
    training_set = list(read_examples(train_loc))
    # read_examples is a generator: the development file is not opened
    # until the stream is materialised just before training.
    dev_stream = read_examples(dev_loc)

    nlp = Pipeline()
    entity_labels = get_labels(training_set)
    for label in entity_labels:
        nlp.entity.add_label(label)
        print("Add label", label)

    dev_set = list(dev_stream)
    train(nlp, training_set, dev_set, nr_epoch)

    nlp.to_disk(model_dir)
2017-03-19 17:01:38 +03:00
# Script entry point: only when the file is executed directly, hand main()
# to plac, which calls it with arguments taken from the command line.
if __name__ == '__main__':
    plac.call(main)