From 07726cf0a63939daaaa90bd15b752707989dce88 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 19 Mar 2017 15:01:38 +0100
Subject: [PATCH] Add example of standalone NER training

---
 examples/train_ner_standalone.py | 235 +++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 examples/train_ner_standalone.py

diff --git a/examples/train_ner_standalone.py b/examples/train_ner_standalone.py
new file mode 100644
index 000000000..612e7bec3
--- /dev/null
+++ b/examples/train_ner_standalone.py
@@ -0,0 +1,235 @@
+'''Example of training a named entity recognition system from scratch using spaCy
+
+This example is written to be self-contained and reasonably transparent.
+To achieve that, it duplicates some of spaCy's internal functionality.
+
+Specifically, in this example, we don't use spaCy's built-in Language class to
+wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
+our own simple Pipeline class, so that it's easier to see how the pieces
+interact.
+
+Input data:
+https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
+
+Developed for: spaCy 1.7.1
+Last tested for: spaCy 1.7.1
+'''
+from __future__ import unicode_literals, print_function
+import plac
+from pathlib import Path
+import random
+import json
+
+import spacy.orth as orth_funcs
+from spacy.vocab import Vocab
+from spacy.pipeline import BeamEntityRecognizer
+from spacy.pipeline import EntityRecognizer
+from spacy.tokenizer import Tokenizer
+from spacy.tokens import Doc
+from spacy.attrs import *
+from spacy.gold import GoldParse
+from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.scorer import Scorer
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+def init_vocab():
+    return Vocab(
+        lex_attr_getters={
+            LOWER: lambda string: string.lower(),
+            SHAPE: orth_funcs.word_shape,
+            PREFIX: lambda string: string[0],
+            SUFFIX: lambda string: string[-3:],
+            CLUSTER: lambda string: 0,
+            IS_ALPHA: orth_funcs.is_alpha,
+            IS_ASCII: orth_funcs.is_ascii,
+            IS_DIGIT: lambda string: string.isdigit(),
+            IS_LOWER: orth_funcs.is_lower,
+            IS_PUNCT: orth_funcs.is_punct,
+            IS_SPACE: lambda string: string.isspace(),
+            IS_TITLE: orth_funcs.is_title,
+            IS_UPPER: orth_funcs.is_upper,
+            IS_STOP: lambda string: False,
+            IS_OOV: lambda string: True
+        })
+
+
+def save_vocab(vocab, path):
+    path = Path(path)
+    if not path.exists():
+        path.mkdir()
+    elif not path.is_dir():
+        raise IOError("Can't save vocab to %s\nNot a directory" % path)
+    with (path / 'strings.json').open('w') as file_:
+        vocab.strings.dump(file_)
+    vocab.dump((path / 'lexemes.bin').as_posix())
+
+
+def load_vocab(path):
+    path = Path(path)
+    if not path.exists():
+        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
+    if not path.is_dir():
+        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
+    return Vocab.load(path)
+
+
+def init_ner_model(vocab, features=None):
+    if features is None:
+        features = tuple(EntityRecognizer.feature_templates)
+    return BeamEntityRecognizer(vocab, features=features)
+
+
+def save_ner_model(model, path):
+    path = Path(path)
+    if not path.exists():
+        path.mkdir()
+    if not path.is_dir():
+        raise IOError("Can't save model to %s\nNot a directory" % path)
+    model.model.dump((path / 'model').as_posix())
+    with (path / 'config.json').open('w') as file_:
+        data = json.dumps(model.cfg)
+        if not isinstance(data, unicode):
+            data = data.decode('utf8')
+        file_.write(data)
+
+
+def load_ner_model(vocab, path):
+    return BeamEntityRecognizer.load(path, vocab)
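+
+
+# A minimal sketch (not used by the training script itself) showing how the
+# save/load helpers above fit together. The path is hypothetical, and the
+# model here is freshly initialised rather than trained.
+def roundtrip_ner_model(path='/tmp/ner_demo'):
+    path = Path(path)
+    if not path.exists():
+        path.mkdir()
+    vocab = init_vocab()
+    ner = init_ner_model(vocab)
+    save_vocab(vocab, path / 'vocab')
+    save_ner_model(ner, path / 'ner')
+    reloaded_vocab = load_vocab(path / 'vocab')
+    return load_ner_model(reloaded_vocab, path / 'ner')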
+
+
+class Pipeline(object):
+    @classmethod
+    def load(cls, path):
+        path = Path(path)
+        if not path.exists():
+            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
+        if not path.is_dir():
+            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
+        vocab = load_vocab(path / 'vocab')
+        tokenizer = Tokenizer(vocab, {}, None, None, None)
+        ner_model = load_ner_model(vocab, path / 'ner')
+        return cls(vocab, tokenizer, ner_model)
+
+    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
+        # Create default components for anything not supplied, then always
+        # assign the attributes, so that passed-in components are kept too.
+        if vocab is None:
+            vocab = init_vocab()
+        if tokenizer is None:
+            tokenizer = Tokenizer(vocab, {}, None, None, None)
+        if ner_model is None:
+            ner_model = init_ner_model(vocab)
+        self.vocab = vocab
+        self.tokenizer = tokenizer
+        self.entity = ner_model
+        self.pipeline = [self.entity]
+
+    def __call__(self, input_):
+        doc = self.make_doc(input_)
+        for process in self.pipeline:
+            process(doc)
+        return doc
+
+    def make_doc(self, input_):
+        if isinstance(input_, bytes):
+            input_ = input_.decode('utf8')
+        if isinstance(input_, unicode):
+            return self.tokenizer(input_)
+        else:
+            return Doc(self.vocab, words=input_)
+
+    def make_gold(self, input_, annotations):
+        doc = self.make_doc(input_)
+        gold = GoldParse(doc, entities=annotations)
+        return gold
+
+    def update(self, input_, annot):
+        doc = self.make_doc(input_)
+        gold = self.make_gold(input_, annot)
+        # Register every label in the gold standard with the model before
+        # updating on the example.
+        for ner in gold.ner:
+            if ner not in (None, '-', 'O'):
+                action, label = ner.split('-', 1)
+                self.entity.add_label(label)
+        return self.entity.update(doc, gold)
+
+    def evaluate(self, examples):
+        scorer = Scorer()
+        for input_, annot in examples:
+            gold = self.make_gold(input_, annot)
+            doc = self(input_)
+            scorer.score(doc, gold)
+        return scorer.scores
+
+    def average_weights(self):
+        self.entity.model.end_training()
+
+    def save(self, path):
+        path = Path(path)
+        if not path.exists():
+            path.mkdir()
+        elif not path.is_dir():
+            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
+        save_vocab(self.vocab, path / 'vocab')
+        save_ner_model(self.entity, path / 'ner')
+
+
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
+    next_epoch = train_examples
+    print("Iter", "Loss", "P", "R", "F")
+    for i in range(nr_epoch):
+        this_epoch = next_epoch
+        next_epoch = []
+        loss = 0
+        for input_, annot in this_epoch:
+            loss += nlp.update(input_, annot)
+            if (i+1) < nr_epoch:
+                next_epoch.append((input_, annot))
+        random.shuffle(next_epoch)
+        scores = nlp.evaluate(dev_examples)
+        precision = '%.2f' % scores['ents_p']
+        recall = '%.2f' % scores['ents_r']
+        f_measure = '%.2f' % scores['ents_f']
+        print(i, int(loss), precision, recall, f_measure)
+    nlp.average_weights()
+    scores = nlp.evaluate(dev_examples)
+    print("After averaging")
+    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
+
+
+def read_examples(path):
+    # The GermEval 2014 data has one token per line, blank lines between
+    # sentences, and '#' comment lines before each sentence. Token lines
+    # hold whitespace-separated columns: pieces[1] is the word and
+    # pieces[2] the outer NER tag in IOB format.
+    path = Path(path)
+    with path.open() as file_:
+        sents = file_.read().strip().split('\n\n')
+        for sent in sents:
+            if not sent.strip():
+                continue
+            tokens = sent.split('\n')
+            while tokens and tokens[0].startswith('#'):
+                tokens.pop(0)
+            words = []
+            iob = []
+            for token in tokens:
+                if token.strip():
+                    pieces = token.split()
+                    words.append(pieces[1])
+                    iob.append(pieces[2])
+            yield words, iob_to_biluo(iob)
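+
+
+# Illustration (hypothetical tokens, not drawn from the corpus): given a
+# sentence block like
+#
+#     1 Angela  B-PER O
+#     2 Merkel  I-PER O
+#     3 spricht O     O
+#
+# read_examples yields (['Angela', 'Merkel', 'spricht'],
+# ['B-PER', 'L-PER', 'O']). iob_to_biluo converts the IOB tags to the
+# BILUO scheme the recognizer is trained on: the last token of a
+# multi-token entity becomes L-, and a single-token entity would
+# become U- ("unit").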
+
+
+@plac.annotations(
+    model_dir=("Path to save the model", "positional", None, Path),
+    train_loc=("Path to your training data", "positional", None, Path),
+    dev_loc=("Path to your development data", "positional", None, Path),
+    nr_epoch=("Number of training epochs", "option", "n", int),
+)
+def main(model_dir, train_loc, dev_loc, nr_epoch=10):
+    train_examples = read_examples(train_loc)
+    # The dev set is scored after every epoch, so it must be a list -- a
+    # generator would be exhausted after the first evaluation.
+    dev_examples = list(read_examples(dev_loc))
+    nlp = Pipeline()
+
+    train(nlp, train_examples, dev_examples, nr_epoch)
+
+    nlp.save(model_dir)
+
+
+if __name__ == '__main__':
+    plac.call(main)
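+
+
+# Expected invocation, with hypothetical paths (the GermEval archive linked
+# in the docstring provides the train/dev files):
+#
+#     python examples/train_ner_standalone.py /tmp/ner_model \
+#         train.tsv dev.tsv
+#
+# plac maps the three positional arguments to model_dir, train_loc and
+# dev_loc; nr_epoch is exposed as an optional -n flag (default 10).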