From e5151056cfbcbcc2899b43fadbce28de209d97b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Oct 2016 11:41:20 +0200 Subject: [PATCH] Fix NER training example --- examples/train_ner.py | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 examples/train_ner.py diff --git a/examples/train_ner.py b/examples/train_ner.py new file mode 100644 index 000000000..3f748a488 --- /dev/null +++ b/examples/train_ner.py @@ -0,0 +1,64 @@ +from __future__ import unicode_literals, print_function +import json +import pathlib +import random + +import spacy +from spacy.pipeline import EntityRecognizer +from spacy.gold import GoldParse + + +def train_ner(nlp, train_data, entity_types): + ner = EntityRecognizer.blank(nlp.vocab, entity_types=entity_types, + features=nlp.Defaults.entity_features) + for itn in range(5): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + ner.update(doc, gold) + ner.model.end_training() + return ner + + +def main(model_dir=None): + if model_dir is not None: + model_dir = pathlb.Path(model_dir) + if not model_dir.exists(): + model_dir.mkdir() + assert model_dir.isdir() + + nlp = spacy.load('en', parser=False, entity=False, vectors=False) + + train_data = [ + ( + 'Who is Shaka Khan?', + [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')] + ), + ( + 'I like London and Berlin.', + [(len('I like '), len('I like London'), 'LOC'), + (len('I like London and '), len('I like London and Berlin'), 'LOC')] + ) + ] + ner = train_ner(nlp, train_data, ['PERSON', 'LOC']) + + doc = nlp.make_doc('Who is Shaka Khan?') + nlp.tagger(doc) + ner(doc) + for word in doc: + print(word.text, word.tag_, word.ent_type_, word.ent_iob) + + if model_dir is not None: + with (model_dir / 'config.json').open('wb') as file_: + json.dump(ner.cfg, file_) + ner.model.dump(str(model_dir / 'model')) + + +if __name__ == '__main__': + main() + # Who "" 2 + # is "" 2 + # Shaka "" PERSON 3 + # Khan "" PERSON 1 + # ? "" 2