From ba8841234ad250274065c4c40308d566a42bef11 Mon Sep 17 00:00:00 2001 From: kendricktan Date: Mon, 24 Oct 2016 16:09:23 +1000 Subject: [PATCH] Fixed training examples Changes: 1. train_ner won't crash if the data directory is not found 2. Fixed train_tagger error: expected spacy.gold.GoldParse, got list --- examples/training/train_ner.py | 10 ++++++++++ examples/training/train_tagger.py | 27 +++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 7e08f23d5..8c96dc0a4 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -6,6 +6,7 @@ import random import spacy from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse +from spacy.tagger import Tagger def train_ner(nlp, train_data, entity_types): @@ -29,6 +30,15 @@ def main(model_dir=None): nlp = spacy.load('en', parser=False, entity=False, add_vectors=False) + # v1.1.2 onwards + if nlp.tagger is None: + print('---- WARNING ----') + print('Data directory not found') + print('please run: `python -m spacy.en.download –force all` for better performance') + print('Using feature templates for tagging') + print('-----------------') + nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates) + train_data = [ ( 'Who is Shaka Khan?', diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 9fac234dc..6d8f66630 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -10,8 +10,9 @@ from pathlib import Path from spacy.vocab import Vocab from spacy.tagger import Tagger from spacy.tokens import Doc -import random +from spacy.gold import GoldParse +import random # You need to define a mapping from your data's part-of-speech tag names to the # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. 
@@ -20,24 +21,25 @@ import random # You may also specify morphological features for your tags, from the universal # scheme. TAG_MAP = { - 'N': {"pos": "NOUN"}, - 'V': {"pos": "VERB"}, - 'J': {"pos": "ADJ"} - } + 'N': {"pos": "NOUN"}, + 'V': {"pos": "VERB"}, + 'J': {"pos": "ADJ"} +} # Usually you'll read this in, of course. Data formats vary. # Ensure your strings are unicode. DATA = [ ( ["I", "like", "green", "eggs"], - ["N", "V", "J", "N"] + ["N", "V", "J", "N"] ), ( ["Eat", "blue", "ham"], - ["V", "J", "N"] + ["V", "J", "N"] ) ] - + + def ensure_dir(path): if not path.exists(): path.mkdir() @@ -49,18 +51,19 @@ def main(output_dir=None): ensure_dir(output_dir) ensure_dir(output_dir / "pos") ensure_dir(output_dir / "vocab") - + vocab = Vocab(tag_map=TAG_MAP) # The default_templates argument is where features are specified. See # spacy/tagger.pyx for the defaults. tagger = Tagger(vocab) - for i in range(5): + for i in range(25): for words, tags in DATA: doc = Doc(vocab, words=words) - tagger.update(doc, tags) + gold = GoldParse(doc, tags=tags) + tagger.update(doc, gold) random.shuffle(DATA) tagger.model.end_training() - doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True]*4)) + doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4)) tagger(doc) for word in doc: print(word.text, word.tag_, word.pos_)