diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index b51a4a10c..7e678a3d1 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -14,55 +14,49 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best ('hotel', 'PLACE', 'show') --> show PLACE hotel ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin + +Developed for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, print_function import plac import random import spacy -from spacy.gold import GoldParse -from spacy.tokens import Doc from pathlib import Path -# training data: words, head and dependency labels +# training data: texts, heads and dependency labels # for no relation, we simply chose an arbitrary dependency label, e.g. '-' TRAIN_DATA = [ - ( - ['find', 'a', 'cafe', 'with', 'great', 'wifi'], - [0, 2, 0, 5, 5, 2], # index of token head - ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] - ), - ( - ['find', 'a', 'hotel', 'near', 'the', 'beach'], - [0, 2, 0, 5, 5, 2], - ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] - ), - ( - ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'], - [0, 0, 4, 4, 0, 6, 4, 6, 6], - ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] - ), - ( - ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'], - [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! 
- ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] - ), - ( - ['find', 'a', 'nice', 'restaurant', 'in', 'london'], - [0, 3, 3, 0, 3, 3], - ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] - ), - ( - ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'], - [0, 0, 4, 4, 0, 4, 4], - ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] - ), - ( - ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'], - [0, 4, 4, 4, 0, 4, 5], - ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] - ) + ("find a cafe with great wifi", { + 'heads': [0, 2, 0, 5, 5, 2], # index of token head + 'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] + }), + ("find a hotel near the beach", { + 'heads': [0, 2, 0, 5, 5, 2], + 'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] + }), + ("find me the closest gym that's open late", { + 'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6], + 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] + }), + ("show me the cheapest store that sells flowers", { + 'heads': [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! 
+ 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] + }), + ("find a nice restaurant in london", { + 'heads': [0, 3, 3, 0, 3, 3], + 'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + }), + ("show me the coolest hostel in berlin", { + 'heads': [0, 0, 4, 4, 0, 4, 4], + 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] + }), + ("find a good italian restaurant near work", { + 'heads': [0, 4, 4, 4, 0, 4, 5], + 'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] + }) ] @@ -88,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=100): else: parser = nlp.get_pipe('parser') - for _, _, deps in TRAIN_DATA: - for dep in deps: + for text, annotations in TRAIN_DATA: + for dep in annotations.get('deps', []): parser.add_label(dep) other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] @@ -98,10 +92,8 @@ def main(model=None, output_dir=None, n_iter=100): for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for words, heads, deps in TRAIN_DATA: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - nlp.update([doc], [gold], sgd=optimizer, losses=losses) + for text, annotations in TRAIN_DATA: + nlp.update([text], [annotations], sgd=optimizer, losses=losses) print(losses) # test the trained model @@ -147,6 +139,7 @@ if __name__ == '__main__': # ('find', 'ROOT', 'find'), # ('cheapest', 'QUALITY', 'gym'), # ('gym', 'PLACE', 'find') + # ('work', 'LOCATION', 'near') # ] # show me the best hotel in berlin # [ diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index e95cce4c9..79b74535d 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -8,22 +8,24 @@ For more details, see the documentation: * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities Developed for: spaCy 2.0.0a18 -Last updated for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, 
print_function import plac import random from pathlib import Path - import spacy -from spacy.gold import GoldParse, biluo_tags_from_offsets # training data TRAIN_DATA = [ - ('Who is Shaka Khan?', [(7, 17, 'PERSON')]), - ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) + ('Who is Shaka Khan?', { + 'entities': [(7, 17, 'PERSON')] + }), + ('I like London and Berlin.', { + 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')] + }) ] @@ -45,25 +47,28 @@ def main(model=None, output_dir=None, n_iter=100): if 'ner' not in nlp.pipe_names: ner = nlp.create_pipe('ner') nlp.add_pipe(ner, last=True) + # otherwise, get it so we can add labels + else: + ner = nlp.get_pipe('ner') - # function that allows begin_training to get the training data - get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA) + # add labels + for _, annotations in TRAIN_DATA: + for ent in annotations.get('entities'): + ner.add_label(ent[2]) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] with nlp.disable_pipes(*other_pipes): # only train NER - optimizer = nlp.begin_training(get_data) + optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for raw_text, entity_offsets in TRAIN_DATA: - doc = nlp.make_doc(raw_text) - gold = GoldParse(doc, entities=entity_offsets) + for text, annotations in TRAIN_DATA: nlp.update( - [doc], # Batch of Doc objects - [gold], # Batch of GoldParse objects - drop=0.5, # Dropout -- make it harder to memorise data - sgd=optimizer, # Callable to update weights + [text], # batch of texts + [annotations], # batch of annotations + drop=0.5, # dropout - make it harder to memorise data + sgd=optimizer, # callable to update weights losses=losses) print(losses) @@ -90,25 +95,13 @@ def main(model=None, output_dir=None, n_iter=100): print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) -def reformat_train_data(tokenizer, examples): - """Reformat 
data to match JSON format. - https://alpha.spacy.io/api/annotation#json-input - - tokenizer (Tokenizer): Tokenizer to process the raw text. - examples (list): The trainig data. - RETURNS (list): The reformatted training data.""" - output = [] - for i, (text, entity_offsets) in enumerate(examples): - doc = tokenizer(text) - ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets) - words = [w.text for w in doc] - tags = ['-'] * len(doc) - heads = [0] * len(doc) - deps = [''] * len(doc) - sentence = (range(len(doc)), words, tags, heads, deps, ner_tags) - output.append((text, [(sentence, [])])) - return output - - if __name__ == '__main__': plac.call(main) + + # Expected output: + # Entities [('Shaka Khan', 'PERSON')] + # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), + # ('Khan', 'PERSON', 1), ('?', '', 2)] + # Entities [('London', 'LOC'), ('Berlin', 'LOC')] + # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), + # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)] diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index b43c5b61f..7ce7dc1d5 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -24,16 +24,14 @@ For more details, see the documentation: * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities Developed for: spaCy 2.0.0a18 -Last updated for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, print_function import plac import random from pathlib import Path - import spacy -from spacy.gold import GoldParse, minibatch # new entity label @@ -45,20 +43,29 @@ LABEL = 'ANIMAL' # model might learn the new type, but "forget" what it previously knew. 
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting TRAIN_DATA = [ - ("Horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')]), + ("Horses are too tall and they pretend to care about your feelings", { + 'entities': [(0, 6, 'ANIMAL')] + }), - ("Do they bite?", []), + ("Do they bite?", { + 'entities': [] + }), - ("horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')]), + ("horses are too tall and they pretend to care about your feelings", { + 'entities': [(0, 6, 'ANIMAL')] + }), - ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]), + ("horses pretend to care about your feelings", { + 'entities': [(0, 6, 'ANIMAL')] + }), - ("they pretend to care about your feelings, those horses", - [(48, 54, 'ANIMAL')]), + ("they pretend to care about your feelings, those horses", { + 'entities': [(48, 54, 'ANIMAL')] + }), - ("horses?", [(0, 6, 'ANIMAL')]) + ("horses?", { + 'entities': [(0, 6, 'ANIMAL')] + }) ] @@ -90,15 +97,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] with nlp.disable_pipes(*other_pipes): # only train NER - random.seed(0) optimizer = nlp.begin_training() for itn in range(n_iter): + random.shuffle(TRAIN_DATA) losses = {} - gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA) - for batch in minibatch(gold_parses, size=3): - docs, golds = zip(*batch) - nlp.update(docs, golds, losses=losses, sgd=optimizer, - drop=0.35) + for text, annotations in TRAIN_DATA: + nlp.update([text], [annotations], sgd=optimizer, drop=0.35, + losses=losses) print(losses) # test the trained model @@ -125,19 +130,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50): print(ent.label_, ent.text) -def get_gold_parses(tokenizer, train_data): - """Shuffle and create GoldParse objects. 
- - tokenizer (Tokenizer): Tokenizer to processs the raw text. - train_data (list): The training data. - YIELDS (tuple): (doc, gold) tuples. - """ - random.shuffle(train_data) - for raw_text, entity_offsets in train_data: - doc = tokenizer(raw_text) - gold = GoldParse(doc, entities=entity_offsets) - yield doc, gold - - if __name__ == '__main__': plac.call(main) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 9e1d10414..c19ff7ac1 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -13,24 +13,19 @@ from __future__ import unicode_literals, print_function import plac import random from pathlib import Path - import spacy -from spacy.gold import GoldParse -from spacy.tokens import Doc # training data TRAIN_DATA = [ - ( - ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'], - [1, 1, 4, 4, 5, 1, 1], - ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] - ), - ( - ['I', 'like', 'London', 'and', 'Berlin', '.'], - [1, 1, 1, 2, 2, 1], - ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] - ) + ("They trade mortgage-backed securities.", { + 'heads': [1, 1, 4, 4, 5, 1, 1], + 'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] + }), + ("I like London and Berlin.", { + 'heads': [1, 1, 1, 2, 2, 1], + 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] + }) ] @@ -38,7 +33,7 @@ TRAIN_DATA = [ model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), n_iter=("Number of training iterations", "option", "n", int)) -def main(model=None, output_dir=None, n_iter=1000): +def main(model=None, output_dir=None, n_iter=10): """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model @@ -57,8 +52,8 @@ def main(model=None, output_dir=None, n_iter=1000): parser = nlp.get_pipe('parser') # add labels to the parser - for _, _, deps in TRAIN_DATA: - for dep in deps: + for _, annotations in TRAIN_DATA: + for dep in annotations.get('deps', []): parser.add_label(dep) # get names of other pipes to disable them during training @@ -68,10 +63,8 @@ def main(model=None, output_dir=None, n_iter=1000): for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for words, heads, deps in TRAIN_DATA: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, heads=heads, deps=deps) - nlp.update([doc], [gold], sgd=optimizer, losses=losses) + for text, annotations in TRAIN_DATA: + nlp.update([text], [annotations], sgd=optimizer, losses=losses) print(losses) # test the trained model diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 161f7910c..f1ec17663 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -9,17 +9,14 @@ the documentation: * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging Developed for: spaCy 2.0.0a18 -Last updated for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, print_function import plac import random from pathlib import Path - import spacy -from spacy.tokens import Doc -from spacy.gold import GoldParse # You need to define a mapping from your data's part-of-speech tag names to the @@ -29,16 +26,16 @@ from spacy.gold import GoldParse # You may also specify morphological features 
for your tags, from the universal # scheme. TAG_MAP = { - 'N': {"pos": "NOUN"}, - 'V': {"pos": "VERB"}, - 'J': {"pos": "ADJ"} + 'N': {'pos': 'NOUN'}, + 'V': {'pos': 'VERB'}, + 'J': {'pos': 'ADJ'} } # Usually you'll read this in, of course. Data formats vary. # Ensure your strings are unicode. TRAIN_DATA = [ - (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]), - (["Eat", "blue", "ham"], ["V", "J", "N"]) + ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}), + ("Eat blue ham", {'tags': ['V', 'J', 'N']}) ] @@ -64,10 +61,8 @@ def main(lang='en', output_dir=None, n_iter=25): for i in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} - for words, tags in TRAIN_DATA: - doc = Doc(nlp.vocab, words=words) - gold = GoldParse(doc, tags=tags) - nlp.update([doc], [gold], sgd=optimizer, losses=losses) + for text, annotations in TRAIN_DATA: + nlp.update([text], [annotations], sgd=optimizer, losses=losses) print(losses) # test the trained model diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index d1cf3ab8a..07fba47c6 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -9,7 +9,7 @@ see the documentation: * Text classification: https://alpha.spacy.io/usage/text-classification Developed for: spaCy 2.0.0a18 -Last updated for: spaCy 2.0.0a18 +Last updated for: spaCy 2.0.0a19 """ from __future__ import unicode_literals, print_function import plac @@ -18,9 +18,8 @@ from pathlib import Path import thinc.extra.datasets import spacy -from spacy.gold import GoldParse, minibatch +from spacy.gold import minibatch from spacy.util import compounding -from spacy.pipeline import TextCategorizer @plac.annotations( @@ -52,10 +51,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000): print("Loading IMDB data...") (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts) print("Using %d training examples" % n_texts) - train_docs = [nlp.tokenizer(text) for text in train_texts] - 
train_gold = [GoldParse(doc, cats=cats) for doc, cats in - zip(train_docs, train_cats)] - train_data = list(zip(train_docs, train_gold)) + train_data = list(zip(train_texts, + [{'cats': cats} for cats in train_cats])) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] @@ -68,8 +65,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000): # batch up the examples using spaCy's minibatch batches = minibatch(train_data, size=compounding(4., 32., 1.001)) for batch in batches: - docs, golds = zip(*batch) - nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) + texts, annotations = zip(*batch) + nlp.update(texts, annotations, sgd=optimizer, drop=0.2, + losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)