Update training examples to use "simple style"

ines 2017-11-06 23:14:04 +01:00
parent 906aece532
commit fe498b3d5e
6 changed files with 116 additions and 153 deletions
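The pattern across all six files: the manual Doc/GoldParse construction is dropped, and raw texts paired with plain annotation dicts go straight to nlp.update(). A minimal before/after sketch of the change the diffs apply (variable names as in the scripts; the three-token example is shortened for illustration):

    # before: build Doc and GoldParse objects by hand
    doc = Doc(nlp.vocab, words=['find', 'a', 'cafe'])
    gold = GoldParse(doc, heads=[0, 2, 0], deps=['ROOT', '-', 'PLACE'])
    nlp.update([doc], [gold], sgd=optimizer, losses=losses)

    # after ("simple style"): raw text plus an annotation dict
    nlp.update(['find a cafe'],
               [{'heads': [0, 2, 0], 'deps': ['ROOT', '-', 'PLACE']}],
               sgd=optimizer, losses=losses)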

View File

@@ -14,55 +14,49 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 from pathlib import Path
-# training data: words, head and dependency labels
+# training data: texts, heads and dependency labels
 # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 TRAIN_DATA = [
-    (
-        ['find', 'a', 'cafe', 'with', 'great', 'wifi'],
-        [0, 2, 0, 5, 5, 2],  # index of token head
-        ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'a', 'hotel', 'near', 'the', 'beach'],
-        [0, 2, 0, 5, 5, 2],
-        ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'],
-        [0, 0, 4, 4, 0, 6, 4, 6, 6],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
-    ),
-    (
-        ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'],
-        [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
-    ),
-    (
-        ['find', 'a', 'nice', 'restaurant', 'in', 'london'],
-        [0, 3, 3, 0, 3, 3],
-        ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'],
-        [0, 0, 4, 4, 0, 4, 4],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'],
-        [0, 4, 4, 4, 0, 4, 5],
-        ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
-    )
+    ("find a cafe with great wifi", {
+        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
+        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
+    }),
+    ("find a hotel near the beach", {
+        'heads': [0, 2, 0, 5, 5, 2],
+        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
+    }),
+    ("find me the closest gym that's open late", {
+        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
+    }),
+    ("show me the cheapest store that sells flowers", {
+        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
+    }),
+    ("find a nice restaurant in london", {
+        'heads': [0, 3, 3, 0, 3, 3],
+        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("show me the coolest hostel in berlin", {
+        'heads': [0, 0, 4, 4, 0, 4, 4],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("find a good italian restaurant near work", {
+        'heads': [0, 4, 4, 4, 0, 4, 5],
+        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
+    })
 ]
@@ -88,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=100):
     else:
         parser = nlp.get_pipe('parser')
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for text, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
@@ -98,10 +92,8 @@ def main(model=None, output_dir=None, n_iter=100):
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for words, heads, deps in TRAIN_DATA:
-                doc = Doc(nlp.vocab, words=words)
-                gold = GoldParse(doc, heads=heads, deps=deps)
-                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
             print(losses)
     # test the trained model
@@ -147,6 +139,7 @@ if __name__ == '__main__':
     # ('find', 'ROOT', 'find'),
     # ('cheapest', 'QUALITY', 'gym'),
    # ('gym', 'PLACE', 'find')
+    # ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
     # [
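In the new format, each value in 'heads' is the absolute index of that token's head, and a token that points to itself is the root. A quick plain-Python sketch that decodes the first training example above (split() stands in for the tokenizer, which is safe here because the text contains no punctuation):

    text = "find a cafe with great wifi"
    heads = [0, 2, 0, 5, 5, 2]
    deps = ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
    words = text.split()
    for i, (head, dep) in enumerate(zip(heads, deps)):
        print('%s -> %s (%s)' % (words[i], words[head], dep))
    # find -> find (ROOT)
    # a -> cafe (-)
    # cafe -> find (PLACE)
    # with -> wifi (-)
    # great -> wifi (QUALITY)
    # wifi -> cafe (ATTRIBUTE)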

View File

@@ -8,22 +8,24 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.gold import GoldParse, biluo_tags_from_offsets
 # training data
 TRAIN_DATA = [
-    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
-    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
+    ('Who is Shaka Khan?', {
+        'entities': [(7, 17, 'PERSON')]
+    }),
+    ('I like London and Berlin.', {
+        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
+    })
 ]
@@ -45,25 +47,28 @@ def main(model=None, output_dir=None, n_iter=100):
     if 'ner' not in nlp.pipe_names:
         ner = nlp.create_pipe('ner')
         nlp.add_pipe(ner, last=True)
-    # function that allows begin_training to get the training data
-    get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA)
+    # otherwise, get it so we can add labels
+    else:
+        ner = nlp.get_pipe('ner')
+    # add labels
+    for _, annotations in TRAIN_DATA:
+        for ent in annotations.get('entities'):
+            ner.add_label(ent[2])
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        optimizer = nlp.begin_training(get_data)
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for raw_text, entity_offsets in TRAIN_DATA:
-                doc = nlp.make_doc(raw_text)
-                gold = GoldParse(doc, entities=entity_offsets)
+            for text, annotations in TRAIN_DATA:
                 nlp.update(
-                    [doc],  # Batch of Doc objects
-                    [gold],  # Batch of GoldParse objects
-                    drop=0.5,  # Dropout -- make it harder to memorise data
-                    sgd=optimizer,  # Callable to update weights
+                    [text],  # batch of texts
+                    [annotations],  # batch of annotations
+                    drop=0.5,  # dropout - make it harder to memorise data
+                    sgd=optimizer,  # callable to update weights
                     losses=losses)
             print(losses)
@@ -90,25 +95,13 @@ def main(model=None, output_dir=None, n_iter=100):
         print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
-def reformat_train_data(tokenizer, examples):
-    """Reformat data to match JSON format.
-    https://alpha.spacy.io/api/annotation#json-input
-    tokenizer (Tokenizer): Tokenizer to process the raw text.
-    examples (list): The trainig data.
-    RETURNS (list): The reformatted training data."""
-    output = []
-    for i, (text, entity_offsets) in enumerate(examples):
-        doc = tokenizer(text)
-        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
-        words = [w.text for w in doc]
-        tags = ['-'] * len(doc)
-        heads = [0] * len(doc)
-        deps = [''] * len(doc)
-        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
-        output.append((text, [(sentence, [])]))
-    return output
 if __name__ == '__main__':
     plac.call(main)
+    # Expected output:
+    # Entities [('Shaka Khan', 'PERSON')]
+    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
+    # ('Khan', 'PERSON', 1), ('?', '', 2)]
+    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
+    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
+    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
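The (start, end, label) triples in 'entities' are character offsets into the raw text, end-exclusive, so an example can be sanity-checked with plain slicing (no spaCy required):

    text, annotations = ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]})
    for start, end, label in annotations['entities']:
        print(text[start:end], '->', label)  # Shaka Khan -> PERSON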

View File

@@ -24,16 +24,14 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.gold import GoldParse, minibatch
 # new entity label
@@ -45,20 +43,29 @@ LABEL = 'ANIMAL'
 # model might learn the new type, but "forget" what it previously knew.
 # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
 TRAIN_DATA = [
-    ("Horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
-    ("Do they bite?", []),
-    ("horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
-    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
-    ("they pretend to care about your feelings, those horses",
-     [(48, 54, 'ANIMAL')]),
-    ("horses?", [(0, 6, 'ANIMAL')])
+    ("Horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
+    ("Do they bite?", {
+        'entities': []
+    }),
+    ("horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
+    ("horses pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),
+    ("they pretend to care about your feelings, those horses", {
+        'entities': [(48, 54, 'ANIMAL')]
+    }),
+    ("horses?", {
+        'entities': [(0, 6, 'ANIMAL')]
+    })
 ]
@@ -90,15 +97,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        random.seed(0)
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
             losses = {}
-            gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
-            for batch in minibatch(gold_parses, size=3):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, losses=losses, sgd=optimizer,
-                           drop=0.35)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+                           losses=losses)
             print(losses)
     # test the trained model
@@ -125,19 +130,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
             print(ent.label_, ent.text)
-def get_gold_parses(tokenizer, train_data):
-    """Shuffle and create GoldParse objects.
-    tokenizer (Tokenizer): Tokenizer to processs the raw text.
-    train_data (list): The training data.
-    YIELDS (tuple): (doc, gold) tuples.
-    """
-    random.shuffle(train_data)
-    for raw_text, entity_offsets in train_data:
-        doc = tokenizer(raw_text)
-        gold = GoldParse(doc, entities=entity_offsets)
-        yield doc, gold
 if __name__ == '__main__':
     plac.call(main)

View File

@@ -13,24 +13,19 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 # training data
 TRAIN_DATA = [
-    (
-        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-        [1, 1, 4, 4, 5, 1, 1],
-        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-    ),
-    (
-        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-        [1, 1, 1, 2, 2, 1],
-        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-    )
+    ("They trade mortgage-backed securities.", {
+        'heads': [1, 1, 4, 4, 5, 1, 1],
+        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    }),
+    ("I like London and Berlin.", {
+        'heads': [1, 1, 1, 2, 2, 1],
+        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    })
 ]
@@ -38,7 +33,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=1000):
+def main(model=None, output_dir=None, n_iter=10):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -57,8 +52,8 @@ def main(model=None, output_dir=None, n_iter=1000):
         parser = nlp.get_pipe('parser')
     # add labels to the parser
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for _, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)
     # get names of other pipes to disable them during training
@@ -68,10 +63,8 @@ def main(model=None, output_dir=None, n_iter=1000):
         for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
-            for words, heads, deps in TRAIN_DATA:
-                doc = Doc(nlp.vocab, words=words)
-                gold = GoldParse(doc, heads=heads, deps=deps)
-                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
            print(losses)
     # test the trained model

View File

@@ -9,17 +9,14 @@ the documentation:
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.tokens import Doc
-from spacy.gold import GoldParse
 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -29,16 +26,16 @@ from spacy.gold import GoldParse
 # You may also specify morphological features for your tags, from the universal
 # scheme.
 TAG_MAP = {
-    'N': {"pos": "NOUN"},
-    'V': {"pos": "VERB"},
-    'J': {"pos": "ADJ"}
+    'N': {'pos': 'NOUN'},
+    'V': {'pos': 'VERB'},
+    'J': {'pos': 'ADJ'}
 }
 # Usually you'll read this in, of course. Data formats vary.
 # Ensure your strings are unicode.
 TRAIN_DATA = [
-    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
-    (["Eat", "blue", "ham"], ["V", "J", "N"])
+    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
+    ("Eat blue ham", {'tags': ['V', 'J', 'N']})
 ]
@@ -64,10 +61,8 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, tags in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, tags=tags)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)
     # test the trained model
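A practical consequence of dropping the word lists: the tags now have to line up one-to-one with whatever the tokenizer produces for the raw text. A small sanity check, assuming an nlp object as created in the script:

    text, annotations = ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']})
    doc = nlp.make_doc(text)  # tokenizes only, without running the pipeline
    assert len(doc) == len(annotations['tags']), 'expected one tag per token'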

View File

@@ -9,7 +9,7 @@ see the documentation:
 * Text classification: https://alpha.spacy.io/usage/text-classification
 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
@@ -18,9 +18,8 @@ from pathlib import Path
 import thinc.extra.datasets
 import spacy
-from spacy.gold import GoldParse, minibatch
+from spacy.gold import minibatch
 from spacy.util import compounding
-from spacy.pipeline import TextCategorizer
 @plac.annotations(
@@ -52,10 +51,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     print("Loading IMDB data...")
     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     print("Using %d training examples" % n_texts)
-    train_docs = [nlp.tokenizer(text) for text in train_texts]
-    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
-                  zip(train_docs, train_cats)]
-    train_data = list(zip(train_docs, train_gold))
+    train_data = list(zip(train_texts,
+                          [{'cats': cats} for cats in train_cats]))
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
@@ -68,8 +65,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
         # batch up the examples using spaCy's minibatch
         batches = minibatch(train_data, size=compounding(4., 32., 1.001))
         for batch in batches:
-            docs, golds = zip(*batch)
-            nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
+                       losses=losses)
         with textcat.model.use_params(optimizer.averages):
             # evaluate on the dev data split off in load_data()
             scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
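For text classification, the simple-style annotation dict carries a 'cats' key mapping each label to a boolean (or a score), so a single training pair looks roughly like the sketch below; the 'POSITIVE' label is illustrative, since the real category names come from load_data():

    example = ('This movie was great', {'cats': {'POSITIVE': True}})
    texts, annotations = zip(*[example])  # the same shapes nlp.update() expects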