mirror of https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00

commit fe498b3d5e (parent 906aece532)

    Update training examples to use "simple style"
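Every hunk in this commit applies the same transformation: instead of constructing Doc and GoldParse objects by hand, the example scripts now pass raw texts and annotation dicts straight to nlp.update. A minimal before/after sketch, using data taken from the parser hunks further down (spaCy 2.0 alpha API, as shown in this diff; nlp, optimizer and losses are the variables already in scope in each script):

    # old style: build Doc and GoldParse objects manually
    doc = Doc(nlp.vocab, words=['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'])
    gold = GoldParse(doc, heads=[1, 1, 4, 4, 5, 1, 1],
                     deps=['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'])
    nlp.update([doc], [gold], sgd=optimizer, losses=losses)

    # new "simple style": raw text plus a dict of annotations
    nlp.update(["They trade mortgage-backed securities."],
               [{'heads': [1, 1, 4, 4, 5, 1, 1],
                 'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']}],
               sgd=optimizer, losses=losses)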
@@ -14,55 +14,49 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc
 from pathlib import Path


-# training data: words, head and dependency labels
+# training data: texts, heads and dependency labels
 # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 TRAIN_DATA = [
-    (
-        ['find', 'a', 'cafe', 'with', 'great', 'wifi'],
-        [0, 2, 0, 5, 5, 2],  # index of token head
-        ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'a', 'hotel', 'near', 'the', 'beach'],
-        [0, 2, 0, 5, 5, 2],
-        ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
-    ),
-    (
-        ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'],
-        [0, 0, 4, 4, 0, 6, 4, 6, 6],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
-    ),
-    (
-        ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'],
-        [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
-    ),
-    (
-        ['find', 'a', 'nice', 'restaurant', 'in', 'london'],
-        [0, 3, 3, 0, 3, 3],
-        ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'],
-        [0, 0, 4, 4, 0, 4, 4],
-        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
-    ),
-    (
-        ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'],
-        [0, 4, 4, 4, 0, 4, 5],
-        ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
-    )
+    ("find a cafe with great wifi", {
+        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
+        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
+    }),
+    ("find a hotel near the beach", {
+        'heads': [0, 2, 0, 5, 5, 2],
+        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
+    }),
+    ("find me the closest gym that's open late", {
+        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
+    }),
+    ("show me the cheapest store that sells flowers", {
+        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
+    }),
+    ("find a nice restaurant in london", {
+        'heads': [0, 3, 3, 0, 3, 3],
+        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("show me the coolest hostel in berlin", {
+        'heads': [0, 0, 4, 4, 0, 4, 4],
+        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
+    }),
+    ("find a good italian restaurant near work", {
+        'heads': [0, 4, 4, 4, 0, 4, 5],
+        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
+    })
 ]

@@ -88,8 +82,8 @@ def main(model=None, output_dir=None, n_iter=100):
     else:
         parser = nlp.get_pipe('parser')

-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for text, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)

     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
@@ -98,10 +92,8 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, heads, deps in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)

     # test the trained model
@@ -147,6 +139,7 @@ if __name__ == '__main__':
     # ('find', 'ROOT', 'find'),
     # ('cheapest', 'QUALITY', 'gym'),
     # ('gym', 'PLACE', 'find')
+    # ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
     # [
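The four hunks above rewrite the intent-parser example (the QUALITY/PLACE/ATTRIBUTE relation scheme suggests examples/training/train_intent_parser.py; the filename is not shown in this view). The expected-output comments correspond to reading the learned relations back off a parse, roughly as follows. A sketch only: the test sentence is inferred from the comments, and nlp is the pipeline trained by the script.

    doc = nlp("find me the cheapest gym near work")
    print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
    # e.g. [('find', 'ROOT', 'find'), ('cheapest', 'QUALITY', 'gym'),
    #       ('gym', 'PLACE', 'find'), ('work', 'LOCATION', 'near')]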
@@ -8,22 +8,24 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
 from pathlib import Path

 import spacy
-from spacy.gold import GoldParse, biluo_tags_from_offsets


 # training data
 TRAIN_DATA = [
-    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
-    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
+    ('Who is Shaka Khan?', {
+        'entities': [(7, 17, 'PERSON')]
+    }),
+    ('I like London and Berlin.', {
+        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
+    })
 ]

@@ -45,25 +47,28 @@ def main(model=None, output_dir=None, n_iter=100):
     if 'ner' not in nlp.pipe_names:
         ner = nlp.create_pipe('ner')
         nlp.add_pipe(ner, last=True)
+    # otherwise, get it so we can add labels
+    else:
+        ner = nlp.get_pipe('ner')

-    # function that allows begin_training to get the training data
-    get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA)
+    # add labels
+    for _, annotations in TRAIN_DATA:
+        for ent in annotations.get('entities'):
+            ner.add_label(ent[2])

     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        optimizer = nlp.begin_training(get_data)
+        optimizer = nlp.begin_training()
         for itn in range(n_iter):
             random.shuffle(TRAIN_DATA)
             losses = {}
-            for raw_text, entity_offsets in TRAIN_DATA:
-                doc = nlp.make_doc(raw_text)
-                gold = GoldParse(doc, entities=entity_offsets)
+            for text, annotations in TRAIN_DATA:
                 nlp.update(
-                    [doc],  # Batch of Doc objects
-                    [gold],  # Batch of GoldParse objects
-                    drop=0.5,  # Dropout -- make it harder to memorise data
-                    sgd=optimizer,  # Callable to update weights
+                    [text],  # batch of texts
+                    [annotations],  # batch of annotations
+                    drop=0.5,  # dropout - make it harder to memorise data
+                    sgd=optimizer,  # callable to update weights
                     losses=losses)
             print(losses)

@@ -90,25 +95,13 @@ def main(model=None, output_dir=None, n_iter=100):
         print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])


-def reformat_train_data(tokenizer, examples):
-    """Reformat data to match JSON format.
-    https://alpha.spacy.io/api/annotation#json-input
-
-    tokenizer (Tokenizer): Tokenizer to process the raw text.
-    examples (list): The trainig data.
-    RETURNS (list): The reformatted training data."""
-    output = []
-    for i, (text, entity_offsets) in enumerate(examples):
-        doc = tokenizer(text)
-        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
-        words = [w.text for w in doc]
-        tags = ['-'] * len(doc)
-        heads = [0] * len(doc)
-        deps = [''] * len(doc)
-        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
-        output.append((text, [(sentence, [])]))
-    return output
-
-
 if __name__ == '__main__':
     plac.call(main)

+    # Expected output:
+    # Entities [('Shaka Khan', 'PERSON')]
+    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
+    # ('Khan', 'PERSON', 1), ('?', '', 2)]
+    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
+    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
+    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
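These hunks rewrite the NER example (likely examples/training/train_ner.py). The reformat_train_data helper and the biluo_tags_from_offsets conversion disappear entirely: the simple style accepts the character-offset entity annotations as-is. The offsets index into the raw string, which is easy to sanity-check; my sketch, not part of the commit:

    text = 'Who is Shaka Khan?'
    for start, end, label in [(7, 17, 'PERSON')]:
        assert text[start:end] == 'Shaka Khan'  # offsets are character indices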
@@ -24,16 +24,14 @@ For more details, see the documentation:
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
 from pathlib import Path

 import spacy
-from spacy.gold import GoldParse, minibatch
-

 # new entity label
@@ -45,20 +43,29 @@ LABEL = 'ANIMAL'
 # model might learn the new type, but "forget" what it previously knew.
 # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
 TRAIN_DATA = [
-    ("Horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("Horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),

-    ("Do they bite?", []),
+    ("Do they bite?", {
+        'entities': []
+    }),

-    ("horses are too tall and they pretend to care about your feelings",
-     [(0, 6, 'ANIMAL')]),
+    ("horses are too tall and they pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),

-    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
+    ("horses pretend to care about your feelings", {
+        'entities': [(0, 6, 'ANIMAL')]
+    }),

-    ("they pretend to care about your feelings, those horses",
-     [(48, 54, 'ANIMAL')]),
+    ("they pretend to care about your feelings, those horses", {
+        'entities': [(48, 54, 'ANIMAL')]
+    }),

-    ("horses?", [(0, 6, 'ANIMAL')])
+    ("horses?", {
+        'entities': [(0, 6, 'ANIMAL')]
+    })
 ]

@@ -90,15 +97,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
     with nlp.disable_pipes(*other_pipes):  # only train NER
-        random.seed(0)
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
             losses = {}
-            gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
-            for batch in minibatch(gold_parses, size=3):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, losses=losses, sgd=optimizer,
-                           drop=0.35)
+            for text, annotations in TRAIN_DATA:
+                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+                           losses=losses)
             print(losses)

     # test the trained model
@@ -125,19 +130,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
         print(ent.label_, ent.text)


-def get_gold_parses(tokenizer, train_data):
-    """Shuffle and create GoldParse objects.
-
-    tokenizer (Tokenizer): Tokenizer to processs the raw text.
-    train_data (list): The training data.
-    YIELDS (tuple): (doc, gold) tuples.
-    """
-    random.shuffle(train_data)
-    for raw_text, entity_offsets in train_data:
-        doc = tokenizer(raw_text)
-        gold = GoldParse(doc, entities=entity_offsets)
-        yield doc, gold
-
-
 if __name__ == '__main__':
     plac.call(main)
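In the new-entity-type example above (the ANIMAL label suggests examples/training/train_new_entity_type.py), the rewrite also drops get_gold_parses and the explicit minibatching in favour of one update per example. Batching still works with the simple style; a sketch of the equivalent batched loop, modelled on the textcat hunks further down, with TRAIN_DATA, nlp, optimizer and losses as in this script:

    from spacy.gold import minibatch  # same import the textcat example uses

    for batch in minibatch(TRAIN_DATA, size=3):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)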
@@ -13,24 +13,19 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path

 import spacy
-from spacy.gold import GoldParse
-from spacy.tokens import Doc


 # training data
 TRAIN_DATA = [
-    (
-        ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
-        [1, 1, 4, 4, 5, 1, 1],
-        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-    ),
-    (
-        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-        [1, 1, 1, 2, 2, 1],
-        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-    )
+    ("They trade mortgage-backed securities.", {
+        'heads': [1, 1, 4, 4, 5, 1, 1],
+        'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+    }),
+    ("I like London and Berlin.", {
+        'heads': [1, 1, 1, 2, 2, 1],
+        'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
+    })
 ]

@@ -38,7 +33,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=1000):
+def main(model=None, output_dir=None, n_iter=10):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -57,8 +52,8 @@ def main(model=None, output_dir=None, n_iter=1000):
     parser = nlp.get_pipe('parser')

     # add labels to the parser
-    for _, _, deps in TRAIN_DATA:
-        for dep in deps:
+    for _, annotations in TRAIN_DATA:
+        for dep in annotations.get('deps', []):
             parser.add_label(dep)

     # get names of other pipes to disable them during training
@@ -68,10 +63,8 @@ def main(model=None, output_dir=None, n_iter=1000):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, heads, deps in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)

     # test the trained model
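One consequence of the simple style, visible in this parser example (likely examples/training/train_parser.py): the old code supplied the token list explicitly via Doc(nlp.vocab, words=words), while the new code lets spaCy tokenize the raw text, so the gold 'heads' and 'deps' must line up with spaCy's own tokenization. A quick alignment check; my sketch, not in the commit:

    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)  # tokenize only, no pipeline components
        assert len(doc) == len(annotations['heads']) == len(annotations['deps'])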
@@ -9,17 +9,14 @@ the documentation:
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging

 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function

 import plac
 import random
 from pathlib import Path

 import spacy
-from spacy.tokens import Doc
-from spacy.gold import GoldParse
-

 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -29,16 +26,16 @@ from spacy.gold import GoldParse
 # You may also specify morphological features for your tags, from the universal
 # scheme.
 TAG_MAP = {
-    'N': {"pos": "NOUN"},
-    'V': {"pos": "VERB"},
-    'J': {"pos": "ADJ"}
+    'N': {'pos': 'NOUN'},
+    'V': {'pos': 'VERB'},
+    'J': {'pos': 'ADJ'}
 }

 # Usually you'll read this in, of course. Data formats vary.
 # Ensure your strings are unicode.
 TRAIN_DATA = [
-    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
-    (["Eat", "blue", "ham"], ["V", "J", "N"])
+    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
+    ("Eat blue ham", {'tags': ['V', 'J', 'N']})
 ]

@@ -64,10 +61,8 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for words, tags in TRAIN_DATA:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, tags=tags)
-            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
         print(losses)

     # test the trained model
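In the tagger example (likely examples/training/train_tagger.py), the pre-tokenized word lists become plain strings and TAG_MAP's quoting is normalized. For these two sentences, whitespace-splitting happens to match spaCy's tokenization, which is why the per-token tag lists still line up. A check along these lines, with the file's TRAIN_DATA in scope (my sketch, and it assumes the training texts contain no punctuation that spaCy would split off):

    for text, annotations in TRAIN_DATA:
        assert len(text.split()) == len(annotations['tags'])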
@@ -9,7 +9,7 @@ see the documentation:
 * Text classification: https://alpha.spacy.io/usage/text-classification

 Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a19
 """
 from __future__ import unicode_literals, print_function
 import plac
@@ -18,9 +18,8 @@ from pathlib import Path
 import thinc.extra.datasets

 import spacy
-from spacy.gold import GoldParse, minibatch
+from spacy.gold import minibatch
 from spacy.util import compounding
-from spacy.pipeline import TextCategorizer


 @plac.annotations(
@@ -52,10 +51,8 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     print("Loading IMDB data...")
     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     print("Using %d training examples" % n_texts)
-    train_docs = [nlp.tokenizer(text) for text in train_texts]
-    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
-                  zip(train_docs, train_cats)]
-    train_data = list(zip(train_docs, train_gold))
+    train_data = list(zip(train_texts,
+                          [{'cats': cats} for cats in train_cats]))

     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
@@ -68,8 +65,9 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(train_data, size=compounding(4., 32., 1.001))
             for batch in batches:
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
+                texts, annotations = zip(*batch)
+                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
+                           losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
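The text-classification example (likely examples/training/train_textcat.py) keeps its minibatched loop; only the batch contents change, from (Doc, GoldParse) pairs to (text, {'cats': ...}) pairs. The compounding(4., 32., 1.001) generator it feeds to minibatch yields batch sizes that grow geometrically from 4 toward 32:

    from spacy.util import compounding

    sizes = compounding(4., 32., 1.001)
    print(next(sizes), next(sizes), next(sizes))  # 4.0, 4.004, 4.008004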