mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Tmp commit to example
This commit is contained in:
parent
e6ee7e130f
commit
a7626bd7fd
|
@ -1,3 +1,4 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
'''Example of training a named entity recognition system from scratch using spaCy
|
'''Example of training a named entity recognition system from scratch using spaCy
|
||||||
|
|
||||||
This example is written to be self-contained and reasonably transparent.
|
This example is written to be self-contained and reasonably transparent.
|
||||||
|
@ -31,6 +32,8 @@ from spacy.gold import GoldParse
|
||||||
from spacy.gold import _iob_to_biluo as iob_to_biluo
|
from spacy.gold import _iob_to_biluo as iob_to_biluo
|
||||||
from spacy.scorer import Scorer
|
from spacy.scorer import Scorer
|
||||||
|
|
||||||
|
from deepsense import neptune
|
||||||
|
|
||||||
try:
|
try:
|
||||||
unicode
|
unicode
|
||||||
except NameError:
|
except NameError:
|
||||||
|
@ -81,7 +84,7 @@ def load_vocab(path):
|
||||||
def init_ner_model(vocab, features=None):
|
def init_ner_model(vocab, features=None):
|
||||||
if features is None:
|
if features is None:
|
||||||
features = tuple(EntityRecognizer.feature_templates)
|
features = tuple(EntityRecognizer.feature_templates)
|
||||||
return BeamEntityRecognizer(vocab, features=features)
|
return EntityRecognizer(vocab, features=features)
|
||||||
|
|
||||||
|
|
||||||
def save_ner_model(model, path):
|
def save_ner_model(model, path):
|
||||||
|
@ -99,7 +102,7 @@ def save_ner_model(model, path):
|
||||||
|
|
||||||
|
|
||||||
def load_ner_model(vocab, path):
|
def load_ner_model(vocab, path):
|
||||||
return BeamEntityRecognizer.load(path, vocab)
|
return EntityRecognizer.load(path, vocab)
|
||||||
|
|
||||||
|
|
||||||
class Pipeline(object):
|
class Pipeline(object):
|
||||||
|
@ -110,18 +113,21 @@ class Pipeline(object):
|
||||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||||
if not path.is_dir():
|
if not path.is_dir():
|
||||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||||
vocab = load_vocab(path / 'vocab')
|
vocab = load_vocab(path)
|
||||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||||
ner_model = load_ner_model(vocab, path / 'ner')
|
ner_model = load_ner_model(vocab, path / 'ner')
|
||||||
return cls(vocab, tokenizer, ner_model)
|
return cls(vocab, tokenizer, ner_model)
|
||||||
|
|
||||||
def __init__(self, vocab=None, tokenizer=None, ner_model=None):
|
def __init__(self, vocab=None, tokenizer=None, entity=None):
|
||||||
if vocab is None:
|
if vocab is None:
|
||||||
self.vocab = init_vocab()
|
vocab = init_vocab()
|
||||||
if tokenizer is None:
|
if tokenizer is None:
|
||||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||||
if ner_model is None:
|
if entity is None:
|
||||||
self.entity = init_ner_model(self.vocab)
|
entity = init_ner_model(self.vocab)
|
||||||
|
self.vocab = vocab
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.entity = entity
|
||||||
self.pipeline = [self.entity]
|
self.pipeline = [self.entity]
|
||||||
|
|
||||||
def __call__(self, input_):
|
def __call__(self, input_):
|
||||||
|
@ -173,7 +179,25 @@ class Pipeline(object):
|
||||||
save_ner_model(self.entity, path / 'ner')
|
save_ner_model(self.entity, path / 'ner')
|
||||||
|
|
||||||
|
|
||||||
def train(nlp, train_examples, dev_examples, nr_epoch=5):
|
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
|
||||||
|
channels = {}
|
||||||
|
channels['loss'] = ctx.job.create_channel(
|
||||||
|
name='loss',
|
||||||
|
channel_type=neptune.ChannelType.NUMERIC)
|
||||||
|
|
||||||
|
channels['f'] = ctx.job.create_channel(
|
||||||
|
name='F-Measure',
|
||||||
|
channel_type=neptune.ChannelType.NUMERIC)
|
||||||
|
channels['p'] = ctx.job.create_channel(
|
||||||
|
name='Precision',
|
||||||
|
channel_type=neptune.ChannelType.NUMERIC)
|
||||||
|
channels['r'] = ctx.job.create_channel(
|
||||||
|
name='Recall',
|
||||||
|
channel_type=neptune.ChannelType.NUMERIC)
|
||||||
|
channels['log'] = ctx.job.create_channel(
|
||||||
|
name='logs',
|
||||||
|
channel_type=neptune.ChannelType.TEXT)
|
||||||
|
|
||||||
next_epoch = train_examples
|
next_epoch = train_examples
|
||||||
print("Iter", "Loss", "P", "R", "F")
|
print("Iter", "Loss", "P", "R", "F")
|
||||||
for i in range(nr_epoch):
|
for i in range(nr_epoch):
|
||||||
|
@ -186,14 +210,25 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
|
||||||
next_epoch.append((input_, annot))
|
next_epoch.append((input_, annot))
|
||||||
random.shuffle(next_epoch)
|
random.shuffle(next_epoch)
|
||||||
scores = nlp.evaluate(dev_examples)
|
scores = nlp.evaluate(dev_examples)
|
||||||
precision = '%.2f' % scores['ents_p']
|
report_scores(channels, i, loss, scores)
|
||||||
recall = '%.2f' % scores['ents_r']
|
|
||||||
f_measure = '%.2f' % scores['ents_f']
|
|
||||||
print(i, int(loss), precision, recall, f_measure)
|
|
||||||
nlp.average_weights()
|
nlp.average_weights()
|
||||||
scores = nlp.evaluate(dev_examples)
|
scores = nlp.evaluate(dev_examples)
|
||||||
print("After averaging")
|
report_scores(channels, i+1, loss, scores)
|
||||||
print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
|
|
||||||
|
|
||||||
|
def report_scores(channels, i, loss, scores):
|
||||||
|
precision = '%.2f' % scores['ents_p']
|
||||||
|
recall = '%.2f' % scores['ents_r']
|
||||||
|
f_measure = '%.2f' % scores['ents_f']
|
||||||
|
print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
|
||||||
|
channels['log'].send(x=i, y='%d %s %s %s' % (int(loss), precision, recall,
|
||||||
|
f_measure))
|
||||||
|
channels['f'].send(x=i, y=scores['ents_f'])
|
||||||
|
channels['p'].send(x=i, y=scores['ents_p'])
|
||||||
|
channels['r'].send(x=i, y=scores['ents_r'])
|
||||||
|
channels['loss'].send(x=i, y=loss)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def read_examples(path):
|
def read_examples(path):
|
||||||
|
@ -221,15 +256,22 @@ def read_examples(path):
|
||||||
train_loc=("Path to your training data", "positional", None, Path),
|
train_loc=("Path to your training data", "positional", None, Path),
|
||||||
dev_loc=("Path to your development data", "positional", None, Path),
|
dev_loc=("Path to your development data", "positional", None, Path),
|
||||||
)
|
)
|
||||||
def main(model_dir, train_loc, dev_loc, nr_epoch=10):
|
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
|
||||||
|
train_loc=None, dev_loc=None, nr_epoch=30):
|
||||||
|
ctx = neptune.Context()
|
||||||
|
|
||||||
|
train_loc = Path(ctx.params.train_loc)
|
||||||
|
dev_loc = Path(ctx.params.dev_loc)
|
||||||
|
model_dir = model_dir.resolve()
|
||||||
|
|
||||||
train_examples = read_examples(train_loc)
|
train_examples = read_examples(train_loc)
|
||||||
dev_examples = read_examples(dev_loc)
|
dev_examples = read_examples(dev_loc)
|
||||||
nlp = Pipeline()
|
nlp = Pipeline.load(model_dir)
|
||||||
|
|
||||||
train(nlp, train_examples, list(dev_examples), nr_epoch)
|
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
|
||||||
|
|
||||||
nlp.save(model_dir)
|
nlp.save(model_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
main()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user