mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Update train_ner_standalone example
This commit is contained in:
parent
d84607f6bb
commit
027a5d8b75
|
@ -13,24 +13,27 @@ Input data:
|
||||||
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
|
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
|
||||||
|
|
||||||
Developed for: spaCy 1.7.1
|
Developed for: spaCy 1.7.1
|
||||||
Last tested for: spaCy 1.7.1
|
Last tested for: spaCy 2.0.0a13
|
||||||
'''
|
'''
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import random
|
import random
|
||||||
import json
|
import json
|
||||||
|
from thinc.neural.optimizers import Adam
|
||||||
|
from thinc.neural.ops import NumpyOps
|
||||||
|
import tqdm
|
||||||
|
|
||||||
import spacy.orth as orth_funcs
|
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.pipeline import BeamEntityRecognizer
|
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
|
||||||
from spacy.pipeline import EntityRecognizer
|
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.attrs import *
|
from spacy.attrs import *
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
from spacy.gold import _iob_to_biluo as iob_to_biluo
|
from spacy.gold import iob_to_biluo
|
||||||
|
from spacy.gold import minibatch
|
||||||
from spacy.scorer import Scorer
|
from spacy.scorer import Scorer
|
||||||
|
import spacy.util
|
||||||
|
|
||||||
try:
|
try:
|
||||||
unicode
|
unicode
|
||||||
|
@ -38,95 +41,40 @@ except NameError:
|
||||||
unicode = str
|
unicode = str
|
||||||
|
|
||||||
|
|
||||||
|
spacy.util.set_env_log(True)
|
||||||
|
|
||||||
|
|
||||||
def init_vocab():
|
def init_vocab():
|
||||||
return Vocab(
|
return Vocab(
|
||||||
lex_attr_getters={
|
lex_attr_getters={
|
||||||
LOWER: lambda string: string.lower(),
|
LOWER: lambda string: string.lower(),
|
||||||
SHAPE: orth_funcs.word_shape,
|
NORM: lambda string: string.lower(),
|
||||||
PREFIX: lambda string: string[0],
|
PREFIX: lambda string: string[0],
|
||||||
SUFFIX: lambda string: string[-3:],
|
SUFFIX: lambda string: string[-3:],
|
||||||
CLUSTER: lambda string: 0,
|
|
||||||
IS_ALPHA: orth_funcs.is_alpha,
|
|
||||||
IS_ASCII: orth_funcs.is_ascii,
|
|
||||||
IS_DIGIT: lambda string: string.isdigit(),
|
|
||||||
IS_LOWER: orth_funcs.is_lower,
|
|
||||||
IS_PUNCT: orth_funcs.is_punct,
|
|
||||||
IS_SPACE: lambda string: string.isspace(),
|
|
||||||
IS_TITLE: orth_funcs.is_title,
|
|
||||||
IS_UPPER: orth_funcs.is_upper,
|
|
||||||
IS_STOP: lambda string: False,
|
|
||||||
IS_OOV: lambda string: True
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
def save_vocab(vocab, path):
|
|
||||||
path = Path(path)
|
|
||||||
if not path.exists():
|
|
||||||
path.mkdir()
|
|
||||||
elif not path.is_dir():
|
|
||||||
raise IOError("Can't save vocab to %s\nNot a directory" % path)
|
|
||||||
with (path / 'strings.json').open('w') as file_:
|
|
||||||
vocab.strings.dump(file_)
|
|
||||||
vocab.dump((path / 'lexemes.bin').as_posix())
|
|
||||||
|
|
||||||
|
|
||||||
def load_vocab(path):
|
|
||||||
path = Path(path)
|
|
||||||
if not path.exists():
|
|
||||||
raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
|
|
||||||
if not path.is_dir():
|
|
||||||
raise IOError("Cannot load vocab from %s\nNot a directory" % path)
|
|
||||||
return Vocab.load(path)
|
|
||||||
|
|
||||||
|
|
||||||
def init_ner_model(vocab, features=None):
|
|
||||||
if features is None:
|
|
||||||
features = tuple(EntityRecognizer.feature_templates)
|
|
||||||
return EntityRecognizer(vocab, features=features)
|
|
||||||
|
|
||||||
|
|
||||||
def save_ner_model(model, path):
|
|
||||||
path = Path(path)
|
|
||||||
if not path.exists():
|
|
||||||
path.mkdir()
|
|
||||||
if not path.is_dir():
|
|
||||||
raise IOError("Can't save model to %s\nNot a directory" % path)
|
|
||||||
model.model.dump((path / 'model').as_posix())
|
|
||||||
with (path / 'config.json').open('w') as file_:
|
|
||||||
data = json.dumps(model.cfg)
|
|
||||||
if not isinstance(data, unicode):
|
|
||||||
data = data.decode('utf8')
|
|
||||||
file_.write(data)
|
|
||||||
|
|
||||||
|
|
||||||
def load_ner_model(vocab, path):
|
|
||||||
return EntityRecognizer.load(path, vocab)
|
|
||||||
|
|
||||||
|
|
||||||
class Pipeline(object):
|
class Pipeline(object):
|
||||||
@classmethod
|
def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
|
||||||
def load(cls, path):
|
|
||||||
path = Path(path)
|
|
||||||
if not path.exists():
|
|
||||||
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
|
||||||
if not path.is_dir():
|
|
||||||
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
|
||||||
vocab = load_vocab(path)
|
|
||||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
|
||||||
ner_model = load_ner_model(vocab, path / 'ner')
|
|
||||||
return cls(vocab, tokenizer, ner_model)
|
|
||||||
|
|
||||||
def __init__(self, vocab=None, tokenizer=None, entity=None):
|
|
||||||
if vocab is None:
|
if vocab is None:
|
||||||
vocab = init_vocab()
|
vocab = init_vocab()
|
||||||
if tokenizer is None:
|
if tokenizer is None:
|
||||||
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
tokenizer = Tokenizer(vocab, {}, None, None, None)
|
||||||
|
if tensorizer is None:
|
||||||
|
tensorizer = TokenVectorEncoder(vocab)
|
||||||
if entity is None:
|
if entity is None:
|
||||||
entity = init_ner_model(self.vocab)
|
entity = NeuralEntityRecognizer(vocab)
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
|
self.tensorizer = tensorizer
|
||||||
self.entity = entity
|
self.entity = entity
|
||||||
self.pipeline = [self.entity]
|
self.pipeline = [tensorizer, self.entity]
|
||||||
|
|
||||||
|
def begin_training(self):
|
||||||
|
for model in self.pipeline:
|
||||||
|
model.begin_training([])
|
||||||
|
optimizer = Adam(NumpyOps(), 0.001)
|
||||||
|
return optimizer
|
||||||
|
|
||||||
def __call__(self, input_):
|
def __call__(self, input_):
|
||||||
doc = self.make_doc(input_)
|
doc = self.make_doc(input_)
|
||||||
|
@ -147,14 +95,18 @@ class Pipeline(object):
|
||||||
gold = GoldParse(doc, entities=annotations)
|
gold = GoldParse(doc, entities=annotations)
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
def update(self, input_, annot):
|
def update(self, inputs, annots, sgd, losses=None, drop=0.):
|
||||||
doc = self.make_doc(input_)
|
if losses is None:
|
||||||
gold = self.make_gold(input_, annot)
|
losses = {}
|
||||||
for ner in gold.ner:
|
docs = [self.make_doc(input_) for input_ in inputs]
|
||||||
if ner not in (None, '-', 'O'):
|
golds = [self.make_gold(input_, annot) for input_, annot in
|
||||||
action, label = ner.split('-', 1)
|
zip(inputs, annots)]
|
||||||
self.entity.add_label(label)
|
|
||||||
return self.entity.update(doc, gold)
|
tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
|
||||||
|
d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
|
||||||
|
sgd=sgd, losses=losses)
|
||||||
|
bp_tensors(d_tensors, sgd=sgd)
|
||||||
|
return losses
|
||||||
|
|
||||||
def evaluate(self, examples):
|
def evaluate(self, examples):
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
|
@ -164,34 +116,38 @@ class Pipeline(object):
|
||||||
scorer.score(doc, gold)
|
scorer.score(doc, gold)
|
||||||
return scorer.scores
|
return scorer.scores
|
||||||
|
|
||||||
def average_weights(self):
|
def to_disk(self, path):
|
||||||
self.entity.model.end_training()
|
|
||||||
|
|
||||||
def save(self, path):
|
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
path.mkdir()
|
path.mkdir()
|
||||||
elif not path.is_dir():
|
elif not path.is_dir():
|
||||||
raise IOError("Can't save pipeline to %s\nNot a directory" % path)
|
raise IOError("Can't save pipeline to %s\nNot a directory" % path)
|
||||||
save_vocab(self.vocab, path / 'vocab')
|
self.vocab.to_disk(path / 'vocab')
|
||||||
save_ner_model(self.entity, path / 'ner')
|
self.tensorizer.to_disk(path / 'tensorizer')
|
||||||
|
self.entity.to_disk(path / 'ner')
|
||||||
|
|
||||||
|
def from_disk(self, path):
|
||||||
|
path = Path(path)
|
||||||
|
if not path.exists():
|
||||||
|
raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
|
||||||
|
if not path.is_dir():
|
||||||
|
raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
|
||||||
|
self.vocab = self.vocab.from_disk(path / 'vocab')
|
||||||
|
self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
|
||||||
|
self.entity = self.entity.from_disk(path / 'ner')
|
||||||
|
|
||||||
|
|
||||||
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
|
def train(nlp, train_examples, dev_examples, nr_epoch=5):
|
||||||
next_epoch = train_examples
|
sgd = nlp.begin_training()
|
||||||
print("Iter", "Loss", "P", "R", "F")
|
print("Iter", "Loss", "P", "R", "F")
|
||||||
for i in range(nr_epoch):
|
for i in range(nr_epoch):
|
||||||
this_epoch = next_epoch
|
random.shuffle(train_examples)
|
||||||
next_epoch = []
|
losses = {}
|
||||||
loss = 0
|
for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
|
||||||
for input_, annot in this_epoch:
|
inputs, annots = zip(*batch)
|
||||||
loss += nlp.update(input_, annot)
|
nlp.update(list(inputs), list(annots), sgd, losses=losses)
|
||||||
if (i+1) < nr_epoch:
|
|
||||||
next_epoch.append((input_, annot))
|
|
||||||
random.shuffle(next_epoch)
|
|
||||||
scores = nlp.evaluate(dev_examples)
|
scores = nlp.evaluate(dev_examples)
|
||||||
report_scores(i, loss, scores)
|
report_scores(i, losses['ner'], scores)
|
||||||
nlp.average_weights()
|
|
||||||
scores = nlp.evaluate(dev_examples)
|
scores = nlp.evaluate(dev_examples)
|
||||||
report_scores(channels, i+1, loss, scores)
|
report_scores(channels, i+1, loss, scores)
|
||||||
|
|
||||||
|
@ -208,7 +164,8 @@ def read_examples(path):
|
||||||
with path.open() as file_:
|
with path.open() as file_:
|
||||||
sents = file_.read().strip().split('\n\n')
|
sents = file_.read().strip().split('\n\n')
|
||||||
for sent in sents:
|
for sent in sents:
|
||||||
if not sent.strip():
|
sent = sent.strip()
|
||||||
|
if not sent:
|
||||||
continue
|
continue
|
||||||
tokens = sent.split('\n')
|
tokens = sent.split('\n')
|
||||||
while tokens and tokens[0].startswith('#'):
|
while tokens and tokens[0].startswith('#'):
|
||||||
|
@ -217,28 +174,39 @@ def read_examples(path):
|
||||||
iob = []
|
iob = []
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
if token.strip():
|
if token.strip():
|
||||||
pieces = token.split()
|
pieces = token.split('\t')
|
||||||
words.append(pieces[1])
|
words.append(pieces[1])
|
||||||
iob.append(pieces[2])
|
iob.append(pieces[2])
|
||||||
yield words, iob_to_biluo(iob)
|
yield words, iob_to_biluo(iob)
|
||||||
|
|
||||||
|
|
||||||
|
def get_labels(examples):
|
||||||
|
labels = set()
|
||||||
|
for words, tags in examples:
|
||||||
|
for tag in tags:
|
||||||
|
if '-' in tag:
|
||||||
|
labels.add(tag.split('-')[1])
|
||||||
|
return sorted(labels)
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model_dir=("Path to save the model", "positional", None, Path),
|
model_dir=("Path to save the model", "positional", None, Path),
|
||||||
train_loc=("Path to your training data", "positional", None, Path),
|
train_loc=("Path to your training data", "positional", None, Path),
|
||||||
dev_loc=("Path to your development data", "positional", None, Path),
|
dev_loc=("Path to your development data", "positional", None, Path),
|
||||||
)
|
)
|
||||||
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
|
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
|
||||||
train_loc=None, dev_loc=None, nr_epoch=30):
|
print(model_dir, train_loc, dev_loc)
|
||||||
|
train_examples = list(read_examples(train_loc))
|
||||||
train_examples = read_examples(train_loc)
|
|
||||||
dev_examples = read_examples(dev_loc)
|
dev_examples = read_examples(dev_loc)
|
||||||
nlp = Pipeline.load(model_dir)
|
nlp = Pipeline()
|
||||||
|
for label in get_labels(train_examples):
|
||||||
|
nlp.entity.add_label(label)
|
||||||
|
print("Add label", label)
|
||||||
|
|
||||||
train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
|
train(nlp, train_examples, list(dev_examples), nr_epoch)
|
||||||
|
|
||||||
nlp.save(model_dir)
|
nlp.to_disk(model_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
plac.call(main)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user