* Update conll_train for tagger, to use neural network tagger

Matthew Honnibal 2016-02-22 00:16:40 +01:00
parent c3f334cef1
commit 5f53ef1a43

bin/tagger/conll_train.py (new executable file, 181 lines)

@@ -0,0 +1,181 @@
#!/usr/bin/env python
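"""Train and evaluate a part-of-speech tagger on CoNLL-formatted data with
spaCy's neural-network tagger.

Example invocation (hypothetical file paths; see the plac annotations on
main() for the options):

    ./conll_train.py de train.conll dev.conll /tmp/de_model -i 15 -e 0.005
"""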
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
from os import path
import shutil
import codecs
import random
import time
import gzip
import plac
import cProfile
import pstats
import numpy.random
from spacy.en import English
from spacy.de import German
import spacy.util
from spacy.syntax.util import Config
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.tagger import P2_orth, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_flags
from spacy.tagger import P1_orth, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_flags
from spacy.tagger import W_orth, W_shape, W_prefix, W_suffix, W_pos, W_flags
from spacy.tagger import N1_orth, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_flags
from spacy.tagger import N2_orth, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_flags
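
# Feature templates for the tagger, keyed by language ID. Each name combines
# a position with an atomic feature: P2/P1 are the two preceding tokens, W is
# the current token, N1/N2 the two following tokens; orth, shape, prefix,
# suffix, pos and flags are the attributes extracted at that position.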
templates = {
'de': [
(W_orth,),
(P1_orth, P1_pos),
(P2_orth, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,)
]
}


def read_conll(file_):
"""Read a standard CoNLL/MALT-style format"""
sents = []
for sent_str in file_.read().strip().split('\n\n'):
words = []
tags = []
        for line in sent_str.split('\n'):
if line.startswith('#'):
continue
idx, word, pos_string = _parse_line(line)
words.append(word)
tags.append(pos_string)
sents.append((words, tags))
    return sents


def _parse_line(line):
pieces = line.split()
id_ = int(pieces[0].split('_')[-1])-1
word = pieces[1]
pos = pieces[4]
    return id_, word, pos


def score_model(nlp, gold_tuples, verbose=False):
scorer = Scorer()
for words, gold_tags in gold_tuples:
tokens = nlp.tokenizer.tokens_from_list(words)
nlp.tagger(tokens)
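        # Tags align one-to-one with tokens, so each miss counts as both a
        # false positive and a false negative; precision, recall and F over
        # the tags therefore all reduce to plain per-token accuracy.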
for token, gold in zip(tokens, gold_tags):
scorer.tags.tp += token.tag_ == gold
scorer.tags.fp += token.tag_ != gold
scorer.tags.fn += token.tag_ != gold
    return scorer.tags_acc


def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
gold_preproc=False, eta=0.005):
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(pos_model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    # Pre-populate the vocab so every training word has a lexeme entry before
    # the tagger is built: looking a word up inserts it. Yes, confusing...
for words, tags in train_sents:
for word in words:
_ = nlp.vocab[word]
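    # Build an untrained tagger over the shared vocab. Note that only the
    # 'de' template set is defined above, so it is used for every language.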
nlp.tagger = Tagger.blank(nlp.vocab, templates['de'], learn_rate=eta)
print(nlp.tagger.model.widths)
    print("Itn.\tTrain\tCheck\tDev\tEta")
nr_train = len(train_sents)
random.shuffle(train_sents)
heldout_sents = train_sents[:int(nr_train * 0.1)]
train_sents = train_sents[len(heldout_sents):]
assert len(heldout_sents) < len(train_sents)
prev_score = 0.0
variance = 0.001
last_good_learn_rate = nlp.tagger.model.eta
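    # Adaptive learning-rate schedule, as implemented below: while the
    # held-out score improves, commit the weights and grow eta by 5%; on a
    # drop, roll the weights back and resample eta from a normal centred on
    # the last good value, widening the variance after each failed try. The
    # score to beat also decays slightly, so training cannot stall forever.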
for itn in range(n_iter):
random.shuffle(train_sents)
acc = 0
total = 0
for words, gold_tags in train_sents:
tokens = nlp.tokenizer.tokens_from_list(words)
acc += nlp.tagger.train(tokens, gold_tags)
total += len(tokens)
dev_score = score_model(nlp, heldout_sents)
eval_score = score_model(nlp, dev_sents)
if dev_score >= prev_score:
nlp.tagger.model.keep_update()
prev_score = dev_score
variance = 0.001
last_good_learn_rate = nlp.tagger.model.eta
nlp.tagger.model.eta *= 1.05
print('%d:\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, acc/total, dev_score, eval_score, nlp.tagger.model.eta))
else:
nlp.tagger.model.backtrack()
new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
if new_eta >= 0.00001:
nlp.tagger.model.eta = new_eta
else:
nlp.tagger.model.eta = 0.00001
print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
variance *= 1.1
prev_score *= 0.9999
nlp.end_training(data_dir=model_dir)
    return nlp


@plac.annotations(
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eta=("Learning rate for Adagrad optimizer", "option", "e", float),
n_iter=("Number of training iterations", "option", "i", int),
)
def main(lang_id, train_loc, dev_loc, model_dir, n_iter=5, eta=0.005):
    # Only English and German are imported above; 'fi' and 'it' would need
    # their spacy.<lang> classes imported before they could be used here.
    if lang_id == 'en':
        Language = English
    elif lang_id == 'de':
        Language = German
    else:
        raise ValueError("Unsupported lang_id %r (expected 'en' or 'de')" % lang_id)
with codecs.open(train_loc, 'r', 'utf8') as file_:
train_sents = read_conll(file_)
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        dev_sents = read_conll(file_)
nlp = train(Language, train_sents, dev_sents, model_dir, n_iter=n_iter, eta=eta)
#nlp = Language(data_dir=model_dir)
    # score_model returns the tag accuracy directly (a float), not a Scorer,
    # and tokenization comes from tokens_from_list, so only POS is scored.
    tags_acc = score_model(nlp, dev_sents)
    print('POS', tags_acc)


if __name__ == '__main__':
plac.call(main)