#!/usr/bin/env python
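"""Train spaCy's transition-based dependency parser on CoNLL-formatted data.

Reads a training and a development file in CoNLL/MALT-style column format,
builds a blank tagger (gold tags are used during training and evaluation),
trains the arc-eager parser for a number of iterations, and writes the
resulting model to the output directory. See the plac annotations on main()
for the command-line options. (This summary is inferred from the script
itself; it is not part of the original file.)
"""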
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import re
import numpy
from math import sqrt
import plac
import cProfile
import pstats
import spacy.util
from spacy.en import English
from spacy.gold import GoldParse
from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf

# Last updated for spaCy v0.97


def read_conll(file_, n=0):
    """Read a standard CoNLL/MALT-style format."""
    text = file_.read().strip()
    sent_strs = re.split(r'\n\s*\n', text)
    for sent_id, sent_str in enumerate(sent_strs):
        if not sent_str.strip():
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.strip().split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                # Unattached tokens are treated as their own head.
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        # Match spaCy's gold-standard tuple layout; no NER, so all-'O' tags.
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        yield (None, [(annot, None)])
        if n and sent_id >= n:
            break


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        # Bare 4-column format: word, POS tag, head index, dependency label.
        word, pos, head_idx, label = pieces
        head_idx = int(head_idx)
    elif len(pieces) == 15:
        # CoNLL 2009 layout: HEAD in column 9, DEPREL in column 11.
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[8]) - 1
        label = pieces[10]
    else:
        # CoNLL-X/MALT layout: HEAD in column 7, DEPREL in column 8.
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[6]) - 1
        label = pieces[7]
    if head_idx < 0:
        label = 'ROOT'
    return word, pos, head_idx, label
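
# For reference, a CoNLL-X style line that the final branch above would
# accept looks like the following (values are illustrative only, not drawn
# from any real treebank):
#   1   Pierre   pierre   NNP   NNP   _   2   nsubj   _   _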


def print_words(strings, words, embeddings):
    ids = {strings[word]: word for word in words}
    vectors = {}
    for key, values in embeddings[5]:
        if key in ids:
            vectors[strings[key]] = values
    for word in words:
        if word in vectors:
            print(word, vectors[word])


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    # Tokenization and tags are taken from the gold standard, so only the
    # parse itself is being evaluated.
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples, make_projective=False)
    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def score_file(nlp, loc):
    scorer = Scorer()
    with io.open(loc, 'r', encoding='utf8') as file_:
        for _, sents in read_conll(file_):
            for annot_tuples, _ in sents:
                score_model(scorer, nlp, None, annot_tuples)
    return scorer


def score_sents(nlp, gold_tuples):
    scorer = Scorer()
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    return scorer


def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
          width=128, depth=3,
          learn_rate=0.001, noise=0.01, update_step='sgd_cm', regularization=0.0,
          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    if feat_set != 'neural':
        Config.write(dep_model_dir, 'config', feat_set=feat_set, seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     eta=learn_rate, rho=regularization)
    else:
        hidden_layers = [width] * depth
        Config.write(dep_model_dir, 'config',
                     model='neural',
                     seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     feat_set=feat_set,
                     hidden_layers=hidden_layers,
                     update_step=update_step,
                     batch_norm=batch_norm,
                     eta=learn_rate,
                     mu=0.9,
                     noise=noise,
                     rho=regularization)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    # Insert the training vocabulary into the vocab up front.
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            for word in annot_tuples[1]:
                _ = nlp.vocab[word]
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    #nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    for word in nlp.vocab:
        word.norm = word.orth

    print(nlp.parser.model.widths)
    print("Itn.\tP.Loss\tTrain\tDev\tnr_weight\tnr_feat")
    eg_seen = 0
    # A small slice of the training data gives a cheap accuracy estimate
    # during each epoch.
    micro_eval = gold_tuples[:50]
    for itn in range(n_iter):
        try:
            eg_seen = _train_epoch(nlp, gold_tuples, eg_seen, itn,
                                   dev_loc, micro_eval)
        except KeyboardInterrupt:
            print("Saving model...")
            break
    # Score the dev set with the raw weights before averaging is applied.
    dev_uas = score_file(nlp, dev_loc).uas
    print("Dev before average", dev_uas)
    nlp.parser.model.end_training()
    nlp.parser.model.dump(path.join(model_dir, 'deps', 'model'))
    print("Saved. Evaluating...")
    return nlp


def _train_epoch(nlp, gold_tuples, eg_seen, itn, dev_loc, micro_eval):
    random.shuffle(gold_tuples)
    loss = 0
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
            gold = GoldParse(tokens, annot_tuples)
            loss += nlp.parser.train(tokens, gold, itn=itn)
            eg_seen += 1
            if eg_seen % 1000 == 0:
                # Full dev-set evaluation is expensive, so only run it every
                # 20,000 examples; otherwise report 0.0 as a placeholder.
                if eg_seen % 20000 == 0:
                    dev_uas = score_file(nlp, dev_loc).uas
                else:
                    dev_uas = 0.0
                train_uas = score_sents(nlp, micro_eval).uas
                nr_upd = nlp.parser.model.time
                nr_weight = nlp.parser.model.nr_weight
                nr_feat = nlp.parser.model.nr_active_feat
                print('%d,%d:\t%d\t%.3f\t%.3f\t%d\t%d' % (
                    itn, nr_upd, int(loss), train_uas, dev_uas,
                    nr_weight, nr_feat))
                loss = 0
    # Decay the learning rate after each epoch.
    nlp.parser.model.learn_rate *= 0.99
    return eg_seen


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    n_iter=("Number of training iterations", "option", "i", int),
    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
    update_step=("Update step", "option", "u", str),
    learn_rate=("Learning rate", "option", "e", float),
    regularization=("Regularization penalty", "option", "r", float),
    gradient_noise=("Gradient noise", "option", "W", float),
    neural=("Use neural network?", "flag", "N"),
    width=("Width of hidden layers", "option", "w", int),
    depth=("Number of hidden layers", "option", "d", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
         width=128, depth=3, learn_rate=0.001, gradient_noise=0.0, regularization=0.0,
         update_step='sgd_cm'):
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = list(read_conll(file_))
    # Preprocess the training data here, before ArcEager.get_labels() is called.
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
                width=width, depth=depth,
                feat_set='neural' if neural else 'basic',
                batch_norm=batch_norm,
                learn_rate=learn_rate,
                regularization=regularization,
                update_step=update_step,
                noise=gradient_noise)
    scorer = score_file(nlp, dev_loc)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('nr_weight', nlp.parser.model.nr_weight)
    print('nr_feat', nlp.parser.model.nr_active_feat)


if __name__ == '__main__':
    plac.call(main)
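
# Example invocation (paths and settings are illustrative, mapped from the
# plac annotations above):
#   python conll_train.py train.conll dev.conll /tmp/parser_model -N -w 128 -d 3 -i 15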