# Mirror of https://github.com/explosion/spaCy.git
# Synced 2025-01-13 18:56:36 +03:00
#!/usr/bin/env python
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import io
import random
import time
import gzip
import re
import numpy
from math import sqrt

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.gold import GoldParse

from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf

# Last updated for spaCy v0.97
|
def read_conll(file_, n=0):
    """Read sentences from a CoNLL/MALT-style dependency file.

    Args:
        file_: Open text-mode file handle over CoNLL-formatted data:
            one token per line, sentences separated by blank lines.
        n: If non-zero, yield at most ``n`` sentences.  Zero (the
            default) means no limit.

    Yields:
        ``(None, [(annot_tuples, None)])`` pairs, where ``annot_tuples``
        is ``(ids, words, tags, heads, labels, iob)`` in spaCy's gold
        format.  The iob column is filled with 'O' placeholders since
        this format carries no entity annotation.
    """
    text = file_.read().strip()
    n_yielded = 0
    for sent_str in re.split(r'\n\s*\n', text):
        if not sent_str.strip():
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.strip().split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            # A negative head index marks the root token; spaCy's
            # convention is that the root is its own head.
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        yield (None, [(annot, None)])
        n_yielded += 1
        # Fix: the previous code compared the raw enumerate index after
        # yielding, so a limit of n actually produced n+1 sentences.
        if n and n_yielded >= n:
            break


def _parse_line(line):
|
|
pieces = line.split()
|
|
if len(pieces) == 4:
|
|
word, pos, head_idx, label = pieces
|
|
head_idx = int(head_idx)
|
|
elif len(pieces) == 15:
|
|
id_ = int(pieces[0].split('_')[-1])
|
|
word = pieces[1]
|
|
pos = pieces[4]
|
|
head_idx = int(pieces[8])-1
|
|
label = pieces[10]
|
|
else:
|
|
id_ = int(pieces[0].split('_')[-1])
|
|
word = pieces[1]
|
|
pos = pieces[4]
|
|
head_idx = int(pieces[6])-1
|
|
label = pieces[7]
|
|
if head_idx < 0:
|
|
label = 'ROOT'
|
|
return word, pos, head_idx, label
|
|
|
|
|
|
def print_words(strings, words, embeddings):
    """Print the embedding vector for each requested word, where one
    exists in ``embeddings[5]``.

    ``strings`` is used as a bidirectional lookup (word -> key and
    key -> word); ``embeddings[5]`` is iterated as (key, values) pairs.
    Words with no matching key are silently skipped.
    """
    wanted = {strings[word]: word for word in words}
    by_word = {}
    for key, values in embeddings[5]:
        if key in wanted:
            by_word[strings[key]] = values
    for word in words:
        if word in by_word:
            print(word, by_word[word])


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Parse one gold sentence with the current model and fold the
    result into ``scorer``.

    ``raw_text`` is unused; tokens are rebuilt from the gold word list
    and tagged with the gold tags, so only the parser is evaluated.
    """
    doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger.tag_from_strings(doc, annot_tuples[2])
    nlp.parser(doc)
    # Keep the gold non-projective so the score reflects the raw trees.
    gold = GoldParse(doc, annot_tuples, make_projective=False)
    scorer.score(doc, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def score_file(nlp, loc):
    """Parse every sentence in the CoNLL file at ``loc`` and return a
    ``Scorer`` holding the accumulated accuracy statistics."""
    scorer = Scorer()
    with io.open(loc, 'r', encoding='utf8') as conll_file:
        for _, sent_group in read_conll(conll_file):
            for annot_tuples, _ in sent_group:
                score_model(scorer, nlp, None, annot_tuples)
    return scorer


def score_sents(nlp, gold_tuples):
    """Evaluate the model on already-loaded gold annotations and return
    the populated ``Scorer``."""
    scorer = Scorer()
    for _, sent_group in gold_tuples:
        for annot_tuples, _ in sent_group:
            score_model(scorer, nlp, None, annot_tuples)
    return scorer


def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
          width=128, depth=3,
          learn_rate=0.001, noise=0.01, update_step='sgd_cm', regularization=0.0,
          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
    """Train a tagger and dependency parser on ``gold_tuples``.

    Args:
        Language: The language class to instantiate (e.g. ``English``).
        gold_tuples: Training data in spaCy's gold format.
        model_dir: Output directory; 'deps' and 'pos' subdirectories are
            recreated from scratch.
        dev_loc: Path to a CoNLL development file, scored each epoch.
        n_iter: Number of training epochs.
        feat_set: 'neural' selects the neural-network model config; any
            other value writes the linear-model config.
        width, depth: Hidden-layer width and count (neural model only).
        learn_rate, noise, update_step, regularization, batch_norm, seed:
            Optimisation hyper-parameters written into the config.
        gold_preproc, force_gold: Accepted for interface compatibility;
            currently unused in this body.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    # Start from a clean slate: remove any previous model directories.
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    if feat_set != 'neural':
        Config.write(dep_model_dir, 'config', feat_set=feat_set, seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     eta=learn_rate, rho=regularization)
    else:
        hidden_layers = [width] * depth
        Config.write(dep_model_dir, 'config',
                     model='neural',
                     seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     feat_set=feat_set,
                     hidden_layers=hidden_layers,
                     update_step=update_step,
                     batch_norm=batch_norm,
                     eta=learn_rate,
                     mu=0.9,
                     noise=noise,
                     rho=regularization)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    # Pre-populate the vocab with every training word so the lexemes
    # exist before the parser starts making updates.
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            for word in annot_tuples[1]:
                _ = nlp.vocab[word]
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    # Use the raw orth form as each word's norm (no normalisation).
    for word in nlp.vocab:
        word.norm = word.orth

    print(nlp.parser.model.widths)

    print("Itn.\tP.Loss\tTrain\tDev\tnr_weight\tnr_feat")
    eg_seen = 0
    # Small fixed sample of the training data for cheap progress checks.
    micro_eval = gold_tuples[:50]
    for itn in range(n_iter):
        try:
            eg_seen = _train_epoch(nlp, gold_tuples, eg_seen, itn,
                                   dev_loc, micro_eval)
        except KeyboardInterrupt:
            # Allow early stopping from the console; save what we have.
            print("Saving model...")
            break
        dev_uas = score_file(nlp, dev_loc).uas
        print("Dev before average", dev_uas)

    nlp.parser.model.end_training()
    nlp.parser.model.dump(path.join(model_dir, 'deps', 'model'))
    print("Saved. Evaluating...")
    return nlp


def _train_epoch(nlp, gold_tuples, eg_seen, itn, dev_loc, micro_eval):
    """Run one shuffled pass over the training data.

    Every 1000 examples, print a progress row (iteration, update count,
    accumulated loss, micro-eval train UAS, dev UAS, weight and feature
    counts); the learning rate decays by 1% at the end of the epoch.
    Note: shuffles ``gold_tuples`` in place.

    Returns:
        The updated running count of examples seen.
    """
    random.shuffle(gold_tuples)
    loss = 0
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
            gold = GoldParse(tokens, annot_tuples)
            loss += nlp.parser.train(tokens, gold, itn=itn)
            eg_seen += 1
            if eg_seen % 1000 == 0:
                # Full dev evaluation is expensive: do it only every
                # 20k examples; otherwise report 0.0 as a placeholder.
                if eg_seen % 20000 == 0:
                    dev_uas = score_file(nlp, dev_loc).uas
                else:
                    dev_uas = 0.0
                train_uas = score_sents(nlp, micro_eval).uas
                nr_upd = nlp.parser.model.time
                nr_weight = nlp.parser.model.nr_weight
                nr_feat = nlp.parser.model.nr_active_feat
                print('%d,%d:\t%d\t%.3f\t%.3f\t%d\t%d' % (itn, nr_upd, int(loss),
                                                          train_uas, dev_uas,
                                                          nr_weight, nr_feat))
                # Loss is reported per 1000-example window, not per epoch.
                loss = 0
    # Decay the learning rate once per epoch.
    nlp.parser.model.learn_rate *= 0.99
    return eg_seen


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    n_iter=("Number of training iterations", "option", "i", int),
    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
    update_step=("Update step", "option", "u", str),
    learn_rate=("Learn rate", "option", "e", float),
    regularization=("Regularization penalty", "option", "r", float),
    gradient_noise=("Gradient noise", "option", "W", float),
    neural=("Use neural network?", "flag", "N"),
    width=("Width of hidden layers", "option", "w", int),
    depth=("Number of hidden layers", "option", "d", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
         width=128, depth=3, learn_rate=0.001, gradient_noise=0.0, regularization=0.0,
         update_step='sgd_cm'):
    """Command-line entry point: load CoNLL training data, train an
    English parser, then print evaluation metrics on the dev file.

    Parameters mirror the plac annotations above; ``neural`` switches
    the feature set from 'basic' to 'neural'.
    """
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = list(read_conll(file_))
    # Preprocess training data here before ArcEager.get_labels() is called
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
                width=width, depth=depth,
                feat_set='neural' if neural else 'basic',
                batch_norm=batch_norm,
                learn_rate=learn_rate,
                regularization=regularization,
                update_step=update_step,
                noise=gradient_noise)

    # Final evaluation on the dev set with the finished model.
    scorer = score_file(nlp, dev_loc)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('nr_weight', nlp.parser.model.nr_weight)
    print('nr_feat', nlp.parser.model.nr_active_feat)


if __name__ == '__main__':
    # plac builds the argument parser from main's annotations and
    # dispatches the command line to it.
    plac.call(main)