From eb3057d8066f01fef1ed63ea8b35a7037b73738b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 6 Jul 2015 09:33:00 +0200
Subject: [PATCH] * Add updated unsupervised_train script, from the wsd
 directory

---
 bin/wsd/unsupervised_train.py | 47 ++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/bin/wsd/unsupervised_train.py b/bin/wsd/unsupervised_train.py
index c6535b438..653f34a9b 100644
--- a/bin/wsd/unsupervised_train.py
+++ b/bin/wsd/unsupervised_train.py
@@ -4,16 +4,16 @@ from __future__ import unicode_literals
 
 import os
 from os import path
-import shutil
-import codecs
 import random
+import shutil
 
 import plac
-import cProfile
-import pstats
-import re
+
+from spacy.munge.corpus import DocsDB
+from spacy.munge.read_semcor import read_semcor
 
 from spacy.en import English
+from spacy.syntax.util import Config
 
 
 def score_model(nlp, semcor_docs):
@@ -24,8 +24,11 @@ def score_model(nlp, semcor_docs):
     for pnum, para in paras:
         for snum, sent in para:
             words = [t.orth for t in sent]
+            if len(words) < 2:
+                continue
             tokens = nlp.tokenizer.tokens_from_list(words)
             nlp.tagger(tokens)
+            nlp.parser(tokens)
             nlp.senser(tokens)
             for i, token in enumerate(tokens):
                 if '_' in sent[i].orth:
@@ -33,40 +36,44 @@ def score_model(nlp, semcor_docs):
                 elif sent[i].supersense != 'NO_SENSE':
                     n_right += token.sense_ == sent[i].supersense
                     n_wrong += token.sense_ != sent[i].supersense
-    return n_multi, n_right, n_wrong
+    return n_right / (n_right + n_wrong)
 
 
-def train(Language, model_dir, docs, annotations, report_every=1000, n_docs=1000):
+def train(Language, model_dir, train_docs, dev_docs,
+          report_every=1000, n_docs=1000, seed=0):
     wsd_model_dir = path.join(model_dir, 'wsd')
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
+    if path.exists(wsd_model_dir):
+        shutil.rmtree(wsd_model_dir)
     os.mkdir(wsd_model_dir)
 
     Config.write(wsd_model_dir, 'config', seed=seed)
 
-    nlp = Language(data_dir=model_dir)
+    nlp = Language(data_dir=model_dir, load_vectors=False)
 
-    for doc in corpus:
-        tokens = nlp(doc, senser=False)
+    loss = 0
+    n_tokens = 0
+    for i, doc in enumerate(train_docs):
+        tokens = nlp(doc, parse=True, entity=False)
         loss += nlp.senser.train(tokens)
-        if i and not i % report_every:
+        n_tokens += len(tokens)
+        if i and i % report_every == 0:
             acc = score_model(nlp, dev_docs)
-            print loss, n_right / (n_right + n_wrong)
+            print i, loss / n_tokens, acc
     nlp.senser.end_training()
     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
 
 
 @plac.annotations(
-    docs_db_loc=("Location of the documents SQLite database"),
+    train_loc=("Location of the documents SQLite database"),
+    dev_loc=("Location of the SemCor corpus directory"),
     model_dir=("Location of the models directory"),
     n_docs=("Number of training documents", "option", "n", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-    debug=("Debug mode", "flag", "d", bool),
+    seed=("Random seed", "option", "s", int),
 )
-def main(train_loc, dev_loc, model_dir, n_docs=0):
-    train_docs = DocsDB(train_loc)
+def main(train_loc, dev_loc, model_dir, n_docs=1000000, seed=0):
+    train_docs = DocsDB(train_loc, limit=n_docs)
     dev_docs = read_semcor(dev_loc)
-    train(English, model_dir, train_docs, dev_docs, report_every=10, n_docs=1000):
+    train(English, model_dir, train_docs, dev_docs, report_every=100, seed=seed)
 
 
 if __name__ == '__main__':
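
Editor's note (not part of the patch): given the plac annotations in the new main(), plac's default
CLI mapping would make the updated script invocable roughly as follows; the paths below are
placeholders, and -n/-s come from the "option" abbreviations declared in the annotations:

    python bin/wsd/unsupervised_train.py /path/to/docs.db /path/to/semcor /path/to/models -n 100000 -s 0

The three positional arguments are train_loc (SQLite documents database), dev_loc (SemCor corpus
directory) and model_dir (models directory).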