From eb3057d8066f01fef1ed63ea8b35a7037b73738b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 6 Jul 2015 09:33:00 +0200
Subject: [PATCH] * Add updated unsupervised_train script, from the wsd
 directory

---
 bin/wsd/unsupervised_train.py | 47 ++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/bin/wsd/unsupervised_train.py b/bin/wsd/unsupervised_train.py
index c6535b438..653f34a9b 100644
--- a/bin/wsd/unsupervised_train.py
+++ b/bin/wsd/unsupervised_train.py
@@ -4,16 +4,16 @@ from __future__ import unicode_literals
 
 import os
 from os import path
-import shutil
-import codecs
 import random
+import shutil
 
 import plac
-import cProfile
-import pstats
-import re
+
+from spacy.munge.corpus import DocsDB
+from spacy.munge.read_semcor import read_semcor
 
 from spacy.en import English
+from spacy.syntax.util import Config
 
 
 def score_model(nlp, semcor_docs):
@@ -24,8 +24,11 @@ def score_model(nlp, semcor_docs):
     for pnum, para in paras:
         for snum, sent in para:
             words = [t.orth for t in sent]
+            if len(words) < 2:
+                continue
             tokens = nlp.tokenizer.tokens_from_list(words)
             nlp.tagger(tokens)
+            nlp.parser(tokens)
             nlp.senser(tokens)
             for i, token in enumerate(tokens):
                 if '_' in sent[i].orth:
@@ -33,40 +36,44 @@ def score_model(nlp, semcor_docs):
                 elif sent[i].supersense != 'NO_SENSE':
                     n_right += token.sense_ == sent[i].supersense
                     n_wrong += token.sense_ != sent[i].supersense
-    return n_multi, n_right, n_wrong
+    return n_right / (n_right + n_wrong)
 
 
-def train(Language, model_dir, docs, annotations, report_every=1000, n_docs=1000):
+def train(Language, model_dir, train_docs, dev_docs,
+          report_every=1000, n_docs=1000, seed=0):
     wsd_model_dir = path.join(model_dir, 'wsd')
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
+    if path.exists(wsd_model_dir):
+        shutil.rmtree(wsd_model_dir)
     os.mkdir(wsd_model_dir)
 
     Config.write(wsd_model_dir, 'config', seed=seed)
 
-    nlp = Language(data_dir=model_dir)
+    nlp = Language(data_dir=model_dir, load_vectors=False)
 
-    for doc in corpus:
-        tokens = nlp(doc, senser=False)
+    loss = 0
+    n_tokens = 0
+    for i, doc in enumerate(train_docs):
+        tokens = nlp(doc, parse=True, entity=False)
         loss += nlp.senser.train(tokens)
-        if i and not i % report_every:
+        n_tokens += len(tokens)
+        if i and i % report_every == 0:
             acc = score_model(nlp, dev_docs)
-            print loss, n_right / (n_right + n_wrong)
+            print i, loss / n_tokens, acc
     nlp.senser.end_training()
     nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
 
 
 @plac.annotations(
-    docs_db_loc=("Location of the documents SQLite database"),
+    train_loc=("Location of the documents SQLite database"),
+    dev_loc=("Location of the SemCor corpus directory"),
     model_dir=("Location of the models directory"),
     n_docs=("Number of training documents", "option", "n", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-    debug=("Debug mode", "flag", "d", bool),
+    seed=("Random seed", "option", "s", int),
 )
-def main(train_loc, dev_loc, model_dir, n_docs=0):
-    train_docs = DocsDB(train_loc)
+def main(train_loc, dev_loc, model_dir, n_docs=1000000, seed=0):
+    train_docs = DocsDB(train_loc, limit=n_docs)
     dev_docs = read_semcor(dev_loc)
-    train(English, model_dir, train_docs, dev_docs, report_every=10, n_docs=1000):
+    train(English, model_dir, train_docs, dev_docs, report_every=100, seed=seed)
 
 
 if __name__ == '__main__':
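
Editor's note (not part of the patch): given the plac annotations in the new main(), plac's default
CLI mapping would make the updated script invocable roughly as follows; the paths below are
placeholders, and -n/-s come from the "option" abbreviations declared in the annotations:

    python bin/wsd/unsupervised_train.py /path/to/docs.db /path/to/semcor /path/to/models -n 100000 -s 0

The three positional arguments are train_loc (SQLite documents database), dev_loc (SemCor corpus
directory) and model_dir (models directory).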