Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00

Delete old training scripts (resolves #911)

This commit is contained in:
parent 3f20efe165
commit 8bc05c2ba9
@@ -1,130 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config


def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'


def read_gold(file_):
    """Read a standard CoNLL/MALT-style format."""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                # A head of -1 marks the root.
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        # Tuple layout matches what iter_data() unpacks:
        # (raw, tokenized, ids, words, tags, heads, labels)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents


def _parse_line(line):
    # Whitespace-delimited columns: 0=id, 1=word, 3=POS tag, 6=head id, 7=dep label.
    pieces = line.split()
    id_ = int(pieces[0])
    word = pieces[1]
    pos = pieces[3]
    head_idx = int(pieces[6])
    label = pieces[7]
    return id_, word, pos, head_idx, label
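
For reference, _parse_line expects whitespace-separated CoNLL/MALT-style columns, with -1 in the head column marking the root. A tiny illustrative input (the words and tags here are invented, not taken from the original training data):

1	The	_	DT	_	_	2	det
2	cat	_	NN	_	_	3	nsubj
3	sat	_	VBD	_	_	-1	ROOT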


def iter_data(paragraphs, tokenizer, gold_preproc=False):
    for raw, tokenized, ids, words, tags, heads, labels in paragraphs:
        assert len(words) == len(heads)
        for words in tokenized:
            # Consume the paragraph's annotations one sentence at a time.
            sent_ids = ids[:len(words)]
            sent_tags = tags[:len(words)]
            sent_heads = heads[:len(words)]
            sent_labels = labels[:len(words)]
            sent_heads = _map_indices_to_tokens(sent_ids, sent_heads)
            tokens = tokenizer.tokens_from_list(words)
            yield tokens, sent_tags, sent_heads, sent_labels
            ids = ids[len(words):]
            tags = tags[len(words):]
            heads = heads[len(words):]
            labels = labels[len(words):]


def _map_indices_to_tokens(ids, heads):
    mapped = []
    for head in heads:
        if head not in ids:
            mapped.append(None)
        else:
            mapped.append(ids.index(head))
    return mapped
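
_map_indices_to_tokens converts head ids (as written in the file) into positions within the current sentence, yielding None for any head that falls outside it. A hypothetical call, with invented ids:

# ids are the file's token ids; heads refer back to those ids.
_map_indices_to_tokens([11, 12, 13], [12, 13, 99])
# -> [1, 2, None]   (99 is not among the ids, so that token is skipped in scoring)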


def evaluate(Language, dev_loc, model_dir):
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except:
                print i, token.orth_, token.tag
                raise
            n_tokens += 1
            if heads[i] is None:
                # Head fell outside the sentence; don't score this token.
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                # Punctuation attachments are excluded from the score.
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    print loss, skipped, (loss + skipped + total)
    print pos_corr / n_tokens
    return float(n_corr) / (total + loss)


def main(dev_loc, model_dir):
    print evaluate(English, dev_loc, model_dir)


if __name__ == '__main__':
    plac.call(main)
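The script is a plac CLI taking the development file and model directory as positional arguments. A hypothetical invocation (the script and path names are invented for illustration; the original file name is not shown in this diff):

python evaluate_conll.py dev.conll models/en
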
@@ -1,261 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer

from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel

import theano
import theano.tensor as T

from theano.printing import Print

import numpy
from collections import OrderedDict, defaultdict


theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX


def L1(L1_reg, *weights):
    # Symbolic L1 penalty: summed absolute values of all weights.
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    # Symbolic L2 penalty: summed squares of all weights.
    return L2_reg * sum((w ** 2).sum() for w in weights)
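
These penalty helpers are defined but never applied anywhere in this script. If they were used, the usual pattern would be to add the penalty to the symbolic loss inside compile_model() before gradients are taken. A hedged sketch, with an arbitrary example coefficient:

# Hypothetical: regularize the two weight matrices built in compile_model().
loss = loss + L2(1e-6, maxent_W, hidden_W)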


def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        # One accumulator per parameter, tracking the decayed mean of squared gradients.
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates
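
The update rule above is standard RMSProp. A minimal NumPy sketch of a single step, with all names invented here for illustration:

import numpy

def rms_prop_step(param, grad, accu, eta=1.0, rho=0.9, eps=1e-6):
    # Exponentially decayed average of squared gradients.
    accu = rho * accu + (1 - rho) * grad ** 2
    # Scale the step by the root of the accumulated gradient magnitude.
    param = param - eta * grad / numpy.sqrt(accu + eps)
    return param, accu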


def relu(x):
    # Rectified linear unit, written as a mask rather than T.maximum.
    return x * (x > 0)


def feed_layer(activation, weights, bias, input_):
    return activation(T.dot(input_, weights) + bias)


def init_weights(n_in, n_out):
    rng = numpy.random.RandomState(1235)

    # He-style initialisation, scaled for ReLU units.
    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return [theano.shared(weights, name='W'), theano.shared(bias, name='b')]


def compile_model(n_classes, n_hidden, n_in, optimizer):
    x = T.vector('x')
    costs = T.ivector('costs')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network
    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            relu,
            hidden_W,
            hidden_b,
            x))

    # Cost-sensitive log-loss: maximise the probability mass assigned to
    # zero-cost (i.e. gold-optimal) classes.
    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[
            feed_layer(
                T.nnet.softmax,
                maxent_W,
                maxent_b,
                feed_layer(
                    relu,
                    hidden_W,
                    hidden_b,
                    x
                )
            )[0]
        ]
    )
    return train_model, evaluate_model
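
A hypothetical use of the compiled pair, with all sizes invented for illustration: train_model takes a dense feature vector plus a per-class cost vector (0 marks a gold-optimal move) and returns the class probabilities, the gradient with respect to the input, and the loss; evaluate_model returns the probabilities alone.

optimizer = lambda loss, params: rms_prop(loss, params, eta=0.01)
train_model, evaluate_model = compile_model(50, 100, 300, optimizer)

x = numpy.zeros((300,), dtype=floatX)      # features for one parse state
costs = numpy.ones((50,), dtype='int32')   # assume only class 0 is gold-optimal
costs[0] = 0
scores, d_x, loss = train_model(x, costs)  # one gradient step
scores, = evaluate_model(x)                # inference only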


def score_model(scorer, nlp, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, rho=0.9, eps=1e-6,
          nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu)

    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    # Skip single-token sentences; there is nothing to parse.
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'a') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
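
Note that train() reads n_in, input_spec and model_loc without binding them anywhere, so the function cannot run as written. One plausible wiring, offered purely as an assumption rather than a reconstruction of the deleted script:

# Hypothetical definitions; none of these come from the original code.
n_in = nv_word + nv_tag + nv_label             # assumed: width of the input layer
input_spec = (nv_word, nv_tag, nv_label)       # assumed: whatever TheanoModel expects
model_loc = path.join(dep_model_dir, 'model')  # assumed: on-disk model path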


def evaluate(nlp, gold_tuples, gold_preproc=True):
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),

    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):
    # Train only on documents from the Wall Street Journal portion.
    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # TOK is printed as an error rate (100 - accuracy); the rest are accuracies.
    print 'TOK', 100 - scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)
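
As with the evaluation script, this is a plac CLI; a hypothetical invocation (the script name and paths are invented for illustration):

python train_theano.py corpus/train.json corpus/dev.json models/en -i 15 -H 100 -E 0.01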