#!/usr/bin/env python
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import re
import numpy
from math import sqrt
import plac
import cProfile
import pstats
import spacy.util
from spacy.en import English
from spacy.gold import GoldParse
from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf
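

# This script trains spaCy's transition-based (ArcEager) dependency parser on
# CoNLL-formatted data and evaluates it on a held-out development file. The
# training sentences are pseudo-projectivized in main() before the label set
# is computed, and the parser can use either the sparse 'basic' feature set
# or a neural-network model.
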
# Last updated for spaCy v0.97
def read_conll(file_, n=0):
    """Read a standard CoNLL/MALT-style format.

    Yields (text, [(annot_tuples, brackets)]) pairs in spaCy's gold-standard
    shape, where annot_tuples is (ids, words, tags, heads, labels, ner), the
    ner tags are all 'O', and text and brackets are None. If n is non-zero,
    reading stops after at most n + 1 sentences.
    """
    text = file_.read().strip()
    sent_strs = re.split(r'\n\s*\n', text)
    for sent_id, sent_str in enumerate(sent_strs):
        if not sent_str.strip():
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.strip().split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        yield (None, [(annot, None)])
        if n and sent_id >= n:
            break
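

# _parse_line() accepts three whitespace-separated column layouts, keyed on
# column count. The column indices suggest (an inference, not documented in
# the original):
#   4 columns  -> word, POS tag, head index, dependency label
#   15 columns -> CoNLL-2009 (FORM=col 2, POS=col 5, HEAD=col 9, DEPREL=col 11)
#   otherwise  -> CoNLL-X (FORM=col 2, POS=col 5, HEAD=col 7, DEPREL=col 8)
# CoNLL head indices are 1-based with 0 for the root, so the "- 1" leaves the
# root with head -1; read_conll() then attaches such tokens to themselves.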
def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        word, pos, head_idx, label = pieces
        head_idx = int(head_idx)
    elif len(pieces) == 15:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[8]) - 1
        label = pieces[10]
    else:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[6]) - 1
        label = pieces[7]
    if head_idx < 0:
        label = 'ROOT'
    return word, pos, head_idx, label
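

# Debugging helper, apparently unused in this script: print the stored
# embedding vector for each of the given words that has one.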
def print_words(strings, words, embeddings):
    ids = {strings[word]: word for word in words}
    vectors = {}
    for key, values in embeddings[5]:
        if key in ids:
            vectors[strings[key]] = values
    for word in words:
        if word in vectors:
            print(word, vectors[word])
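

# Scoring: tokens are created from the gold word list and tagged with the gold
# POS tags, so the UAS/LAS numbers isolate parsing errors. Tokens whose gold
# dependency label is punctuation ('--', 'p', 'punct') are excluded from the
# attachment scores.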
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples, make_projective=False)
    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def score_file(nlp, loc):
    scorer = Scorer()
    with io.open(loc, 'r', encoding='utf8') as file_:
        for _, sents in read_conll(file_):
            for annot_tuples, _ in sents:
                score_model(scorer, nlp, None, annot_tuples)
    return scorer


def score_sents(nlp, gold_tuples):
    scorer = Scorer()
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    return scorer
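

# train() wipes and recreates the model directories, writes the parser Config
# (either the sparse 'basic' feature set or the neural model), pre-loads the
# vocab with the training words, then runs n_iter epochs via _train_epoch().
# The model is saved when training finishes, or early on KeyboardInterrupt.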
def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
          width=128, depth=3,
          learn_rate=0.001, noise=0.01, update_step='sgd_cm', regularization=0.0,
          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    if feat_set != 'neural':
        Config.write(dep_model_dir, 'config', feat_set=feat_set, seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     eta=learn_rate, rho=regularization)
    else:
        hidden_layers = [width] * depth
        Config.write(dep_model_dir, 'config',
                     model='neural',
                     seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     feat_set=feat_set,
                     hidden_layers=hidden_layers,
                     update_step=update_step,
                     batch_norm=batch_norm,
                     eta=learn_rate,
                     mu=0.9,
                     noise=noise,
                     rho=regularization)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    # Insert into vocab
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            for word in annot_tuples[1]:
                _ = nlp.vocab[word]
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    #nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    for word in nlp.vocab:
        word.norm = word.orth
    print(nlp.parser.model.widths)

    print("Itn.\tP.Loss\tTrain\tDev\tnr_weight\tnr_feat")
    last_score = 0.0
    nr_trimmed = 0
    eg_seen = 0
    loss = 0
    micro_eval = gold_tuples[:50]
    for itn in range(n_iter):
        try:
            eg_seen = _train_epoch(nlp, gold_tuples, eg_seen, itn,
                                   dev_loc, micro_eval)
        except KeyboardInterrupt:
            print("Saving model...")
            break
        dev_uas = score_file(nlp, dev_loc).uas
        print("Dev before average", dev_uas)
    nlp.parser.model.end_training()
    nlp.parser.model.dump(path.join(model_dir, 'deps', 'model'))
    print("Saved. Evaluating...")
    return nlp
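

# One epoch over the shuffled training data. Progress is printed every 1,000
# examples; the (comparatively slow) full dev-set UAS is refreshed only every
# 20,000 examples, with 0.0 printed as a placeholder otherwise. The learning
# rate is decayed by 1% at the end of each epoch.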
def _train_epoch(nlp, gold_tuples, eg_seen, itn, dev_loc, micro_eval):
    random.shuffle(gold_tuples)
    loss = 0
    nr_trimmed = 0
    for _, sents in gold_tuples:
        for annot_tuples, _ in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
            gold = GoldParse(tokens, annot_tuples)
            loss += nlp.parser.train(tokens, gold, itn=itn)
            eg_seen += 1
            if eg_seen % 1000 == 0:
                if eg_seen % 20000 == 0:
                    dev_uas = score_file(nlp, dev_loc).uas
                else:
                    dev_uas = 0.0
                train_uas = score_sents(nlp, micro_eval).uas
                nr_upd = nlp.parser.model.time
                nr_weight = nlp.parser.model.nr_weight
                nr_feat = nlp.parser.model.nr_active_feat
                print('%d,%d:\t%d\t%.3f\t%.3f\t%d\t%d' % (itn, nr_upd, int(loss),
                                                          train_uas, dev_uas,
                                                          nr_weight, nr_feat))
                loss = 0
    nlp.parser.model.learn_rate *= 0.99
    return eg_seen


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    n_iter=("Number of training iterations", "option", "i", int),
    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
    update_step=("Update step", "option", "u", str),
    learn_rate=("Learn rate", "option", "e", float),
    regularization=("Regularization penalty", "option", "r", float),
    gradient_noise=("Gradient noise", "option", "W", float),
    neural=("Use neural network?", "flag", "N"),
    width=("Width of hidden layers", "option", "w", int),
    depth=("Number of hidden layers", "option", "d", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
         width=128, depth=3, learn_rate=0.001, gradient_noise=0.0,
         regularization=0.0, update_step='sgd_cm'):
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = list(read_conll(file_))
    # Preprocess training data here before ArcEager.get_labels() is called
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
                width=width, depth=depth,
                feat_set='neural' if neural else 'basic',
                batch_norm=batch_norm,
                learn_rate=learn_rate,
                regularization=regularization,
                update_step=update_step,
                noise=gradient_noise)

    scorer = score_file(nlp, dev_loc)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('nr_weight', nlp.parser.model.nr_weight)
    print('nr_feat', nlp.parser.model.nr_active_feat)


if __name__ == '__main__':
    plac.call(main)
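
# Example invocation (hypothetical paths; option letters come from the plac
# annotations above), training the neural model for 10 iterations:
#
#   python conll_train.py train.conll dev.conll /tmp/parser_model -N -i 10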