spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ['.', "'", "!", "?"]:
        return ''
    else:
        return c.lower()


def add_noise(orig, noise_level):
    """Randomly corrupt a raw text string or a list of words.

    The noise is gated twice: one random draw decides whether the input is
    touched at all, and ``_corrupt`` then makes its own draw per element.

    Args:
        orig: Either a list of words or a string.
        noise_level: Threshold compared against ``random.random()``; a draw
            at or above it leaves the data unchanged.

    Returns:
        ``orig`` itself when the first draw is at or above ``noise_level``.
        Otherwise, for a list, a new list of corrupted words with empty
        results dropped; for anything else, a string built by joining the
        corrupted elements.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance (rather than an exact type comparison) so list subclasses
    # are treated as word lists instead of being joined into one string.
    if isinstance(orig, list):
        noisy_words = (_corrupt(word, noise_level) for word in orig)
        # Words corrupted to the empty string are removed entirely.
        return [word for word in noisy_words if word]
    return ''.join(_corrupt(c, noise_level) for c in orig)


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Run the current models over one example and add the result to ``scorer``.

    When ``raw_text`` is None the document is built from the gold word list
    (``annot_tuples[1]``); otherwise ``raw_text`` is tokenized. The tagger,
    entity recogniser and parser are applied in that order, and the result
    is scored against a ``GoldParse`` built from ``annot_tuples``.
    """
    tokenizer = nlp.tokenizer
    if raw_text is not None:
        doc = tokenizer(raw_text)
    else:
        doc = tokenizer.tokens_from_list(annot_tuples[1])
    # Run each component over the document; their return values are unused.
    nlp.tagger(doc)
    nlp.entity(doc)
    nlp.parser(doc)
    reference = GoldParse(doc, annot_tuples)
    scorer.score(doc, reference, verbose=verbose)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train the tagger, parser and entity recogniser from gold-standard data.

    The 'deps', 'pos' and 'ner' sub-directories of ``model_dir`` are removed
    if present and recreated, model configuration is written into them, and
    the models are then updated example by example for ``n_iter`` passes over
    ``gold_tuples``. One tab-separated progress row is printed per pass.
    Finally ``end_training()`` is called on each model and the string store
    is dumped to ``<model_dir>/vocab/strings.txt``.

    Args:
        Language: Language class, instantiated as ``Language(data_dir=model_dir)``.
        gold_tuples: List of ``(raw_text, sents)`` pairs, each item of
            ``sents`` being ``(annot_tuples, brackets)``. NOTE: unless
            ``n_sents`` is positive (which takes a slice first), this list is
            shuffled in place after every pass.
        model_dir: Output directory. Sub-directories are created with
            ``os.mkdir``, so ``model_dir`` itself is expected to exist.
        n_iter: Number of passes over the training data.
        feat_set: Feature-set name written to the parser config.
        seed: Written to both configs; this function does not itself seed
            the ``random`` module.
        gold_preproc: If true, train on the gold word lists sentence by
            sentence; otherwise merge each paragraph's sentences and
            tokenize its raw text.
        n_sents: If positive, keep only the first ``n_sents`` items of
            ``gold_tuples``. Labels are collected before this truncation.
        corruption_level: Noise level passed to ``add_noise``.
        beam_width: Beam width written to the parser config (the NER config
            always gets 0).
        verbose: Passed to the scorer from the third pass onwards.
        use_orig_arc_eager: Accepted for interface compatibility; not read
            by this function.

    Raises:
        Exception: If a gold parse is still non-projective after
            ``GoldParse`` was asked to projectivize it.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    component_dirs = (dep_model_dir, pos_model_dir, ner_model_dir)
    # Start from empty component directories: clear stale ones, then recreate.
    for component_dir in component_dirs:
        if path.exists(component_dir):
            shutil.rmtree(component_dir)
    for component_dir in component_dirs:
        os.mkdir(component_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir)

    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 print statement or a Python 3 function call.
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, _ in sents:
                # One-word examples are skipped entirely (neither scored nor trained on).
                if len(annot_tuples[1]) == 1:
                    continue
                # Score with the current weights before updating on this example.
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        # Shuffling happens after the pass, so the first pass uses the given order.
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                               scorer.tags_acc,
                                               scorer.token_acc))
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    """Score the models stored in ``model_dir`` against ``gold_tuples``.

    With ``gold_preproc`` each gold sentence is scored separately from its
    gold word list; otherwise the sentences of a paragraph are merged and,
    when raw text is available, the full pipeline is run on that text.
    A ``beam_width`` other than None overrides the loaded parser's setting.

    Returns the ``Scorer`` holding the accumulated results.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, _ in sents:
            if raw_text is not None:
                doc = nlp(raw_text, merge_mwes=False)
            else:
                # No raw text: start from the gold words and run each
                # component explicitly.
                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(doc)
                nlp.entity(doc)
                nlp.parser(doc)
            reference = GoldParse(doc, annot_tuples)
            scorer.score(doc, reference, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the data at ``dev_loc`` and write one line per token to ``out_loc``.

    Each output line holds the token text, tag, head text and dependency
    label, separated by tabs. The file is written as UTF-8 and is closed
    when the function exits, including when an error is raised.

    Args:
        Language: Language class, instantiated as ``Language(data_dir=model_dir)``.
        dev_loc: Location passed to ``read_json_file``.
        model_dir: Directory holding the trained models.
        out_loc: Path of the output file (overwritten).
        beam_width: If not None, overrides the loaded parser's beam width.

    Returns:
        The ``Scorer`` holding results accumulated over the parsed data.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager guarantees the output is flushed and the handle released.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _ in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    beam_width=("Number of candidates to maintain in the beam", "option", "k", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    use_orig_arc_eager=("Use the original, monotonic arc-eager system", "flag", "m", bool)
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1,
         eval_only=False, use_orig_arc_eager=False):
    """Command-line entry point: optionally train, then evaluate and report.

    Unless ``eval_only`` is set, the English models are trained on the data
    read from ``train_loc`` and saved under ``model_dir``. The models in
    ``model_dir`` are then evaluated on ``dev_loc`` and the tokenization
    figure (100 minus token accuracy), tag accuracy, UAS, LAS and NER
    precision/recall/F are printed, one per line.

    ``debug`` selects the 'debug' feature set instead of 'basic'.
    ``out_loc`` is accepted but currently unused: the ``write_parses`` call
    below is commented out.
    """
    if use_orig_arc_eager:
        # NOTE(review): TreeArcEager is not imported or defined anywhere in
        # the visible part of this file, so this branch looks like it will
        # raise NameError -- TODO confirm and add the missing import.
        English.ParserTransitionSystem = TreeArcEager
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              beam_width=beam_width, verbose=verbose,
              use_orig_arc_eager=use_orig_arc_eager)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    scorer = evaluate(English, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose,
                      beam_width=beam_width)
    # Each line is printed as one pre-formatted string so the output is the
    # same whether print is a Python 2 statement or a Python 3 function.
    print('TOK %s' % (100 - scorer.token_acc,))
    print('POS %s' % (scorer.tags_acc,))
    print('UAS %s' % (scorer.uas,))
    print('LAS %s' % (scorer.las,))

    print('NER P %s' % (scorer.ents_p,))
    print('NER R %s' % (scorer.ents_r,))
    print('NER F %s' % (scorer.ents_f,))


# Script entry point: hand `main` to plac, which parses the command line
# according to the annotations declared on `main` and then calls it.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 20:53:26 +03:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`

			`import os`
			`from os import path`
			`import shutil`
			`import codecs`
			`import random`

			`import plac`
			`import cProfile`
			`import pstats`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`import re`
* Add parser training script 2015-01-09 20:53:26 +03:00
			`import spacy.util`
			`from spacy.en import English`
			`from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir`

			`from spacy.syntax.util import Config`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`from spacy.gold import read_json_file`
			`from spacy.gold import GoldParse`
* Add parser training script 2015-01-09 20:53:26 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`from spacy.scorer import Scorer`

* Add parser training script 2015-01-09 20:53:26 +03:00
* Update input corruption method to work with lists as well as trings 2015-06-05 20:33:32 +03:00			`def _corrupt(c, noise_level):`
* Tmp commit 2015-05-24 03:50:14 +03:00			`if random.random() >= noise_level:`
			`return c`
			`elif c == ' ':`
			`return '\n'`
			`elif c == '\n':`
			`return ' '`
			`elif c in ['.', "'", "!", "?"]:`
			`return ''`
			`else:`
			`return c.lower()`


* Update input corruption method to work with lists as well as trings 2015-06-05 20:33:32 +03:00			`def add_noise(orig, noise_level):`
			`if random.random() >= noise_level:`
			`return orig`
			`elif type(orig) == list:`
			`corrupted = [_corrupt(word, noise_level) for word in orig]`
			`corrupted = [w for w in corrupted if w]`
			`return corrupted`
			`else:`
			`return ''.join(_corrupt(c, noise_level) for c in orig)`


* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`else:`
* Fix bug in train.py 2015-05-31 07:49:06 +03:00			`tokens = nlp.tokenizer(raw_text)`
* Fix train.py 2015-06-05 16:50:24 +03:00			`nlp.tagger(tokens)`
* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00			`nlp.entity(tokens)`
			`nlp.parser(tokens)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`gold = GoldParse(tokens, annot_tuples)`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`scorer.score(tokens, gold, verbose=verbose)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00

* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`def _merge_sents(sents):`
			`m_deps = [[], [], [], [], [], []]`
			`m_brackets = []`
			`i = 0`
			`for (ids, words, tags, heads, labels, ner), brackets in sents:`
			`m_deps[0].extend(id_ + i for id_ in ids)`
			`m_deps[1].extend(words)`
			`m_deps[2].extend(tags)`
			`m_deps[3].extend(head + i for head in heads)`
			`m_deps[4].extend(labels)`
			`m_deps[5].extend(ner)`
			`m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)`
			`i += len(ids)`
			`return [(m_deps, m_brackets)]`

* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00
			`def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',`
			`seed=0, gold_preproc=False, n_sents=0, corruption_level=0,`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`beam_width=1, verbose=False,`
			`use_orig_arc_eager=False):`
* Add parser training script 2015-01-09 20:53:26 +03:00			`dep_model_dir = path.join(model_dir, 'deps')`
			`pos_model_dir = path.join(model_dir, 'pos')`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Add parser training script 2015-01-09 20:53:26 +03:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`os.mkdir(dep_model_dir)`
			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`os.mkdir(ner_model_dir)`

			`setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)`

* Add parser training script 2015-01-09 20:53:26 +03:00			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Wire up beam-width command line argument 2015-06-02 01:54:12 +03:00			`labels=Language.ParserTransitionSystem.get_labels(gold_tuples),`
* Fix bugs in new greedy/beam parser 2015-06-02 03:01:33 +03:00			`beam_width=beam_width)`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Fix bugs in new greedy/beam parser 2015-06-02 03:01:33 +03:00			`labels=Language.EntityTransitionSystem.get_labels(gold_tuples),`
* Fix train.py 2015-06-05 16:50:24 +03:00			`beam_width=0)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"`
* Add parser training script 2015-01-09 20:53:26 +03:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`loss = 0`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`sents = _merge_sents(sents)`
			`for annot_tuples, ctnt in sents:`
* Skip sentences of length 1 in training 2015-06-05 03:29:03 +03:00			`if len(annot_tuples[1]) == 1:`
			`continue`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`score_model(scorer, nlp, raw_text, annot_tuples,`
			`verbose=verbose if itn >= 2 else False)`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Update input corruption method to work with lists as well as trings 2015-06-05 20:33:32 +03:00			`words = add_noise(annot_tuples[1], corruption_level)`
			`tokens = nlp.tokenizer.tokens_from_list(words)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`else:`
* Update input corruption method to work with lists as well as trings 2015-06-05 20:33:32 +03:00			`raw_text = add_noise(raw_text, corruption_level)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer(raw_text)`
* Clean up train.py, removing unused tag jackknifing code 2015-06-05 16:01:28 +03:00			`nlp.tagger(tokens)`
* Remove projectivity constraint in train.py, but raise Exception if non-projective sentence is encountered, since we've told GoldParse to projectivize 2015-06-23 06:04:46 +03:00			`gold = GoldParse(tokens, annot_tuples, make_projective=True)`
* Raise exception on non-projective input 2015-06-23 01:01:55 +03:00			`if not gold.is_projective:`
			`raise Exception(`
			`"Non-projective sentence in training, after we should "`
			`"have enforced projectivity: %s" % annot_tuples`
			`)`
* Remove projectivity constraint in train.py, but raise Exception if non-projective sentence is encountered, since we've told GoldParse to projectivize 2015-06-23 06:04:46 +03:00			`loss += nlp.parser.train(tokens, gold)`
* Uncomment NER training 2015-06-17 00:36:54 +03:00			`nlp.entity.train(tokens, gold)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`nlp.tagger.train(tokens, gold.tags)`
* Ensure tagger and NER are trained, even if non-projective problem 2015-05-27 04:16:21 +03:00			`random.shuffle(gold_tuples)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`scorer.tags_acc,`
			`scorer.token_acc)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.parser.model.end_training()`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`nlp.entity.model.end_training()`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.tagger.model.end_training()`
* Ensure StringStore is dumped during training 2015-03-25 03:08:24 +03:00			`nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))`
* Add parser training script 2015-01-09 20:53:26 +03:00

* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,`
			`beam_width=None):`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`if beam_width is not None:`
			`nlp.parser.cfg.beam_width = beam_width`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`sents = _merge_sents(sents)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for annot_tuples, brackets in sents:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
* Uncomment NER training 2015-06-17 00:36:54 +03:00			`nlp.entity(tokens)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`nlp.parser(tokens)`
			`else:`
			`tokens = nlp(raw_text, merge_mwes=False)`
			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00

* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):`
			`nlp = Language(data_dir=model_dir)`
			`if beam_width is not None:`
			`nlp.parser.cfg.beam_width = beam_width`
			`gold_tuples = read_json_file(dev_loc)`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`scorer = Scorer()`
			`out_file = codecs.open(out_loc, 'w', 'utf8')`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`for raw_text, sents in gold_tuples:`
			`sents = _merge_sents(sents)`
			`for annot_tuples, brackets in sents:`
			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
* Uncomment NER training 2015-06-17 00:36:54 +03:00			`nlp.entity(tokens)`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`nlp.parser(tokens)`
			`else:`
			`tokens = nlp(raw_text, merge_mwes=False)`
			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=False)`
			`for t in tokens:`
			`out_file.write(`
			`'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`return scorer`


* Tmp commit 2015-02-23 22:05:04 +03:00			`@plac.annotations(`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 04:52:55 +03:00			`train_loc=("Location of training file or directory"),`
			`dev_loc=("Location of development file or directory"),`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`model_dir=("Location of output model directory",),`
			`eval_only=("Skip training, and only evaluate", "flag", "e", bool),`
* Tmp commit 2015-05-24 03:50:14 +03:00			`corruption_level=("Amount of noise to add to training data", "option", "c", float),`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`n_sents=("Number of training sentences", "option", "n", int),`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`n_iter=("Number of training iterations", "option", "i", int),`
* Wire up beam-width command line argument 2015-06-02 01:54:12 +03:00			`beam_width=("Number of candidates to maintain in the beam", "option", "k", int),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`debug=("Debug mode", "flag", "d", bool),`
			`use_orig_arc_eager=("Use the original, monotonic arc-eager system", "flag", "m", bool)`
* Tmp commit 2015-02-23 22:05:04 +03:00			`)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1,`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`eval_only=False, use_orig_arc_eager=False):`
* Use tree_arc_eager system as baseline in experiments 2015-06-15 09:23:43 +03:00			`if use_orig_arc_eager:`
			`English.ParserTransitionSystem = TreeArcEager`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`if not eval_only:`
			`gold_train = list(read_json_file(train_loc))`
			`train(English, gold_train, model_dir,`
			`feat_set='basic' if not debug else 'debug',`
			`gold_preproc=gold_preproc, n_sents=n_sents,`
			`corruption_level=corruption_level, n_iter=n_iter,`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`beam_width=beam_width, verbose=verbose,`
			`use_orig_arc_eager=use_orig_arc_eager)`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`#if out_loc:`
			`# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`scorer = evaluate(English, list(read_json_file(dev_loc)),`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`model_dir, gold_preproc=gold_preproc, verbose=verbose,`
			`beam_width=beam_width)`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`print 'TOK', 100-scorer.token_acc`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`print 'POS', scorer.tags_acc`
			`print 'UAS', scorer.uas`
			`print 'LAS', scorer.las`

			`print 'NER P', scorer.ents_p`
			`print 'NER R', scorer.ents_r`
			`print 'NER F', scorer.ents_f`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Add parser training script 2015-01-09 20:53:26 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`