spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import io
import random

import plac
import re

import spacy.util
from spacy.en import English

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer

from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ['.', "'", "!", "?"]:
        return ''
    else:
        return c.lower()


def add_noise(orig, noise_level):
    """Randomly corrupt a raw string or a list of words.

    The whole input is first gated by one ``random.random()`` draw: a draw
    at or above ``noise_level`` returns ``orig`` itself, untouched. Otherwise
    every element is passed through ``_corrupt``, which makes its own
    per-element draw against the same ``noise_level``.

    Args:
        orig: Either a list of words or a string of characters.
        noise_level: Threshold compared against draws from
            ``random.random()``; 0 disables corruption entirely.

    Returns:
        ``orig`` unchanged, or a new list with the corrupted words (entries
        that became empty are dropped), or a new string built from the
        corrupted characters.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance rather than an exact type comparison, so list subclasses
    # are still treated as word lists and are not joined into one string.
    if isinstance(orig, list):
        corrupted = [_corrupt(word, noise_level) for word in orig]
        # A word that was corrupted into '' is removed from the sequence.
        return [word for word in corrupted if word]
    return ''.join(_corrupt(c, noise_level) for c in orig)


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Annotate one example with the current models and record its score.

    When ``raw_text`` is ``None`` the document is built from
    ``annot_tuples[1]`` via ``tokens_from_list``; otherwise ``raw_text`` is
    run through the tokenizer. The tagger, entity recognizer and parser are
    then applied, in that order, and the result is compared with a
    ``GoldParse`` built from ``annot_tuples``.

    Nothing is returned; the outcome is accumulated in ``scorer``.
    """
    if raw_text is not None:
        doc = nlp.tokenizer(raw_text)
    else:
        doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(doc)
    nlp.entity(doc)
    nlp.parser(doc)
    reference = GoldParse(doc, annot_tuples)
    scorer.score(doc, reference, verbose=verbose)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train the tagger, dependency parser and entity recognizer together.

    The ``deps``, ``ner`` and ``pos`` sub-directories of ``model_dir`` are
    removed if they exist and re-created empty, parser and NER config files
    are written, and the three models are then trained for ``n_iter`` passes
    over ``gold_tuples``. In every pass each example is first scored with the
    current models and only afterwards used for an update. One line of
    progress is printed per pass.

    Args:
        Language: Callable used to build the pipeline; it is invoked with
            ``data_dir=model_dir`` and the tagger, parser and entity
            components disabled.
        gold_tuples: List of ``(raw_text, sents)`` pairs. It is shuffled in
            place after every pass (when ``n_sents`` is positive the shuffle
            acts on a sliced copy instead).
        model_dir: Directory that receives the ``deps``, ``ner`` and ``pos``
            sub-directories; it must already exist.
        n_iter: Number of passes over the data.
        feat_set: Value stored as ``features`` in the parser config.
        seed: Value stored as ``seed`` in both config files.
        gold_preproc: If true, ``raw_text`` is ignored and every sentence is
            trained on separately from its word list; otherwise the
            sentences of an entry are merged and the raw text is tokenized.
        n_sents: If positive, only the first ``n_sents`` entries of
            ``gold_tuples`` are used.
        corruption_level: Noise level handed to ``add_noise`` for the
            training input (not for the scoring input).
        beam_width: Value stored as ``beam_width`` in the parser config.
        verbose: Forwarded to the scorer from the third pass onwards.
        use_orig_arc_eager: Accepted but not used by this function.

    Raises:
        Exception: If a ``GoldParse`` built with ``make_projective=True``
            still reports itself as non-projective.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    component_dirs = (dep_model_dir, ner_model_dir, pos_model_dir)
    # Discard any previous models first, then create the empty directories.
    for component_dir in component_dirs:
        if path.exists(component_dir):
            shutil.rmtree(component_dir)
    for component_dir in component_dirs:
        os.mkdir(component_dir)

    # Labels are collected from the full data, before the n_sents cut below.
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, _ in sents:
                # Examples with a single word are skipped entirely.
                if len(annot_tuples[1]) == 1:
                    continue
                # Score with the models as they stand before this update.
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    # Wrap in a 1-tuple: annot_tuples may itself be a tuple,
                    # and a bare tuple operand would be unpacked by '%' and
                    # fail with TypeError instead of reporting this error.
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % (annot_tuples,)
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Score a saved model against gold-standard data.

    A pipeline is loaded with ``Language(data_dir=model_dir)``. With
    ``gold_preproc`` set, each sentence is built from its word list and the
    tagger, parser and entity recognizer are applied in that order.
    Otherwise the sentences of an entry are merged and the raw text is run
    through the whole pipeline in one call.

    ``beam_width``, when not ``None``, overrides the parser's configured
    beam width. ``cand_preproc`` is accepted but not used here.

    Returns the ``Scorer`` holding the accumulated results.
    """
    pipeline = Language(data_dir=model_dir)
    if beam_width is not None:
        pipeline.parser.cfg.beam_width = beam_width
    results = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, _ in sents:
            if raw_text is not None:
                doc = pipeline(raw_text)
            else:
                doc = pipeline.tokenizer.tokens_from_list(annot_tuples[1])
                pipeline.tagger(doc)
                pipeline.parser(doc)
                pipeline.entity(doc)
            reference = GoldParse(doc, annot_tuples)
            results.score(doc, reference, verbose=verbose)
    return results


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the development data and write the analyses to ``out_loc``.

    For every non-space token one tab-separated line is written with the
    token index, its text, its tag, the text of its head and its dependency
    label. A blank line follows each sentence.

    Args:
        Language: Callable used to load the pipeline from ``model_dir``.
        dev_loc: Location handed to ``read_json_file``.
        model_dir: Directory the pipeline is loaded from.
        out_loc: Path of the UTF-8 text file to create or overwrite.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # The encoding has to be a keyword argument: the third positional
    # parameter of io.open is ``buffering``, so passing 'utf8' there raises
    # TypeError. The with-block also guarantees the file is flushed and
    # closed, including when parsing fails part-way through.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _ in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    """Command-line entry point: train (optionally), then evaluate.

    Unless ``eval_only`` is set, an English model is trained on the data at
    ``train_loc`` and saved under ``model_dir``; ``debug`` selects the
    'debug' feature set instead of 'basic'. If ``out_loc`` is non-empty the
    parses of the development data are written there. Finally the model is
    evaluated on ``dev_loc`` and the scores are printed.
    """
    if not eval_only:
        training_data = list(read_json_file(train_loc))
        feature_set = 'debug' if debug else 'basic'
        train(English, training_data, model_dir,
              feat_set=feature_set,
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    if out_loc:
        write_parses(English, dev_loc, model_dir, out_loc)
    dev_data = list(read_json_file(dev_loc))
    scorer = evaluate(English, dev_data, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    # (printed label, Scorer attribute), in output order.
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attribute in report:
        print(label, getattr(scorer, attribute))


# Run as a script: hand main to plac, which is given the argument
# annotations declared on main above.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 20:53:26 +03:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 05:52:35 +03:00			`from __future__ import print_function`
* Add parser training script 2015-01-09 20:53:26 +03:00
			`import os`
			`from os import path`
			`import shutil`
caught another codecs.open 2015-09-30 21:16:52 +03:00			`import io`
* Add parser training script 2015-01-09 20:53:26 +03:00			`import random`

			`import plac`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`import re`
* Add parser training script 2015-01-09 20:53:26 +03:00
			`import spacy.util`
			`from spacy.en import English`

			`from spacy.syntax.util import Config`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`from spacy.gold import read_json_file`
			`from spacy.gold import GoldParse`
* Add parser training script 2015-01-09 20:53:26 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`from spacy.scorer import Scorer`

* Update train.py for language-generic spaCy 2015-09-06 18:51:48 +03:00			`from spacy.syntax.arc_eager import ArcEager`
			`from spacy.syntax.ner import BiluoPushDown`
			`from spacy.tagger import Tagger`
			`from spacy.syntax.parser import Parser`

* Add parser training script 2015-01-09 20:53:26 +03:00
* Update input corruption method to work with lists as well as strings 2015-06-05 20:33:32 +03:00			`def _corrupt(c, noise_level):`
* Tmp commit 2015-05-24 03:50:14 +03:00			`if random.random() >= noise_level:`
			`return c`
			`elif c == ' ':`
			`return '\n'`
			`elif c == '\n':`
			`return ' '`
			`elif c in ['.', "'", "!", "?"]:`
			`return ''`
			`else:`
			`return c.lower()`


* Update input corruption method to work with lists as well as strings 2015-06-05 20:33:32 +03:00			`def add_noise(orig, noise_level):`
			`if random.random() >= noise_level:`
			`return orig`
			`elif type(orig) == list:`
			`corrupted = [_corrupt(word, noise_level) for word in orig]`
			`corrupted = [w for w in corrupted if w]`
			`return corrupted`
			`else:`
			`return ''.join(_corrupt(c, noise_level) for c in orig)`


* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`else:`
* Fix bug in train.py 2015-05-31 07:49:06 +03:00			`tokens = nlp.tokenizer(raw_text)`
* Fix train.py 2015-06-05 16:50:24 +03:00			`nlp.tagger(tokens)`
* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00			`nlp.entity(tokens)`
			`nlp.parser(tokens)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`gold = GoldParse(tokens, annot_tuples)`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`scorer.score(tokens, gold, verbose=verbose)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00

* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`def _merge_sents(sents):`
			`m_deps = [[], [], [], [], [], []]`
			`m_brackets = []`
			`i = 0`
			`for (ids, words, tags, heads, labels, ner), brackets in sents:`
			`m_deps[0].extend(id_ + i for id_ in ids)`
			`m_deps[1].extend(words)`
			`m_deps[2].extend(tags)`
			`m_deps[3].extend(head + i for head in heads)`
			`m_deps[4].extend(labels)`
			`m_deps[5].extend(ner)`
			`m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)`
			`i += len(ids)`
			`return [(m_deps, m_brackets)]`

* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00
			`def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',`
			`seed=0, gold_preproc=False, n_sents=0, corruption_level=0,`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`beam_width=1, verbose=False,`
			`use_orig_arc_eager=False):`
* Add parser training script 2015-01-09 20:53:26 +03:00			`dep_model_dir = path.join(model_dir, 'deps')`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Create POS model dir in training script 2015-09-08 16:36:23 +03:00			`pos_model_dir = path.join(model_dir, 'pos')`
* Add parser training script 2015-01-09 20:53:26 +03:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Create POS model dir in training script 2015-09-08 16:36:23 +03:00			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`os.mkdir(dep_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`os.mkdir(ner_model_dir)`
* Create POS model dir in training script 2015-09-08 16:36:23 +03:00			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00
* Add parser training script 2015-01-09 20:53:26 +03:00			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Update train.py for language-generic spaCy 2015-09-06 18:51:48 +03:00			`labels=ArcEager.get_labels(gold_tuples),`
* Fix bugs in new greedy/beam parser 2015-06-02 03:01:33 +03:00			`beam_width=beam_width)`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Update train.py for language-generic spaCy 2015-09-06 18:51:48 +03:00			`labels=BiluoPushDown.get_labels(gold_tuples),`
* Fix train.py 2015-06-05 16:50:24 +03:00			`beam_width=0)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Allow parser to jackknife POS tags before training. 2015-05-31 02:11:11 +03:00
* Update train.py for language-generic spaCy 2015-09-06 18:51:48 +03:00			`nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)`
			`nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())`
			`nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)`
			`nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 05:52:35 +03:00			`print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")`
* Add parser training script 2015-01-09 20:53:26 +03:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`loss = 0`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`sents = _merge_sents(sents)`
			`for annot_tuples, ctnt in sents:`
* Skip sentences of length 1 in training 2015-06-05 03:29:03 +03:00			`if len(annot_tuples[1]) == 1:`
			`continue`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 18:45:31 +03:00			`score_model(scorer, nlp, raw_text, annot_tuples,`
			`verbose=verbose if itn >= 2 else False)`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Update input corruption method to work with lists as well as strings 2015-06-05 20:33:32 +03:00			`words = add_noise(annot_tuples[1], corruption_level)`
			`tokens = nlp.tokenizer.tokens_from_list(words)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`else:`
* Update input corruption method to work with lists as well as strings 2015-06-05 20:33:32 +03:00			`raw_text = add_noise(raw_text, corruption_level)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer(raw_text)`
* Clean up train.py, removing unused tag jackknifing code 2015-06-05 16:01:28 +03:00			`nlp.tagger(tokens)`
* Remove projectivity constraint in train.py, but raise Exception if non-projective sentence is encountered, since we've told GoldParse to projectivize 2015-06-23 06:04:46 +03:00			`gold = GoldParse(tokens, annot_tuples, make_projective=True)`
* Raise exception on non-projective input 2015-06-23 01:01:55 +03:00			`if not gold.is_projective:`
			`raise Exception(`
			`"Non-projective sentence in training, after we should "`
			`"have enforced projectivity: %s" % annot_tuples`
			`)`
* Remove projectivity constraint in train.py, but raise Exception if non-projective sentence is encountered, since we've told GoldParse to projectivize 2015-06-23 06:04:46 +03:00			`loss += nlp.parser.train(tokens, gold)`
* Uncomment NER training 2015-06-17 00:36:54 +03:00			`nlp.entity.train(tokens, gold)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`nlp.tagger.train(tokens, gold.tags)`
* Ensure tagger and NER are trained, even if non-projective problem 2015-05-27 04:16:21 +03:00			`random.shuffle(gold_tuples)`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 05:52:35 +03:00			`print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,`
* Whitespace 2015-07-23 02:19:26 +03:00			`scorer.tags_acc,`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 05:52:35 +03:00			`scorer.token_acc))`
* Create POS model dir in training script 2015-09-08 16:36:23 +03:00			`print('end training')`
* Update train.py for language-generic spaCy 2015-09-06 18:51:48 +03:00			`nlp.end_training(model_dir)`
* Create POS model dir in training script 2015-09-08 16:36:23 +03:00			`print('done')`
* Add parser training script 2015-01-09 20:53:26 +03:00
* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,`
* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00			`beam_width=None, cand_preproc=None):`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`if beam_width is not None:`
			`nlp.parser.cfg.beam_width = beam_width`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`sents = _merge_sents(sents)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for annot_tuples, brackets in sents:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
			`nlp.parser(tokens)`
* Train after parsing, not before. 2015-11-11 20:43:52 +03:00			`nlp.entity(tokens)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`else:`
* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00			`tokens = nlp(raw_text)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00

* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`nlp = Language(data_dir=model_dir)`
			`gold_tuples = read_json_file(dev_loc)`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`scorer = Scorer()`
* Use io module instead of deprecated codecs module 2015-10-10 06:13:01 +03:00			`out_file = io.open(out_loc, 'w', 'utf8')`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`for raw_text, sents in gold_tuples:`
			`sents = _merge_sents(sents)`
			`for annot_tuples, brackets in sents:`
			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
* Uncomment NER training 2015-06-17 00:36:54 +03:00			`nlp.entity(tokens)`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 20:08:48 +03:00			`nlp.parser(tokens)`
			`else:`
* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00			`tokens = nlp(raw_text)`
			`#gold = GoldParse(tokens, annot_tuples)`
			`#scorer.score(tokens, gold, verbose=False)`
			`for sent in tokens.sents:`
			`for t in sent:`
			`if not t.is_space:`
			`out_file.write(`
			`'%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`out_file.write('\n')`
* Add write_parses function 2015-03-20 03:14:20 +03:00

* Tmp commit 2015-02-23 22:05:04 +03:00			`@plac.annotations(`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 04:52:55 +03:00			`train_loc=("Location of training file or directory"),`
			`dev_loc=("Location of development file or directory"),`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`model_dir=("Location of output model directory",),`
			`eval_only=("Skip training, and only evaluate", "flag", "e", bool),`
* Tmp commit 2015-05-24 03:50:14 +03:00			`corruption_level=("Amount of noise to add to training data", "option", "c", float),`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`n_sents=("Number of training sentences", "option", "n", int),`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`n_iter=("Number of training iterations", "option", "i", int),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Add toggle for OrigArcEager system 2015-06-14 21:28:14 +03:00			`debug=("Debug mode", "flag", "d", bool),`
* Tmp commit 2015-02-23 22:05:04 +03:00			`)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,`
* Fix redundant options in train.py 2015-07-17 23:38:05 +03:00			`debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):`
* Add more options to bin/parser/train 2015-06-06 00:49:26 +03:00			`if not eval_only:`
			`gold_train = list(read_json_file(train_loc))`
			`train(English, gold_train, model_dir,`
			`feat_set='basic' if not debug else 'debug',`
			`gold_preproc=gold_preproc, n_sents=n_sents,`
			`corruption_level=corruption_level, n_iter=n_iter,`
* Fix redundant options in train.py 2015-07-17 23:38:05 +03:00			`verbose=verbose)`
* Update bin/parser/train for printing output. 2015-10-06 02:35:22 +03:00			`if out_loc:`
			`write_parses(English, dev_loc, model_dir, out_loc)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`scorer = evaluate(English, list(read_json_file(dev_loc)),`
* Fix redundant options in train.py 2015-07-17 23:38:05 +03:00			`model_dir, gold_preproc=gold_preproc, verbose=verbose)`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 05:52:35 +03:00			`print('TOK', scorer.token_acc)`
			`print('POS', scorer.tags_acc)`
			`print('UAS', scorer.uas)`
			`print('LAS', scorer.las)`

			`print('NER P', scorer.ents_p)`
			`print('NER R', scorer.ents_r)`
			`print('NER F', scorer.ents_f)`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Add parser training script 2015-01-09 20:53:26 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`