spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config
from spacy.syntax.conll import read_docparse_file, read_json_file
from spacy.syntax.conll import GoldParse

from spacy.scorer import Scorer


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, n_sents=0):
    """Train the tagger, parser and entity recogniser on gold_tuples.

    The 'deps', 'pos' and 'ner' sub-directories of model_dir are deleted if
    present and recreated empty, then populated with the POS model set-up and
    the parser / NER config files. The label sets written to those configs
    are collected from the full gold_tuples, i.e. before any n_sents
    truncation is applied.

    Arguments:
        Language: pipeline class; instantiated with data_dir=model_dir and
            also queried for its parser/entity transition-system labels.
        gold_tuples: sequence of 4-item examples (raw text, segmented text,
            annotation tuples, and a fourth field that is not used here).
            It is shuffled in place after every iteration; when n_sents is
            not positive this is the very object the caller passed in.
        model_dir: directory that must already exist (os.mkdir is used for
            the sub-directories).
        n_iter: number of passes over the data.
        feat_set: feature-set name stored in the parser config.
        seed: stored in both config files; random itself is not seeded here.
        gold_preproc: if true, train on the pre-segmented text instead of
            the tokenizer's output for the raw text.
        n_sents: if positive, only the first n_sents examples are used.

    One line of scores is printed per iteration. Each example is scored with
    the current models before they are updated on it.
    """
    dep_model_dir, pos_model_dir, ner_model_dir = [
        path.join(model_dir, name) for name in ('deps', 'pos', 'ner')
    ]
    component_dirs = (dep_model_dir, pos_model_dir, ner_model_dir)
    # Clear out every stale directory first, then recreate them all.
    for component_dir in component_dirs:
        if path.exists(component_dir):
            shutil.rmtree(component_dir)
    for component_dir in component_dirs:
        os.mkdir(component_dir)

    tag_names = sorted(POS_TAGS.keys())
    setup_model_dir(tag_names, POS_TAGS, POS_TEMPLATES, pos_model_dir)

    parser_labels = Language.ParserTransitionSystem.get_labels(gold_tuples)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=parser_labels)
    entity_labels = Language.EntityTransitionSystem.get_labels(gold_tuples)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=entity_labels)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)

    print("Itn.\tUAS\tNER F.\tTag %")
    for itn in range(n_iter):
        scorer = Scorer()
        for raw_text, segmented_text, annot_tuples, _ in gold_tuples:
            # Score with the current models before they learn from this example.
            predicted = nlp(raw_text, merge_mwes=False)
            scorer.score(predicted, GoldParse(predicted, annot_tuples),
                         verbose=False)

            if gold_preproc:
                docs = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
            else:
                docs = [nlp.tokenizer(raw_text)]
            for doc in docs:
                gold = GoldParse(doc, annot_tuples)
                # The tagger is run before the parser update and trained last.
                nlp.tagger(doc)
                nlp.parser.train(doc, gold)
                if gold.ents:
                    nlp.entity.train(doc, gold)
                nlp.tagger.train(doc, gold.tags)

        summary = (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
        print('%d:\t%.3f\t%.3f\t%.3f' % summary)
        random.shuffle(gold_tuples)

    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    strings_loc = path.join(model_dir, 'vocab', 'strings.txt')
    nlp.vocab.strings.dump(strings_loc)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True):
    """Score the pipeline loaded from model_dir against gold_tuples.

    Each gold tuple is unpacked into four fields, of which only the raw text
    and the annotation tuples are used. Scoring on gold pre-processed input
    is not supported: gold_preproc must be falsy, which is enforced with an
    assert before anything is loaded.

    Returns the Scorer that accumulated the results; verbose is forwarded to
    every Scorer.score call.
    """
    assert not gold_preproc
    pipeline = Language(data_dir=model_dir)
    results = Scorer()
    for example in gold_tuples:
        raw_text, _segmented, annot_tuples, _brackets = example
        doc = pipeline(raw_text, merge_mwes=False)
        results.score(doc, GoldParse(doc, annot_tuples), verbose=verbose)
    return results


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the texts read from dev_loc and write one token per line.

    Arguments:
        Language: pipeline class, instantiated with data_dir=model_dir so the
            models in model_dir are the ones used (the same way train() and
            evaluate() construct their pipelines).
        dev_loc: path handed to read_docparse_file; each item it produces is
            unpacked as (raw_text, segmented_text, annot_tuples) and only
            raw_text is used.
        model_dir: directory the pipeline is loaded from.
        out_loc: output path, written as UTF-8. Each line holds the token's
            orth_, tag_, head orth_ and dep_ separated by tabs.

    Returns a freshly created Scorer. Nothing is scored in this function;
    the return value is kept only so existing callers are unaffected.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_docparse_file(dev_loc)
    scorer = Scorer()
    # The context manager guarantees the file is flushed and closed even if
    # parsing one of the documents raises.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            tokens = nlp(raw_text)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


def get_sents(json_dir, section):
    """Yield every item read from the JSON files of one data section.

    Arguments:
        json_dir: directory containing files named with a two-digit number,
            e.g. '02.json'.
        section: 'train' (files 02.json through 21.json) or 'dev'
            (22.json only).

    Yields whatever read_json_file produces for each file, in file order.

    Raises:
        ValueError: if section is neither 'train' nor 'dev'. Because this is
            a generator, the error surfaces on the first iteration rather
            than at call time.
    """
    if section == 'train':
        file_range = range(2, 22)
    elif section == 'dev':
        file_range = range(22, 23)
    else:
        raise ValueError(
            "Unknown section %r: expected 'train' or 'dev'" % (section,)
        )

    for i in file_range:
        # File names are zero-padded to two digits.
        loc = path.join(json_dir, '%02d.json' % i)
        for sent in read_json_file(loc):
            yield sent


# Command-line entry point (argument parsing is declared via plac below).
# Trains on the 'train' section found in json_dir, optionally writes parses
# to out_loc, then evaluates on the 'dev' section and prints the scores.
#
# NOTE(review): `dev_loc` in the write_parses call is neither a parameter nor
# a local of main. Unless a module-level `dev_loc` exists outside the visible
# source, running with -o raises NameError, and only after training has
# already completed. Needs a real path source before -o can be used.
@plac.annotations(
    json_dir=("Annotated JSON files directory",),
    model_dir=("Location of output model directory",),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool)
)
def main(json_dir, model_dir, n_sents=0, out_loc="", verbose=False,
         debug=False):
    # The debug flag only switches the parser feature set name.
    train(English, list(get_sents(json_dir, 'train')), model_dir,
          feat_set='basic' if not debug else 'debug',
          gold_preproc=False, n_sents=n_sents)
    if out_loc:
        # NOTE(review): `dev_loc` is undefined here -- see the note above.
        write_parses(English, dev_loc, model_dir, out_loc)
    scorer = evaluate(English, list(get_sents(json_dir, 'dev')),
                      model_dir, gold_preproc=False, verbose=verbose)
    # Report the attributes accumulated on the Scorer returned by evaluate().
    print 'TOK', scorer.mistokened
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


# Run main() through plac only when this file is executed as a script;
# plac builds the command-line interface from main's annotations.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 20:53:26 +03:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`

			`import os`
			`from os import path`
			`import shutil`
			`import codecs`
			`import random`

			`import plac`
			`import cProfile`
			`import pstats`

			`import spacy.util`
			`from spacy.en import English`
			`from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir`

			`from spacy.syntax.parser import GreedyParser`
* Fix standard conll file reading. Script needs refactoring. 2015-02-02 15:02:48 +03:00			`from spacy.syntax.parser import OracleError`
* Add parser training script 2015-01-09 20:53:26 +03:00			`from spacy.syntax.util import Config`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`from spacy.syntax.conll import read_docparse_file, read_json_file`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`from spacy.syntax.conll import GoldParse`
* Add parser training script 2015-01-09 20:53:26 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`from spacy.scorer import Scorer`

* Add parser training script 2015-01-09 20:53:26 +03:00
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 07:12:37 +03:00			`gold_preproc=False, n_sents=0):`
* Add parser training script 2015-01-09 20:53:26 +03:00			`dep_model_dir = path.join(model_dir, 'deps')`
			`pos_model_dir = path.join(model_dir, 'pos')`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Add parser training script 2015-01-09 20:53:26 +03:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`os.mkdir(dep_model_dir)`
			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`os.mkdir(ner_model_dir)`

			`setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)`

* Add parser training script 2015-01-09 20:53:26 +03:00			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`labels=Language.ParserTransitionSystem.get_labels(gold_tuples))`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`labels=Language.EntityTransitionSystem.get_labels(gold_tuples))`

* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00
			`print "Itn.\tUAS\tNER F.\tTag %"`
* Add parser training script 2015-01-09 20:53:26 +03:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`for raw_text, segmented_text, annot_tuples, ctnt in gold_tuples:`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`# Eval before train`
* Use merge_mwe=False in evaluation in train.py 2015-04-08 01:35:19 +03:00			`tokens = nlp(raw_text, merge_mwes=False)`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=False)`

* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00			`if gold_preproc:`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]`
* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00			`else:`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`sents = [nlp.tokenizer(raw_text)]`
			`for tokens in sents:`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`gold = GoldParse(tokens, annot_tuples)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`nlp.tagger(tokens)`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 07:12:37 +03:00			`nlp.parser.train(tokens, gold)`
* Clean up train.py 2015-04-15 07:02:04 +03:00			`if gold.ents:`
			`nlp.entity.train(tokens, gold)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`nlp.tagger.train(tokens, gold.tags)`
* Move scoring away from training. Does not support scoring on gold preproc. 2015-03-23 19:32:55 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)`
* Restore shuffling, and remove print statements from train.py 2015-05-07 23:52:27 +03:00			`random.shuffle(gold_tuples)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.parser.model.end_training()`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`nlp.entity.model.end_training()`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.tagger.model.end_training()`
* Ensure StringStore is dumped during training 2015-03-25 03:08:24 +03:00			`nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))`
* Add parser training script 2015-01-09 20:53:26 +03:00

* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`assert not gold_preproc`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`for raw_text, segmented_text, annot_tuples, brackets in gold_tuples:`
* Use merge_mwe=False in evaluation in train.py 2015-04-08 01:35:19 +03:00			`tokens = nlp(raw_text, merge_mwes=False)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`gold = GoldParse(tokens, annot_tuples)`
* Add verbose flag for Scorer, for debugging, and fix ent_strings bug 2015-03-11 09:27:22 +03:00			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00

* Add write_parses function 2015-03-20 03:14:20 +03:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
			`nlp = Language()`
			`gold_tuples = read_docparse_file(dev_loc)`
			`scorer = Scorer()`
			`out_file = codecs.open(out_loc, 'w', 'utf8')`
			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
			`tokens = nlp(raw_text)`
			`for t in tokens:`
			`out_file.write(`
			`'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`return scorer`


* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`def get_sents(json_dir, section):`
			`if section == 'train':`
			`file_range = range(2, 22)`
			`elif section == 'dev':`
			`file_range = range(22, 23)`

			`for i in file_range:`
			`sec = str(i)`
			`if len(sec) == 1:`
			`sec = '0' + sec`
			`loc = path.join(json_dir, sec + '.json')`
			`for sent in read_json_file(loc):`
			`yield sent`


* Tmp commit 2015-02-23 22:05:04 +03:00			`@plac.annotations(`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`json_dir=("Annotated JSON files directory",),`
* Tmp commit 2015-02-23 22:05:04 +03:00			`model_dir=("Location of output model directory",),`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`n_sents=("Number of training sentences", "option", "n", int),`
			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`debug=("Debug mode", "flag", "d", bool)`
* Tmp commit 2015-02-23 22:05:04 +03:00			`)`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`def main(json_dir, model_dir, n_sents=0, out_loc="", verbose=False,`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`debug=False):`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`train(English, list(get_sents(json_dir, 'train')), model_dir,`
			`feat_set='basic' if not debug else 'debug',`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 07:12:37 +03:00			`gold_preproc=False, n_sents=n_sents)`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`if out_loc:`
			`write_parses(English, dev_loc, model_dir, out_loc)`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`scorer = evaluate(English, list(get_sents(json_dir, 'dev')),`
			`model_dir, gold_preproc=False, verbose=verbose)`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`print 'TOK', scorer.mistokened`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`print 'POS', scorer.tags_acc`
			`print 'UAS', scorer.uas`
			`print 'LAS', scorer.las`

			`print 'NER P', scorer.ents_p`
			`print 'NER R', scorer.ents_r`
			`print 'NER F', scorer.ents_f`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Add parser training script 2015-01-09 20:53:26 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`