spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def add_noise(c, noise_level):
    """Randomly corrupt a single character.

    One draw from ``random.random()`` gates the corruption: when the draw is
    greater than or equal to ``noise_level`` the character is returned
    untouched. Otherwise:

    * a space becomes a newline,
    * a newline becomes a space,
    * ``.``, ``'``, ``!`` and ``?`` are dropped (empty string returned),
    * anything else is lower-cased.
    """
    leave_untouched = random.random() >= noise_level
    if leave_untouched:
        return c
    if c == ' ':
        return '\n'
    if c == '\n':
        return ' '
    if c in ('.', "'", '!', '?'):
        return ''
    return c.lower()


def score_model(scorer, nlp, raw_text, annot_tuples):
    """Analyse one example with ``nlp`` and feed the result to ``scorer``.

    When ``raw_text`` is available the whole pipeline is run on it (with
    ``merge_mwes=False``). When it is ``None`` the tokens are built from the
    gold-standard words in ``annot_tuples[1]`` and the tagger, entity
    recogniser and parser are applied to them, in that order. The prediction
    is then compared against a ``GoldParse`` built from ``annot_tuples``.
    """
    if raw_text is not None:
        tokens = nlp(raw_text, merge_mwes=False)
    else:
        # No raw text: start from the gold tokenisation instead.
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
        nlp.tagger(tokens)
        nlp.entity(tokens)
        nlp.parser(tokens)
    gold_parse = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold_parse, verbose=False)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]
        

def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, n_sents=0, corruption_level=0):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples))

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)

    print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                else:
                    tokens = nlp.tokenizer(raw_text)
                gold = GoldParse(tokens, annot_tuples)
                nlp.tagger(tokens)
                if gold.is_projective:
                    loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                               scorer.tags_acc,
                                               scorer.token_acc)
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False):
    """Score the model stored in ``model_dir`` against ``gold_tuples``.

    ``gold_tuples`` yields ``(raw_text, sents)`` pairs, ``sents`` being a
    list of ``(annot_tuples, brackets)`` pairs. With ``gold_preproc`` the raw
    text is discarded and every gold sentence is analysed from its gold
    tokens (tagger, entity recogniser, parser, in that order). Otherwise the
    sentences of a paragraph are merged into one example and the full
    pipeline is run on the raw text; if the raw text is ``None`` the merged
    gold tokens are used instead.

    Returns the ``Scorer`` that accumulated the results; ``verbose`` is
    forwarded to ``Scorer.score``.
    """
    pipeline = Language(data_dir=model_dir)
    results = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, _brackets in sents:
            if raw_text is not None:
                tokens = pipeline(raw_text, merge_mwes=False)
            else:
                tokens = pipeline.tokenizer.tokens_from_list(annot_tuples[1])
                pipeline.tagger(tokens)
                pipeline.entity(tokens)
                pipeline.parser(tokens)
            gold_parse = GoldParse(tokens, annot_tuples)
            results.score(tokens, gold_parse, verbose=verbose)
    return results


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the documents at ``dev_loc`` and write one token per line.

    Each output line is tab-separated: token text, tag, text of the token's
    head, dependency label. The file at ``out_loc`` is written as UTF-8.

    Args:
        Language: Pipeline class; instantiated with ``data_dir=model_dir`` so
            the model that was just trained is the one being used.
        dev_loc: Location passed to ``read_json_file``; it yields
            ``(raw_text, sents)`` pairs, ``sents`` being a list of
            ``(annot_tuples, brackets)`` pairs.
        model_dir: Directory holding the trained model.
        out_loc: Path of the file to write.

    Returns:
        A fresh ``Scorer``. Nothing is scored here; the object is returned
        only to keep the function's original return value.
    """
    # Load from model_dir, as train() and evaluate() do, instead of falling
    # back to whatever Language() loads by default.
    nlp = Language(data_dir=model_dir)
    scorer = Scorer()
    # The context manager guarantees the output is flushed and closed, even
    # if parsing raises part-way through.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in read_json_file(dev_loc):
            if raw_text is None:
                # No raw text for this paragraph: analyse each sentence from
                # its gold tokens (annot_tuples[1] holds the words).
                docs = []
                for annot_tuples, _brackets in sents:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                    docs.append(tokens)
            else:
                docs = [nlp(raw_text)]
            for tokens in docs:
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    model_dir=("Location of output model directory",),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool)
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False):
    gold_train = list(read_json_file(train_loc))
    train(English, gold_train, model_dir,
          feat_set='basic' if not debug else 'debug',
          gold_preproc=gold_preproc, n_sents=n_sents,
          corruption_level=corruption_level, n_iter=n_iter)
    if out_loc:
        write_parses(English, dev_loc, model_dir, out_loc)
    scorer = evaluate(English, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print 'TOK', 100-scorer.token_acc
    print 'POS', scorer.tags_acc
    print 'UAS', scorer.uas
    print 'LAS', scorer.las

    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


# Script entry point: only when executed directly (not on import), hand
# main() to plac, which supplies its arguments from the command line.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 20:53:26 +03:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`

			`import os`
			`from os import path`
			`import shutil`
			`import codecs`
			`import random`

			`import plac`
			`import cProfile`
			`import pstats`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`import re`
* Add parser training script 2015-01-09 20:53:26 +03:00
			`import spacy.util`
			`from spacy.en import English`
			`from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir`

			`from spacy.syntax.parser import GreedyParser`
* Fix standard conll file reading. Script needs refactoring. 2015-02-02 15:02:48 +03:00			`from spacy.syntax.parser import OracleError`
* Add parser training script 2015-01-09 20:53:26 +03:00			`from spacy.syntax.util import Config`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`from spacy.gold import read_json_file`
			`from spacy.gold import GoldParse`
* Add parser training script 2015-01-09 20:53:26 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`from spacy.scorer import Scorer`

* Add parser training script 2015-01-09 20:53:26 +03:00
* Tmp commit 2015-05-24 03:50:14 +03:00			`def add_noise(c, noise_level):`
			`if random.random() >= noise_level:`
			`return c`
			`elif c == ' ':`
			`return '\n'`
			`elif c == '\n':`
			`return ' '`
			`elif c in ['.', "'", "!", "?"]:`
			`return ''`
			`else:`
			`return c.lower()`


* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`def score_model(scorer, nlp, raw_text, annot_tuples):`
			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
			`nlp.entity(tokens)`
			`nlp.parser(tokens)`
			`else:`
			`tokens = nlp(raw_text, merge_mwes=False)`
			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=False)`


* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`def _merge_sents(sents):`
			`m_deps = [[], [], [], [], [], []]`
			`m_brackets = []`
			`i = 0`
			`for (ids, words, tags, heads, labels, ner), brackets in sents:`
			`m_deps[0].extend(id_ + i for id_ in ids)`
			`m_deps[1].extend(words)`
			`m_deps[2].extend(tags)`
			`m_deps[3].extend(head + i for head in heads)`
			`m_deps[4].extend(labels)`
			`m_deps[5].extend(ner)`
			`m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)`
			`i += len(ids)`
			`return [(m_deps, m_brackets)]`


* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 17:38:54 +03:00			`def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,`
* Tmp commit 2015-05-24 03:50:14 +03:00			`gold_preproc=False, n_sents=0, corruption_level=0):`
* Add parser training script 2015-01-09 20:53:26 +03:00			`dep_model_dir = path.join(model_dir, 'deps')`
			`pos_model_dir = path.join(model_dir, 'pos')`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Add parser training script 2015-01-09 20:53:26 +03:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`os.mkdir(dep_model_dir)`
			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`os.mkdir(ner_model_dir)`

			`setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)`

* Add parser training script 2015-01-09 20:53:26 +03:00			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`labels=Language.ParserTransitionSystem.get_labels(gold_tuples))`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 08:46:53 +03:00			`labels=Language.EntityTransitionSystem.get_labels(gold_tuples))`

* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"`
* Add parser training script 2015-01-09 20:53:26 +03:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`loss = 0`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`sents = _merge_sents(sents)`
			`for annot_tuples, ctnt in sents:`
			`score_model(scorer, nlp, raw_text, annot_tuples)`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`else:`
			`tokens = nlp.tokenizer(raw_text)`
			`gold = GoldParse(tokens, annot_tuples)`
			`nlp.tagger(tokens)`
			`if gold.is_projective:`
			`loss += nlp.parser.train(tokens, gold)`
			`nlp.entity.train(tokens, gold)`
			`nlp.tagger.train(tokens, gold.tags)`
* Ensure tagger and NER are trained, even if non-projective problem 2015-05-27 04:16:21 +03:00			`random.shuffle(gold_tuples)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`scorer.tags_acc,`
			`scorer.token_acc)`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.parser.model.end_training()`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`nlp.entity.model.end_training()`
* Add parser training script 2015-01-09 20:53:26 +03:00			`nlp.tagger.model.end_training()`
* Ensure StringStore is dumped during training 2015-03-25 03:08:24 +03:00			`nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))`
* Add parser training script 2015-01-09 20:53:26 +03:00

* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False):`
* Respect the model_dir input parameter to train.py 2015-04-08 23:48:26 +03:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`scorer = Scorer()`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`sents = _merge_sents(sents)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`for annot_tuples, brackets in sents:`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if raw_text is None:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
			`nlp.entity(tokens)`
			`nlp.parser(tokens)`
			`else:`
			`tokens = nlp(raw_text, merge_mwes=False)`
			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 08:17:12 +03:00

* Add write_parses function 2015-03-20 03:14:20 +03:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
			`nlp = Language()`
			`gold_tuples = read_docparse_file(dev_loc)`
			`scorer = Scorer()`
			`out_file = codecs.open(out_loc, 'w', 'utf8')`
			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
			`tokens = nlp(raw_text)`
			`for t in tokens:`
			`out_file.write(`
			`'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`return scorer`


* Tmp commit 2015-02-23 22:05:04 +03:00			`@plac.annotations(`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 04:52:55 +03:00			`train_loc=("Location of training file or directory"),`
			`dev_loc=("Location of development file or directory"),`
* Tmp commit 2015-05-24 03:50:14 +03:00			`corruption_level=("Amount of noise to add to training data", "option", "c", float),`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),`
* Tmp commit 2015-02-23 22:05:04 +03:00			`model_dir=("Location of output model directory",),`
* Add write_parses function 2015-03-20 03:14:20 +03:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`n_sents=("Number of training sentences", "option", "n", int),`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`n_iter=("Number of training iterations", "option", "i", int),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 18:09:55 +03:00			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Ensure better separation between score printing and training in train.py 2015-03-24 06:25:38 +03:00			`debug=("Debug mode", "flag", "d", bool)`
* Tmp commit 2015-02-23 22:05:04 +03:00			`)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 22:35:02 +03:00			`def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`debug=False, corruption_level=0.0, gold_preproc=False):`
* Recomment in training in train.py 2015-05-28 23:40:26 +03:00			`gold_train = list(read_json_file(train_loc))`
			`train(English, gold_train, model_dir,`
			`feat_set='basic' if not debug else 'debug',`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`gold_preproc=gold_preproc, n_sents=n_sents,`
* Recomment in training in train.py 2015-05-28 23:40:26 +03:00			`corruption_level=corruption_level, n_iter=n_iter)`
* Fix gold_preproc flag in train.py 2015-05-30 06:23:02 +03:00			`if out_loc:`
			`write_parses(English, dev_loc, model_dir, out_loc)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 20:14:02 +03:00			`scorer = evaluate(English, list(read_json_file(dev_loc)),`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`model_dir, gold_preproc=gold_preproc, verbose=verbose)`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 18:21:25 +03:00			`print 'TOK', 100-scorer.token_acc`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`print 'POS', scorer.tags_acc`
			`print 'UAS', scorer.uas`
			`print 'LAS', scorer.las`

			`print 'NER P', scorer.ents_p`
			`print 'NER R', scorer.ents_r`
			`print 'NER F', scorer.ents_f`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Add parser training script 2015-01-09 20:53:26 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`