# spaCy/bin/parser/train_ud.py

from __future__ import unicode_literals, print_function
import plac
import json
import random
import pathlib

from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, TokenVectorEncoder
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model

# Optional GPU support: if cupy can be imported, announce it and switch thinc's
# Model over to CupyOps as soon as this module is loaded. When the import fails
# with ImportError, Model.ops is left untouched.
try:
    import cupy
    print("Using GPU")
    Model.ops = CupyOps()
except ImportError:
    pass

from thinc.neural import Model


# Probe for cupy so that main() can test `cupy is not None` before switching
# Model.ops to CupyOps. Only a failed import means "no GPU support"; anything
# else (KeyboardInterrupt, SystemExit, genuine bugs) must not be swallowed,
# so the handler is limited to ImportError.
try:
    import cupy
    from thinc.neural.ops import CupyOps
except ImportError:
    cupy = None


def read_conllx(loc, n=0, tag_map=None):
    """Read a CoNLL-U style file and yield one gold tuple per sentence.

    Sentences are separated by blank lines. Comment lines (starting with
    '#') are dropped, as are multi-word token ranges (ids such as "3-4")
    and empty nodes (ids such as "3.1"). Blocks that contain no token at
    all (for example a comment-only block, or an empty file) are skipped
    instead of raising.

    The fine-grained tag column of the file is ignored. Each token gets a
    "supertag" of the form pos + '__' + dep + '__' + morph instead. The
    'root' relation is renamed to 'ROOT', and a token whose head column is
    '0' is recorded as its own head. Ids and heads are converted to
    0-based indices.

    loc: path of the UTF-8 encoded file to read.
    n: maximum number of sentences to yield; values below 1 mean no limit.
    tag_map: optional mutable mapping. When given, every supertag is
        registered in it as {spacy.attrs.POS: pos}, so a caller can pass
        the tag map of the language being trained. When None (the
        default) nothing is registered.

    Yields (None, [[[ids, words, tags, heads, deps, ner], []]]) where each
    of the six inner lists has one entry per token and every ner entry is
    'O'.

    Raises ValueError if a token line does not have exactly ten
    tab-separated columns, or if its id or head is not an integer.
    """
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    n_sents = 0
    for sent in text.strip().split('\n\n'):
        tokens = []
        for line in sent.strip().split('\n'):
            # Sentence-level comments and stray blank lines carry no token.
            if not line or line.startswith('#'):
                continue
            id_, word, lemma, pos, tag, morph, head, dep, _1, \
            _2 = line.split('\t')
            # Ranges ("3-4") and empty nodes ("3.1") are not real tokens.
            if '-' in id_ or '.' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos + '__' + dep + '__' + morph
            if tag_map is not None:
                tag_map[tag] = {spacy.attrs.POS: pos}
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            continue
        # Transpose the per-token tuples into six parallel per-sentence lists.
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        n_sents += 1
        if n >= 1 and n_sents >= n:
            break


def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    """Run the encoder and parser over the given docs and score the result.

    vocab: vocabulary used to build the docs that are actually parsed.
    encoder: callable applied to each doc before parsing.
    parser: callable applied to each doc after the encoder.
    Xs: docs to evaluate; only their token texts are used.
    ys: gold parses, paired with Xs by position.
    verbose: passed through to Scorer.score.

    Returns the Scorer that accumulated the results of every (doc, gold)
    pair.
    """
    scorer = Scorer()
    for doc, gold in zip(Xs, ys):
        # Parse a new Doc built from the words alone rather than the Doc
        # that was passed in.
        doc = Doc(vocab, words=[w.text for w in doc])
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
    return scorer


def organize_data(vocab, train_sents):
    """Flatten read_conllx-style tuples into parallel Doc / GoldParse lists.

    vocab: vocabulary handed to every Doc that is created.
    train_sents: iterable of (anything, sentences) pairs, where each
        sentence is ((ids, words, tags, heads, deps, ner), anything).

    Returns (docs, golds): one Doc built from the words of each sentence
    and, at the same index, the GoldParse built from its tags, heads and
    deps.
    """
    docs = []
    golds = []
    for _, sentences in train_sents:
        for annotations, _ in sentences:
            ids, words, tags, heads, deps, ner = annotations
            sentence_doc = Doc(vocab, words=words)
            sentence_gold = GoldParse(sentence_doc, tags=tags, heads=heads,
                                      deps=deps)
            docs.append(sentence_doc)
            golds.append(sentence_gold)
    return docs, golds


def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a token encoder, tagger and dependency parser on a UD treebank.

    lang_name: language code used to look up the language class.
    train_loc: path of the training file, read with read_conllx.
    dev_loc: path of the development file, read with read_conllx.
    model_dir: output directory; it and its 'deps', 'pos' and 'vocab'
        sub-directories are created when missing.
    clusters_loc: optional path of a "cluster word freq" file used to set
        lexeme cluster ids.

    Prints one progress line per epoch and a final line with UAS, LAS and
    tag accuracy on the development data.
    """
    if cupy is not None:
        Model.ops = CupyOps()
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    # Make sure every supertag in the data is known to the tag map of the
    # language that is actually being trained, before the vocab is created
    # from those defaults. The coarse POS is the part of the supertag before
    # the first '__'.
    for _, doc_sents in train_sents + dev_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for tag in tags:
                LangClass.Defaults.tag_map[tag] = {
                    spacy.attrs.POS: tag.split('__', 1)[0]}
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    for directory in (model_dir, model_dir / 'deps', model_dir / 'pos'):
        if not directory.exists():
            directory.mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    vocab_dir = model_dir / 'vocab'
    if not vocab_dir.exists():
        vocab_dir.mkdir()
    elif (vocab_dir / 'strings.json').exists():
        # Reuse a previously saved vocab; lexemes are only loaded together
        # with the strings they refer to.
        with (vocab_dir / 'strings.json').open() as file_:
            vocab.strings.load(file_)
        if (vocab_dir / 'lexemes.bin').exists():
            vocab.load_lexemes(vocab_dir / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    # Skip lines that do not have exactly three fields.
                    continue
                lex = vocab[word]
                # The cluster is a bit string; it is reversed before being
                # parsed as a base-2 integer.
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    # Per-epoch progress is measured on at most 1000 dev sentences; the final
    # evaluation below uses the whole dev set.
    progress_Xs = dev_Xs[:1000]
    progress_ys = dev_ys[:1000]
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        parser.begin_training(docs, ys)
        # One running loss total per epoch; the last entry is the current one.
        nn_loss = [0.]
        def track_progress():
            # NOTE(review): training starts from encoder.model but the averaged
            # parameters are applied through encoder.tagger - confirm that
            # TokenVectorEncoder exposes both attributes.
            with encoder.tagger.use_params(optimizer.averages):
                scorer = score_model(vocab, encoder, parser,
                                     progress_Xs, progress_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 10
        for docs, golds in trainer.iterate(Xs, ys):
            # Train on new Docs built from the words alone.
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            for doc, gold in zip(docs, golds):
                tagger.update(doc, gold)
            d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
            # Pass the parser's gradient back into the encoder.
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
            nn_loss[-1] += loss
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
    # nn_loss gains one entry per finished epoch on top of its initial one.
    print('%d:\t%.3f\t%.3f\t%.3f' % (len(nn_loss) - 1, scorer.uas, scorer.las, scorer.tags_acc))


# Script entry point: when run directly (not imported), let plac build the
# command-line interface from main's signature and call it.
if __name__ == '__main__':
    plac.call(main)
Gradients look correct 2017-05-06 17:47:15 +03:00			`from __future__ import unicode_literals, print_function`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`import plac`
			`import json`
			`import random`
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00			`import pathlib`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`from spacy.tokens import Doc`
			`from spacy.syntax.nonproj import PseudoProjectivity`
			`from spacy.language import Language`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`from spacy.gold import GoldParse`
			`from spacy.tagger import Tagger`
Gradients look correct 2017-05-06 17:47:15 +03:00			`from spacy.pipeline import DependencyParser, TokenVectorEncoder`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`from spacy.syntax.parser import get_templates`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`from spacy.syntax.arc_eager import ArcEager`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`from spacy.scorer import Scorer`
Improve train_ud script 2017-01-09 18:53:46 +03:00			`from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP`
* Work around get_lex_attr bug introduced during German parsing 2016-05-23 13:53:00 +03:00			`import spacy.attrs`
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00			`import io`
Tmp 2017-05-07 15:31:09 +03:00			`from thinc.neural.ops import CupyOps`
			`from thinc.neural import Model`

			`try:`
			`import cupy`
			`print("Using GPU")`
			`Model.ops = CupyOps()`
			`except ImportError:`
			`pass`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00
Tmp GPU code 2017-05-07 19:04:24 +03:00			`from thinc.neural import Model`


			`try:`
			`import cupy`
			`from thinc.neural.ops import CupyOps`
			`except:`
			`cupy = None`

* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00			`def read_conllx(loc, n=0):`
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00			`with io.open(loc, 'r', encoding='utf8') as file_:`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`text = file_.read()`
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00			`i = 0`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`for sent in text.strip().split('\n\n'):`
			`lines = sent.strip().split('\n')`
			`if lines:`
* Work around get_lex_attr bug introduced during German parsing 2016-05-23 13:53:00 +03:00			`while lines[0].startswith('#'):`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`lines.pop(0)`
			`tokens = []`
			`for line in lines:`
Split CONLLX file using tabs and not default split separators 2017-03-22 01:00:13 +03:00			`id_, word, lemma, pos, tag, morph, head, dep, _1, \`
			`_2 = line.split('\t')`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`if '-' in id_ or '.' in id_:`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`continue`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`try:`
			`id_ = int(id_) - 1`
			`head = (int(head) - 1) if head != '0' else id_`
working residual net 2017-05-07 04:57:26 +03:00			`dep = 'ROOT' if dep == 'root' else dep #'unlabelled'`
Add dep to supertag. 2017-05-08 15:50:01 +03:00			`tag = pos+'__'+dep+'__'+morph`
			`Spanish.Defaults.tag_map[tag] = {POS: pos}`
			`tokens.append((id_, word, tag, head, dep, 'O'))`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`except:`
			`raise`
			`tuples = [list(t) for t in zip(*tokens)]`
			`yield (None, [[tuples, []]])`
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00			`i += 1`
			`if n >= 1 and i >= n:`
			`break`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00

Add dep to supertag. 2017-05-08 15:50:01 +03:00			`def score_model(vocab, encoder, parser, Xs, ys, verbose=False):`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`scorer = Scorer()`
Gradients look correct 2017-05-06 17:47:15 +03:00			`correct = 0.`
			`total = 0.`
			`for doc, gold in zip(Xs, ys):`
			`doc = Doc(vocab, words=[w.text for w in doc])`
			`encoder(doc)`
			`parser(doc)`
			`PseudoProjectivity.deprojectivize(doc)`
			`scorer.score(doc, gold, verbose=verbose)`
			`for token, tag in zip(doc, gold.tags):`
Add dep to supertag. 2017-05-08 15:50:01 +03:00			`if '_' in token.tag_:`
			`univ_guess, _ = token.tag_.split('_', 1)`
			`else:`
			`univ_guess = ''`
Gradients look correct 2017-05-06 17:47:15 +03:00			`univ_truth, _ = tag.split('_', 1)`
			`correct += univ_guess == univ_truth`
			`total += 1`
			`return scorer`


			`def organize_data(vocab, train_sents):`
			`Xs = []`
			`ys = []`
			`for _, doc_sents in train_sents:`
			`for (ids, words, tags, heads, deps, ner), _ in doc_sents:`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`doc = Doc(vocab, words=words)`
			`gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`Xs.append(doc)`
			`ys.append(gold)`
			`return Xs, ys`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00

Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):`
Tmp GPU code 2017-05-07 19:04:24 +03:00			`if cupy is not None:`
			`Model.ops = CupyOps()`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`LangClass = spacy.util.get_lang_class(lang_name)`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`train_sents = list(read_conllx(train_loc))`
working residual net 2017-05-07 04:57:26 +03:00			`dev_sents = list(read_conllx(dev_loc))`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`train_sents = PseudoProjectivity.preprocess_training_data(train_sents)`
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`actions = ArcEager.get_actions(gold_parses=train_sents)`
			`features = get_templates('basic')`
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00			`model_dir = pathlib.Path(model_dir)`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`if not model_dir.exists():`
			`model_dir.mkdir()`
Update train_ud.py Create deps folder if it doesn't exist. 2017-01-09 02:55:44 +03:00			`if not (model_dir / 'deps').exists():`
			`(model_dir / 'deps').mkdir()`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`if not (model_dir / 'pos').exists():`
			`(model_dir / 'pos').mkdir()`
Improve train_ud script 2017-01-09 18:53:46 +03:00			`with (model_dir / 'deps' / 'config.json').open('wb') as file_:`
			`file_.write(`
			`json.dumps(`
			`{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00
			`vocab = LangClass.Defaults.create_vocab()`
			`if not (model_dir / 'vocab').exists():`
			`(model_dir / 'vocab').mkdir()`
			`else:`
			`if (model_dir / 'vocab' / 'strings.json').exists():`
			`with (model_dir / 'vocab' / 'strings.json').open() as file_:`
			`vocab.strings.load(file_)`
			`if (model_dir / 'vocab' / 'lexemes.bin').exists():`
			`vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')`

			`if clusters_loc is not None:`
			`clusters_loc = pathlib.Path(clusters_loc)`
			`with clusters_loc.open() as file_:`
			`for line in file_:`
			`try:`
			`cluster, word, freq = line.split()`
			`except ValueError:`
			`continue`
			`lex = vocab[word]`
			`lex.cluster = int(cluster[::-1], 2)`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`# Populate vocab`
			`for _, doc_sents in train_sents:`
			`for (ids, words, tags, heads, deps, ner), _ in doc_sents:`
			`for word in words:`
			`_ = vocab[word]`
Use unicode literals in train_ud 2016-11-26 02:45:45 +03:00			`for dep in deps:`
			`_ = vocab[dep]`
			`for tag in tags:`
			`_ = vocab[tag]`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`if vocab.morphology.tag_map:`
Improve train_ud script 2017-01-09 18:53:46 +03:00			`for tag in tags:`
Update train_ud for Universal Dependencies 2 2017-03-17 01:08:15 +03:00			`assert tag in vocab.morphology.tag_map, repr(tag)`
			`tagger = Tagger(vocab)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`encoder = TokenVectorEncoder(vocab)`
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00			`parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)`

Gradients look correct 2017-05-06 17:47:15 +03:00			`Xs, ys = organize_data(vocab, train_sents)`
working residual net 2017-05-07 04:57:26 +03:00			`dev_Xs, dev_ys = organize_data(vocab, dev_sents)`
Tmp 2017-05-07 15:31:09 +03:00			`Xs = Xs`
			`ys = ys`
			`dev_Xs = dev_Xs[:1000]`
			`dev_ys = dev_ys[:1000]`
Gradients look correct 2017-05-06 17:47:15 +03:00			`with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):`
			`docs = list(Xs)`
			`for doc in docs:`
			`encoder(doc)`
			`parser.begin_training(docs, ys)`
			`nn_loss = [0.]`
			`def track_progress():`
working residual net 2017-05-07 04:57:26 +03:00			`with encoder.tagger.use_params(optimizer.averages):`
			`scorer = score_model(vocab, encoder, tagger, parser, dev_Xs, dev_ys)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`itn = len(nn_loss)`
			`print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))`
			`nn_loss.append(0.)`
			`trainer.each_epoch.append(track_progress)`
Tmp GPU code 2017-05-07 19:04:24 +03:00			`trainer.batch_size = 24`
Tmp 2017-05-07 15:31:09 +03:00			`trainer.nb_epoch = 10`
			`for docs, golds in trainer.iterate(Xs, ys):`
Gradients look correct 2017-05-06 17:47:15 +03:00			`docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]`
			`tokvecs, upd_tokvecs = encoder.begin_update(docs)`
			`for doc, tokvec in zip(docs, tokvecs):`
			`doc.tensor = tokvec`
			`for doc, gold in zip(docs, golds):`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`tagger.update(doc, gold)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)`
			`upd_tokvecs(d_tokvecs, sgd=optimizer)`
Tmp 2017-05-07 15:31:09 +03:00			`encoder.update(docs, golds, sgd=optimizer)`
Gradients look correct 2017-05-06 17:47:15 +03:00			`nn_loss[-1] += loss`
Use specific language class instead of base Language class 2017-03-22 01:18:54 +03:00			`nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`nlp.end_training(model_dir)`
Fix train_ud script, which trains models from the Universal Dependencies format. 2016-11-25 20:19:33 +03:00			`scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))`
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00			`print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))`
Improve printing in train_ud script 2017-03-11 20:11:05 +03:00
* Add script to train models off the UD treebanks. Note that the UD data is restricted to research purposes only, and should only be used to train models for academic experiments. 2015-10-08 04:00:11 +03:00
			`if __name__ == '__main__':`
			`plac.call(main)`