Remove old, outdated files in /bin

2024-11-10 19:57:17 +03:00 · 2017-10-27 19:44:38 +02:00 · 2017-10-27 19:44:38 +02:00 · 5025d709e0
commit 5025d709e0
parent 9c89e2cdef
10 changed files with 0 additions and 1301 deletions
--- a/bin/get_freqs.py
+++ b/bin/get_freqs.py
@@ -1,93 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import unicode_literals, print_function
-
-import plac
-import joblib
-from os import path
-import os
-import bz2
-import ujson
-from preshed.counter import PreshCounter
-from joblib import Parallel, delayed
-import io
-
-from spacy.en import English
-from spacy.strings import StringStore
-from spacy.attrs import ORTH
-from spacy.tokenizer import Tokenizer
-from spacy.vocab import Vocab
-
-
-def iter_comments(loc):
-    with bz2.BZ2File(loc) as file_:
-        for line in file_:
-            yield ujson.loads(line)
-
-
-def count_freqs(input_loc, output_loc):
-    print(output_loc)
-    vocab = English.default_vocab(get_lex_attr=None)
-    tokenizer = Tokenizer.from_dir(vocab,
-                    path.join(English.default_data_dir(), 'tokenizer'))
-
-    counts = PreshCounter()
-    for json_comment in iter_comments(input_loc):
-        doc = tokenizer(json_comment['body'])
-        doc.count_by(ORTH, counts=counts)
-
-    with io.open(output_loc, 'w', 'utf8') as file_:
-        for orth, freq in counts:
-            string = tokenizer.vocab.strings[orth]
-            if not string.isspace():
-                file_.write('%d\t%s\n' % (freq, string))
-
-
-def parallelize(func, iterator, n_jobs):
-    Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)
-
-
-def merge_counts(locs, out_loc):
-    string_map = StringStore()
-    counts = PreshCounter()
-    for loc in locs:
-        with io.open(loc, 'r', encoding='utf8') as file_:
-            for line in file_:
-                freq, word = line.strip().split('\t', 1)
-                orth = string_map[word]
-                counts.inc(orth, int(freq))
-    with io.open(out_loc, 'w', encoding='utf8') as file_:
-        for orth, count in counts:
-            string = string_map[orth]
-            file_.write('%d\t%s\n' % (count, string))
-
-
-@plac.annotations(
-    input_loc=("Location of input file list"),
-    freqs_dir=("Directory for frequency files"),
-    output_loc=("Location for output file"),
-    n_jobs=("Number of workers", "option", "n", int),
-    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
-)
-def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
-    tasks = []
-    outputs = []
-    for input_path in open(input_loc):
-        input_path = input_path.strip()
-        if not input_path:
-            continue
-        filename = input_path.split('/')[-1]
-        output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
-        outputs.append(output_path)
-        if not path.exists(output_path) or not skip_existing:
-            tasks.append((input_path, output_path))
-
-    if tasks:
-        parallelize(count_freqs, tasks, n_jobs)
-
-    print("Merge")
-    merge_counts(outputs, output_loc)
-                
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/munge_ewtb.py
+++ b/bin/munge_ewtb.py
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-from __future__ import unicode_literals
-
-from xml.etree import cElementTree as ElementTree
-import json
-import re
-
-import plac
-from pathlib import Path
-from os import path
-
-
-escaped_tokens = {
-    '-LRB-': '(',
-    '-RRB-': ')',
-    '-LSB-': '[',
-    '-RSB-': ']',
-    '-LCB-': '{',
-    '-RCB-': '}',
-}
-
-def read_parses(parse_loc):
-    offset = 0
-    doc = []
-    for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):
-        parse = _adjust_token_ids(parse, offset)
-        offset += len(parse.split('\n'))
-        doc.append(parse)
-    return doc
-
-def _adjust_token_ids(parse, offset):
-    output = []
-    for line in parse.split('\n'):
-        pieces = line.split()
-        pieces[0] = str(int(pieces[0]) + offset)
-        pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
-        output.append('\t'.join(pieces))
-    return '\n'.join(output)
-
-
-def _fmt_doc(filename, paras):
-    return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
-
-
-def _fmt_para(raw, sents):
-    return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
-
-
-def _fmt_sent(sent):
-    return {
-        'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')],
-        'brackets': []}
-
-
-def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
-    head = int(head) - 1
-    id_ = int(id_) - 1
-    head = (head - id_) if head != -1 else 0
-    return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
-
-
-tags_re = re.compile(r'<[\w\?/][^>]+>')
-def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
-    ewtb_dir = Path(ewtb_dir)
-    out_dir = Path(out_dir)
-    if not out_dir.exists():
-        out_dir.mkdir()
-    for genre_dir in ewtb_dir.joinpath('data').iterdir():
-        #if 'answers' in str(genre_dir): continue
-        parse_dir = genre_dir.joinpath('penntree')
-        docs = []
-        for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
-            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
-            filename = filename.replace('.xml', '')
-            filename = filename.replace('.txt', '')
-            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
-            parses = read_parses(parse_loc)
-            source = source_loc.open().read().strip()
-            if 'answers' in str(genre_dir):
-                source = tags_re.sub('', source).strip()
-            docs.append(_fmt_doc(filename, [[source, parses]]))
-
-        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
-        with open(str(out_loc), 'w') as out_file:
-            out_file.write(json.dumps(docs, indent=4))
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/ner_tag.py
+++ b/bin/ner_tag.py
@@ -1,32 +0,0 @@
-import io
-import plac
-
-from spacy.en import English
-
-
-def main(text_loc):
-    with io.open(text_loc, 'r', encoding='utf8') as file_:
-        text = file_.read()
-    NLU = English()
-    for paragraph in text.split('\n\n'):
-        tokens = NLU(paragraph)
-
-        ent_starts = {}
-        ent_ends = {}
-        for span in tokens.ents:
-            ent_starts[span.start] = span.label_
-            ent_ends[span.end] = span.label_
-
-        output = []
-        for token in tokens:
-            if token.i in ent_starts:
-                output.append('<%s>' % ent_starts[token.i])
-            output.append(token.orth_)
-            if (token.i+1) in ent_ends:
-                output.append('</%s>' % ent_ends[token.i+1])
-        output.append('\n\n')
-    print ' '.join(output)
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/parser/conll_train.py
+++ b/bin/parser/conll_train.py
@@ -1,157 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import io
-import random
-import time
-import gzip
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.en import English
-from spacy.gold import GoldParse
-
-from spacy.syntax.util import Config
-from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.parser import Parser
-from spacy.scorer import Scorer
-from spacy.tagger import Tagger
-
-# Last updated for spaCy v0.97
-
-
-def read_conll(file_):
-    """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
-        ids = []
-        words = []
-        heads = []
-        labels = []
-        tags = []
-        for i, line in enumerate(sent_str.split('\n')):
-            word, pos_string, head_idx, label = _parse_line(line)
-            words.append(word)
-            if head_idx < 0:
-                head_idx = i
-            ids.append(i)
-            heads.append(head_idx)
-            labels.append(label)
-            tags.append(pos_string)
-        text = ' '.join(words)
-        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
-        sents.append((None, [(annot, [])]))
-    return sents
-
-
-def _parse_line(line):
-    pieces = line.split()
-    if len(pieces) == 4:
-        word, pos, head_idx, label = pieces
-        head_idx = int(head_idx)
-    elif len(pieces) == 15:
-        id_ = int(pieces[0].split('_')[-1])
-        word = pieces[1]
-        pos = pieces[4]
-        head_idx = int(pieces[8])-1
-        label = pieces[10]
-    else:
-        id_ = int(pieces[0].split('_')[-1])
-        word = pieces[1]
-        pos = pieces[4]
-        head_idx = int(pieces[6])-1
-        label = pieces[7]
-    if head_idx == 0:
-        label = 'ROOT'
-    return word, pos, head_idx, label
-
-        
-def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
-    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples, make_projective=False)
-    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
-          gold_preproc=False, force_gold=False):
-    dep_model_dir = path.join(model_dir, 'deps')
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(dep_model_dir):
-        shutil.rmtree(dep_model_dir)
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(dep_model_dir)
-    os.mkdir(pos_model_dir)
-
-    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                 labels=ArcEager.get_labels(gold_tuples))
-
-    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
-    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
-    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
- 
-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for _, sents in gold_tuples:
-            for annot_tuples, _ in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-
-                score_model(scorer, nlp, None, annot_tuples, verbose=False)
-
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
- 
-                loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                             scorer.tags_acc, scorer.token_acc))
-    print('end training')
-    nlp.end_training(model_dir)
-    print('done')
-
-
-@plac.annotations(
-    train_loc=("Location of CoNLL 09 formatted training file"),
-    dev_loc=("Location of CoNLL 09 formatted development file"),
-    model_dir=("Location of output model directory"),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_iter=("Number of training iterations", "option", "i", int),
-)
-def main(train_loc, dev_loc, model_dir, n_iter=15):
-    with io.open(train_loc, 'r', encoding='utf8') as file_:
-        train_sents = read_conll(file_)
-    if not eval_only:
-        train(English, train_sents, model_dir, n_iter=n_iter)
-    nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
-    scorer = Scorer()
-    for _, sents in dev_sents:
-        for annot_tuples, _ in sents:
-            score_model(scorer, nlp, None, annot_tuples)
-    print('TOK', 100-scorer.token_acc)
-    print('POS', scorer.tags_acc)
-    print('UAS', scorer.uas)
-    print('LAS', scorer.las)
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -1,187 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-from __future__ import print_function
-
-import os
-from os import path
-import shutil
-import io
-import random
-
-import plac
-import re
-
-import spacy.util
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-from spacy.gold import merge_sents
-
-from spacy.scorer import Scorer
-
-from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.ner import BiluoPushDown
-from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
-from spacy.syntax.nonproj import PseudoProjectivity
-
-
-def _corrupt(c, noise_level):
-    if random.random() >= noise_level:
-        return c
-    elif c == ' ':
-        return '\n'
-    elif c == '\n':
-        return ' '
-    elif c in ['.', "'", "!", "?"]:
-        return ''
-    else:
-        return c.lower()
-
-
-def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
-        return orig
-    elif type(orig) == list:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
-        corrupted = [w for w in corrupted if w]
-        return corrupted
-    else:
-        return ''.join(_corrupt(c, noise_level) for c in orig)
-
-
-def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
-    if raw_text is None:
-        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    else:
-        tokens = nlp.tokenizer(raw_text)
-    nlp.tagger(tokens)
-    nlp.entity(tokens)
-    nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
-
-
-def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
-        n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
-    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
-    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
-    with Language.train(model_dir, train_data,
-            tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
-        for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
-                                                   augment_data=None)):
-            for doc, gold in epoch:
-                trainer.update(doc, gold)
-            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
-            print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
-                trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))
-
-
-def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
-             beam_width=None, cand_preproc=None):
-    print("Load parser", model_dir)
-    nlp = Language(path=model_dir)
-    if nlp.lang == 'de':
-        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        if gold_preproc:
-            raw_text = None
-        else:
-            sents = merge_sents(sents)
-        for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.parser(tokens)
-                nlp.entity(tokens)
-            else:
-                tokens = nlp(raw_text)
-            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
-            scorer.score(tokens, gold, verbose=verbose)
-    return scorer
-
-
-def write_parses(Language, dev_loc, model_dir, out_loc):
-    nlp = Language(data_dir=model_dir)
-    gold_tuples = read_json_file(dev_loc)
-    scorer = Scorer()
-    out_file = io.open(out_loc, 'w', 'utf8')
-    for raw_text, sents in gold_tuples:
-        sents = _merge_sents(sents)
-        for annot_tuples, brackets in sents:
-            if raw_text is None:
-                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                nlp.entity(tokens)
-                nlp.parser(tokens)
-            else:
-                tokens = nlp(raw_text)
-            #gold = GoldParse(tokens, annot_tuples)
-            #scorer.score(tokens, gold, verbose=False)
-            for sent in tokens.sents:
-                for t in sent:
-                    if not t.is_space:
-                        out_file.write(
-                            '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
-                        )
-                out_file.write('\n')
-
-
-@plac.annotations(
-    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    corruption_level=("Amount of noise to add to training data", "option", "c", float),
-    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
-    out_loc=("Out location", "option", "o", str),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-    debug=("Debug mode", "flag", "d", bool),
-    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
-    L1=("L1 regularization penalty", "option", "L", float),
-)
-def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
-         L1=1e-6):
-    parser_cfg = dict(locals())
-    tagger_cfg = dict(locals())
-    entity_cfg = dict(locals())
-
-    lang = spacy.util.get_lang_class(language)
-
-    parser_cfg['features'] = lang.Defaults.parser_features
-    entity_cfg['features'] = lang.Defaults.entity_features
-
-    if not eval_only:
-        gold_train = list(read_json_file(train_loc))
-        gold_dev = list(read_json_file(dev_loc))
-        if n_sents > 0:
-            gold_train = gold_train[:n_sents]
-        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
-              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
-              n_iter=n_iter)
-    if out_loc:
-        write_parses(lang, dev_loc, model_dir, out_loc)
-    scorer = evaluate(lang, list(read_json_file(dev_loc)),
-                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
-    print('TOK', scorer.token_acc)
-    print('POS', scorer.tags_acc)
-    print('UAS', scorer.uas)
-    print('LAS', scorer.las)
-
-    print('NER P', scorer.ents_p)
-    print('NER R', scorer.ents_r)
-    print('NER F', scorer.ents_f)
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -1,201 +0,0 @@
-from __future__ import unicode_literals, print_function
-import plac
-import json
-import random
-import pathlib
-
-from spacy.tokens import Doc
-from spacy.syntax.nonproj import PseudoProjectivity
-from spacy.language import Language
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
-from spacy.pipeline import DependencyParser, TokenVectorEncoder
-from spacy.syntax.parser import get_templates
-from spacy.syntax.arc_eager import ArcEager
-from spacy.scorer import Scorer
-from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
-import spacy.attrs
-import io
-from thinc.neural.ops import CupyOps
-from thinc.neural import Model
-from spacy.es import Spanish
-from spacy.attrs import POS
-
-
-from thinc.neural import Model
-
-
-try:
-    import cupy
-    from thinc.neural.ops import CupyOps
-except:
-    cupy = None
-
-
-def read_conllx(loc, n=0):
-    with io.open(loc, 'r', encoding='utf8') as file_:
-        text = file_.read()
-    i = 0
-    for sent in text.strip().split('\n\n'):
-        lines = sent.strip().split('\n')
-        if lines:
-            while lines[0].startswith('#'):
-                lines.pop(0)
-            tokens = []
-            for line in lines:
-                id_, word, lemma, pos, tag, morph, head, dep, _1, \
-                _2 = line.split('\t')
-                if '-' in id_ or '.' in id_:
-                    continue
-                try:
-                    id_ = int(id_) - 1
-                    head = (int(head) - 1) if head != '0' else id_
-                    dep = 'ROOT' if dep == 'root' else dep #'unlabelled'
-                    tag = pos+'__'+dep+'__'+morph
-                    Spanish.Defaults.tag_map[tag] = {POS: pos}
-                    tokens.append((id_, word, tag, head, dep, 'O'))
-                except:
-                    raise
-            tuples = [list(t) for t in zip(*tokens)]
-            yield (None, [[tuples, []]])
-            i += 1
-            if n >= 1 and i >= n:
-                break
-
-
-def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
-    scorer = Scorer()
-    correct = 0.
-    total = 0.
-    for doc, gold in zip(Xs, ys):
-        doc = Doc(vocab, words=[w.text for w in doc])
-        encoder(doc)
-        parser(doc)
-        PseudoProjectivity.deprojectivize(doc)
-        scorer.score(doc, gold, verbose=verbose)
-        for token, tag in zip(doc, gold.tags):
-            if '_' in token.tag_:
-                univ_guess, _ = token.tag_.split('_', 1)
-            else:
-                univ_guess = ''
-            univ_truth, _ = tag.split('_', 1)
-            correct += univ_guess == univ_truth
-            total += 1
-    return scorer
-
-
-def organize_data(vocab, train_sents):
-    Xs = []
-    ys = []
-    for _, doc_sents in train_sents:
-        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-            doc = Doc(vocab, words=words)
-            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
-            Xs.append(doc)
-            ys.append(gold)
-    return Xs, ys
-
-
-def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
-    LangClass = spacy.util.get_lang_class(lang_name)
-    train_sents = list(read_conllx(train_loc))
-    dev_sents = list(read_conllx(dev_loc))
-    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
-
-    actions = ArcEager.get_actions(gold_parses=train_sents)
-    features = get_templates('basic')
-
-    model_dir = pathlib.Path(model_dir)
-    if not model_dir.exists():
-        model_dir.mkdir()
-    if not (model_dir / 'deps').exists():
-        (model_dir / 'deps').mkdir()
-    if not (model_dir / 'pos').exists():
-        (model_dir / 'pos').mkdir()
-    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
-        file_.write(
-            json.dumps(
-                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
-
-    vocab = LangClass.Defaults.create_vocab()
-    if not (model_dir / 'vocab').exists():
-        (model_dir / 'vocab').mkdir()
-    else:
-        if (model_dir / 'vocab' / 'strings.json').exists():
-            with (model_dir / 'vocab' / 'strings.json').open() as file_:
-                vocab.strings.load(file_)
-            if (model_dir / 'vocab' / 'lexemes.bin').exists():
-                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
-
-    if clusters_loc is not None:
-        clusters_loc = pathlib.Path(clusters_loc)
-        with clusters_loc.open() as file_:
-            for line in file_:
-                try:
-                    cluster, word, freq = line.split()
-                except ValueError:
-                    continue
-                lex = vocab[word]
-                lex.cluster = int(cluster[::-1], 2)
-    # Populate vocab
-    for _, doc_sents in train_sents:
-        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-            for word in words:
-                _ = vocab[word]
-            for dep in deps:
-                _ = vocab[dep]
-            for tag in tags:
-                _ = vocab[tag]
-            if vocab.morphology.tag_map:
-                for tag in tags:
-                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
-    tagger = Tagger(vocab)
-    encoder = TokenVectorEncoder(vocab, width=64)
-    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
-
-    Xs, ys = organize_data(vocab, train_sents)
-    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
-    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
-        docs = list(Xs)
-        for doc in docs:
-            encoder(doc)
-        nn_loss = [0.]
-        def track_progress():
-            with encoder.tagger.use_params(optimizer.averages):
-                with parser.model.use_params(optimizer.averages):
-                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
-            itn = len(nn_loss)
-            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
-            nn_loss.append(0.)
-        track_progress()
-        trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 24
-        trainer.nb_epoch = 40
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
-            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
-            tokvecs, upd_tokvecs = encoder.begin_update(docs)
-            for doc, tokvec in zip(docs, tokvecs):
-                doc.tensor = tokvec
-            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
-            upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, sgd=optimizer)
-    nlp = LangClass(vocab=vocab, parser=parser)
-    scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
-    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
-    #nlp.end_training(model_dir)
-    #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
-    #print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
-
-
-if __name__ == '__main__':
-    import cProfile
-    import pstats
-    if 1:
-        plac.call(main)
-    else:
-        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
-    s = pstats.Stats("Profile.prof")
-    s.strip_dirs().sort_stats("time").print_stats()
-
-
-    plac.call(main)
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -1,194 +0,0 @@
-"""Convert OntoNotes into a json format.
-
-doc: {
-    id: string,
-    paragraphs: [{
-        raw: string,
-        sents: [int],
-        tokens: [{
-            start: int,
-            tag: string,
-            head: int,
-            dep: string}],
-        ner: [{
-            start: int,
-            end: int,
-            label: string}],
-        brackets: [{
-            start: int,
-            end: int,
-            label: string}]}]}
-
-Consumes output of spacy/munge/align_raw.py
-"""
-from __future__ import unicode_literals
-import plac
-import json
-from os import path
-import os
-import re
-import io
-from collections import defaultdict
-
-from spacy.munge import read_ptb
-from spacy.munge import read_conll
-from spacy.munge import read_ner
-
-
-def _iter_raw_files(raw_loc):
-    files = json.load(open(raw_loc))
-    for f in files:
-        yield f
-
-
-def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
-    ptb_sents = read_ptb.split(ptb_text)
-    dep_sents = read_conll.split(dep_text)
-    if len(ptb_sents) != len(dep_sents):
-        return None
-    if ner_text is not None:
-        ner_sents = read_ner.split(ner_text)
-    else:
-        ner_sents = [None] * len(ptb_sents)
-
-    i = 0
-    doc = {'id': file_id}
-    if raw_paras is None:
-        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
-        #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
-        #    doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
-    else:
-        doc['paragraphs'] = []
-        for raw_sents in raw_paras:
-            para = format_para(
-                        ' '.join(raw_sents).replace('<SEP>', ''),
-                        ptb_sents[i:i+len(raw_sents)],
-                        dep_sents[i:i+len(raw_sents)],
-                        ner_sents[i:i+len(raw_sents)])
-            if para['sentences']:
-                doc['paragraphs'].append(para)
-            i += len(raw_sents)
-    return doc
-
-
-def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
-    para = {'raw': raw_text, 'sentences': []}
-    offset = 0
-    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
-    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
-        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
-        if deps and 'VERB' in [t['tag'] for t in deps]:
-            continue
-        if ner_text is not None:
-            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
-        else:
-            ner = ['-' for _ in deps]
-        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
-        # Necessary because the ClearNLP converter deletes EDITED words.
-        if len(ner) != len(deps):
-            ner = ['-' for _ in deps]
-        para['sentences'].append(format_sentence(deps, ner, brackets))
-    return para
-
-
-def format_sentence(deps, ner, brackets):
-    sent = {'tokens': [], 'brackets': []}
-    for token_id, (token, token_ent) in enumerate(zip(deps, ner)):
-        sent['tokens'].append(format_token(token_id, token, token_ent))
-
-    for label, start, end in brackets:
-        if start != end:
-            sent['brackets'].append({
-                'label': label,
-                'first': start,
-                'last': (end-1)})
-    return sent
-
-
-def format_token(token_id, token, ner):
-    assert token_id == token['id']
-    head = (token['head'] - token_id) if token['head'] != -1 else 0
-    return {
-        'id': token_id,
-        'orth': token['word'],
-        'tag': token['tag'],
-        'head': head,
-        'dep': token['dep'],
-        'ner': ner}
-
-
-def read_file(*pieces):
-    loc = path.join(*pieces)
-    if not path.exists(loc):
-        return None
-    else:
-        return io.open(loc, 'r', encoding='utf8').read().strip()
-
-
-def get_file_names(section_dir, subsection):
-    filenames = []
-    for fn in os.listdir(path.join(section_dir, subsection)):
-        filenames.append(fn.rsplit('.', 1)[0])
-    return list(sorted(set(filenames)))
-
-
-def read_wsj_with_source(onto_dir, raw_dir):
-    # Now do WSJ, with source alignment
-    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
-    docs = {}
-    for i in range(25):
-        section = str(i) if i >= 10 else ('0' + str(i))
-        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
-        for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
-            if section == '00':
-                j += 1
-            if section == '04' and filename == '55':
-                continue
-            ptb = read_file(onto_dir, section, '%s.parse' % filename)
-            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
-            ner = read_file(onto_dir, section, '%s.name' % filename)
-            if ptb is not None and dep is not None:
-                docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
-    return docs
-
-
-def get_doc(onto_dir, file_path, wsj_docs):
-    filename = file_path.rsplit('/', 1)[1]
-    if filename in wsj_docs:
-        return wsj_docs[filename]
-    else:
-        ptb = read_file(onto_dir, file_path + '.parse')
-        dep = read_file(onto_dir, file_path + '.parse.dep')
-        ner = read_file(onto_dir, file_path + '.name')
-        if ptb is not None and dep is not None:
-            return format_doc(filename, None, ptb, dep, ner)
-        else:
-            return None
-
-
-def read_ids(loc):
-    return open(loc).read().strip().split('\n')
-
-
-def main(onto_dir, raw_dir, out_dir):
-    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)
-
-    for partition in ('train', 'test', 'development'):
-        ids = read_ids(path.join(onto_dir, '%s.id' % partition))
-        docs_by_genre = defaultdict(list)
-        for file_path in ids:
-            doc = get_doc(onto_dir, file_path, wsj_docs)
-            if doc is not None:
-                genre = file_path.split('/')[3]
-                docs_by_genre[genre].append(doc)
-        part_dir = path.join(out_dir, partition)
-        if not path.exists(part_dir):
-            os.mkdir(part_dir)
-        for genre, docs in sorted(docs_by_genre.items()):
-            out_loc = path.join(part_dir, genre + '.json')
-            with open(out_loc, 'w') as file_:
-                json.dump(docs, file_, indent=4)
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/prepare_vecs.py
+++ b/bin/prepare_vecs.py
@ -1,13 +0,0 @@
-"""Read a vector file, and prepare it as binary data, for easy consumption"""
-
-import plac
-
-from spacy.vocab import write_binary_vectors
-
-
-# Command-line entry point: prepare the vector file at in_loc as binary
-# data at out_loc by delegating to spacy.vocab.write_binary_vectors.
-def main(in_loc, out_loc):
-    write_binary_vectors(in_loc, out_loc)
-
-
-# Run main through plac when the file is executed as a script.
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/tagger/train.py
+++ b/bin/tagger/train.py
@ -1,175 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-from __future__ import print_function
-
-import os
-from os import path
-import shutil
-import codecs
-import random
-
-import plac
-import re
-
-import spacy.util
-from spacy.en import English
-
-from spacy.tagger import Tagger
-
-from spacy.syntax.util import Config
-from spacy.gold import read_json_file
-from spacy.gold import GoldParse
-
-from spacy.scorer import Scorer
-
-
-def score_model(scorer, nlp, raw_text, annot_tuples):
-    """Tag one document with ``nlp`` and add its result to ``scorer``.
-
-    When ``raw_text`` is None the tokens are built from the gold words in
-    ``annot_tuples[1]``; otherwise ``raw_text`` is run through the
-    tokenizer. The tagged tokens are scored against a ``GoldParse`` made
-    from ``annot_tuples``.
-    """
-    tokenizer = nlp.tokenizer
-    tokens = (tokenizer.tokens_from_list(annot_tuples[1])
-              if raw_text is None else tokenizer(raw_text))
-    nlp.tagger(tokens)
-    scorer.score(tokens, GoldParse(tokens, annot_tuples))
-
-
-def _merge_sents(sents):
-    """Concatenate per-sentence annotations into one document-level entry.
-
-    ``sents`` is an iterable of ``((ids, words, tags, heads, labels, ner),
-    brackets)`` pairs. Token ids, head indices and bracket boundaries are
-    shifted by the number of tokens that precede the sentence, so they stay
-    valid in the merged sequence. The result is a one-element list holding
-    ``([ids, words, tags, heads, labels, ner], brackets)``, where each
-    bracket is a ``(first, last, label)`` tuple.
-    """
-    all_ids, all_words, all_tags = [], [], []
-    all_heads, all_labels, all_ner = [], [], []
-    all_brackets = []
-    offset = 0
-    for (ids, words, tags, heads, labels, ner), brackets in sents:
-        all_ids += [id_ + offset for id_ in ids]
-        all_words += words
-        all_tags += tags
-        all_heads += [head + offset for head in heads]
-        all_labels += labels
-        all_ner += ner
-        for bracket in brackets:
-            all_brackets.append((bracket['first'] + offset,
-                                 bracket['last'] + offset,
-                                 bracket['label']))
-        offset += len(ids)
-    merged = [all_ids, all_words, all_tags, all_heads, all_labels, all_ner]
-    return [(merged, all_brackets)]
-
-
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
-          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
-          beam_width=1, verbose=False,
-          use_orig_arc_eager=False):
-    """Train a blank tagger for ``n_iter`` passes and save it to ``model_dir``.
-
-    ``gold_tuples`` is a list of ``(raw_text, sents)`` pairs; it is limited
-    to the first ``n_sents`` entries when ``n_sents`` is positive, and is
-    shuffled in place after every pass. With ``gold_preproc`` the gold
-    tokens of each sentence are used; otherwise the sentences are merged
-    and the raw text is tokenized. One line of scores is printed per pass.
-
-    ``feat_set``, ``seed``, ``corruption_level``, ``beam_width``,
-    ``verbose`` and ``use_orig_arc_eager`` are accepted but not read here.
-    """
-    if n_sents > 0:
-        gold_tuples = gold_tuples[:n_sents]
-
-    templates = Tagger.default_templates()
-    nlp = Language(data_dir=model_dir, tagger=False)
-    nlp.tagger = Tagger.blank(nlp.vocab, templates)
-
-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
-        for raw_text, sents in gold_tuples:
-            if gold_preproc:
-                raw_text = None
-            else:
-                sents = _merge_sents(sents)
-            for annot_tuples, _ in sents:
-                words, gold_tags = annot_tuples[1], annot_tuples[2]
-                # Score with the current weights before updating on this example.
-                score_model(scorer, nlp, raw_text, annot_tuples)
-                # Fresh tokens for the update; score_model tagged its own copy.
-                if raw_text is not None:
-                    tokens = nlp.tokenizer(raw_text)
-                else:
-                    tokens = nlp.tokenizer.tokens_from_list(words)
-                loss += nlp.tagger.train(tokens, gold_tags)
-        random.shuffle(gold_tuples)
-        row = (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc,
-               scorer.token_acc)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % row)
-    nlp.end_training(model_dir)
-
-def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
-             beam_width=None):
-    """Run the pipeline from ``model_dir`` over ``gold_tuples`` and score it.
-
-    With ``gold_preproc`` the gold tokens of every sentence are used and
-    the tagger, entity recognizer and parser are applied in that order.
-    Otherwise the sentences are merged and, when raw text is available,
-    the whole pipeline is run on it with ``merge_mwes=False``.
-    If ``beam_width`` is not None it is assigned to
-    ``nlp.parser.cfg.beam_width``. Returns the populated ``Scorer``.
-    """
-    nlp = Language(data_dir=model_dir)
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
-    scorer = Scorer()
-    for raw_text, sents in gold_tuples:
-        if gold_preproc:
-            text, groups = None, sents
-        else:
-            text, groups = raw_text, _merge_sents(sents)
-        for annot_tuples, _ in groups:
-            if text is not None:
-                doc = nlp(text, merge_mwes=False)
-            else:
-                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(doc)
-                nlp.entity(doc)
-                nlp.parser(doc)
-            scorer.score(doc, GoldParse(doc, annot_tuples), verbose=verbose)
-    return scorer
-
-
-def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
-    """Parse the data at ``dev_loc`` and write one token per line to ``out_loc``.
-
-    Each output line holds the token text, its tag, the text of its head
-    and its dependency label, separated by tabs. Every document is also
-    scored against its gold annotations.
-
-    Language: callable building the pipeline; invoked with ``data_dir``.
-    dev_loc: location handed to ``read_json_file`` for the gold data.
-    model_dir: model directory passed to ``Language``.
-    out_loc: path of the UTF-8 output file (opened for writing).
-    beam_width: if not None, assigned to ``nlp.parser.cfg.beam_width``.
-
-    Returns the ``Scorer`` accumulated over all documents.
-    """
-    nlp = Language(data_dir=model_dir)
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
-    gold_tuples = read_json_file(dev_loc)
-    scorer = Scorer()
-    # The context manager flushes and closes the output file, including
-    # when parsing or scoring raises part-way through.
-    with codecs.open(out_loc, 'w', 'utf8') as out_file:
-        for raw_text, sents in gold_tuples:
-            sents = _merge_sents(sents)
-            for annot_tuples, brackets in sents:
-                if raw_text is None:
-                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                    nlp.tagger(tokens)
-                    nlp.entity(tokens)
-                    nlp.parser(tokens)
-                else:
-                    tokens = nlp(raw_text, merge_mwes=False)
-                gold = GoldParse(tokens, annot_tuples)
-                scorer.score(tokens, gold, verbose=False)
-                for t in tokens:
-                    out_file.write(
-                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
-                    )
-    return scorer
-
-
-# Command-line entry point: unless eval_only is set, train on train_loc;
-# then evaluate the model in model_dir on dev_loc and print the scores.
-# Documented with comments rather than a docstring so that the help text
-# plac builds for this function stays as it is.
-@plac.annotations(
-    train_loc=("Location of training file or directory"),
-    dev_loc=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    corruption_level=("Amount of noise to add to training data", "option", "c", float),
-    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
-    out_loc=("Out location", "option", "o", str),
-    n_sents=("Number of training sentences", "option", "n", int),
-    n_iter=("Number of training iterations", "option", "i", int),
-    verbose=("Verbose error reporting", "flag", "v", bool),
-    debug=("Debug mode", "flag", "d", bool),
-)
-def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
-         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
-    # Training step; the gold data is materialised as a list because
-    # train() slices and shuffles it.
-    if not eval_only:
-        gold_train = list(read_json_file(train_loc))
-        train(English, gold_train, model_dir,
-              feat_set='basic' if not debug else 'debug',
-              gold_preproc=gold_preproc, n_sents=n_sents,
-              corruption_level=corruption_level, n_iter=n_iter,
-              verbose=verbose)
-    # NOTE(review): writing parses is disabled, so out_loc is currently
-    # unused; the disabled call also refers to beam_width, which is not
-    # defined in this function.
-    #if out_loc:
-    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
-    # Evaluation always runs, also after training.
-    scorer = evaluate(English, list(read_json_file(dev_loc)),
-                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
-    print('TOK', scorer.token_acc)
-    print('POS', scorer.tags_acc)
-    print('UAS', scorer.uas)
-    print('LAS', scorer.las)
-
-    print('NER P', scorer.ents_p)
-    print('NER R', scorer.ents_r)
-    print('NER F', scorer.ents_f)
-
-
-# Run main through plac when the file is executed as a script.
-if __name__ == '__main__':
-    plac.call(main)
--- a/bin/tagger/train_german_tagger.py
+++ b/bin/tagger/train_german_tagger.py
@ -1,160 +0,0 @@
-#!/usr/bin/env python
-from __future__ import division
-from __future__ import unicode_literals
-
-import os
-from os import path
-import shutil
-import io
-import random
-import time
-import gzip
-import ujson
-
-import plac
-import cProfile
-import pstats
-
-import spacy.util
-from spacy.de import German
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
-from spacy.scorer import PRFScore
-
-from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags 
-from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags 
-from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
-from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
-from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
-
-
-def default_templates():
-    """Return the tagger's built-in default feature templates."""
-    return Tagger.default_templates()
-
-def default_templates_without_clusters():
-    """Return a feature-template tuple that uses no ``*_cluster`` fields.
-
-    The templates are listed in named groups and concatenated in a fixed
-    order; each template is itself a tuple of context-field constants.
-    """
-    word_context = (
-        (W_orth,),
-        (P1_lemma, P1_pos),
-        (P2_lemma, P2_pos),
-        (N1_orth,),
-        (N2_orth,),
-    )
-    word_affixes = (
-        (W_suffix,),
-        (W_prefix,),
-    )
-    tag_context = (
-        (P1_pos,),
-        (P2_pos,),
-        (P1_pos, P2_pos),
-        (P1_pos, W_orth),
-        (P1_suffix,),
-        (N1_suffix,),
-    )
-    word_shape = (
-        (W_shape,),
-    )
-    flag_context = (
-        (W_flags,),
-        (N1_flags,),
-        (N2_flags,),
-        (P1_flags,),
-        (P2_flags,),
-    )
-    return word_context + word_affixes + tag_context + word_shape + flag_context
-
-
-def make_tagger(vocab, templates):
-    """Build a tagger over ``vocab`` backed by a new model for ``templates``."""
-    return spacy.tagger.Tagger(vocab, spacy.tagger.TaggerModel(templates))
-
-
-def read_conll(file_):
-    """Read tab-separated token lines into a list of ``(words, tags)`` pairs.
-
-    ``file_`` is any iterable of text lines. Blank lines separate
-    sentences; runs of blank lines produce no empty sentences. For each
-    token line the word is taken from column 1 and the tag from column 4
-    (zero-based), as in the CoNLL09 layout.
-    """
-    sentences = []
-    words, tags = [], []
-    for raw_line in file_:
-        row = raw_line.strip()
-        if not row:
-            # Sentence boundary: flush only if tokens were collected.
-            if words:
-                sentences.append((words, tags))
-                words, tags = [], []
-            continue
-        # Every third column starting at 1 -> columns 1, 4, ...; keep two.
-        word, tag = row.split('\t')[1::3][:2]
-        words.append(word)
-        tags.append(tag)
-    # The last sentence may not be followed by a blank line.
-    if words:
-        sentences.append((words, tags))
-    return sentences
-
-        
-def score_model(score, nlp, words, gold_tags):
-    """Tag ``words`` with ``nlp`` and record every token in ``score``.
-
-    Tokens are built directly from ``words``, so there must be exactly one
-    gold tag per token. For each token, the predicted and the gold tag are
-    passed to ``score.score_set`` as one-element sets.
-    """
-    doc = nlp.tokenizer.tokens_from_list(words)
-    assert len(doc) == len(gold_tags)
-    nlp.tagger(doc)
-
-    for token, expected_tag in zip(doc, gold_tags):
-        score.score_set({token.tag_}, {expected_tag})
-
-
-def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
-    """Train a tagger for ``n_iter`` passes and save it to ``model_dir``.
-
-    ``train_sents`` and ``dev_sents`` are sequences of ``(words, tags)``
-    pairs. ``train_sents`` is shuffled in place after each pass. Any
-    existing ``model_dir``/pos directory is removed and recreated. After
-    each pass the training accuracy and the precision on ``dev_sents`` are
-    printed as percentages.
-    """
-    # Seed the RNG so the shuffle order is reproducible.
-    random.seed(seed)
-
-    # Start from an empty 'pos' directory inside the model directory.
-    pos_model_dir = path.join(model_dir, 'pos')
-    if path.exists(pos_model_dir):
-        shutil.rmtree(pos_model_dir)
-    os.mkdir(pos_model_dir)
-
-    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
-    nlp.tagger = make_tagger(nlp.vocab, default_templates())
-
-    print("Itn.\ttrain acc %\tdev acc %")
-    for itn in range(n_iter):
-        # One pass over the training data; tagger.train's return value is
-        # accumulated as the count of correct tags.
-        n_correct = 0.
-        n_words = 0.
-        for words, gold_tags in train_sents:
-            doc = nlp.tokenizer.tokens_from_list(words)
-            n_correct += nlp.tagger.train(doc, gold_tags)
-            n_words += len(words)
-        train_acc = n_correct / n_words
-
-        # Score the current weights on the development data.
-        dev_acc = PRFScore()
-        for words, gold_tags in dev_sents:
-            score_model(dev_acc, nlp, words, gold_tags)
-
-        random.shuffle(train_sents)
-        print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
-
-    print('end training')
-    nlp.end_training(model_dir)
-    print('done')
-
-
-@plac.annotations(
-    train_loc=("Location of CoNLL 09 formatted training file"),
-    dev_loc=("Location of CoNLL 09 formatted development file"),
-    model_dir=("Location of output model directory"),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
-    n_iter=("Number of training iterations", "option", "i", int),
-)
-def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
-    # Training step, skipped when eval_only is set. Both files are opened
-    # before either is read.
-    if not eval_only:
-        with io.open(train_loc, 'r', encoding='utf8') as trainfile_:
-            with io.open(dev_loc, 'r', encoding='utf8') as devfile_:
-                train_sents = read_conll(trainfile_)
-                dev_sents = read_conll(devfile_)
-        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
-
-    # Evaluation step: reload the development data and the saved model,
-    # then report tagging precision as a percentage.
-    with io.open(dev_loc, 'r', encoding='utf8') as file_:
-        dev_sents = read_conll(file_)
-    nlp = German(data_dir=model_dir)
-
-    dev_acc = PRFScore()
-    for words, gold_tags in dev_sents:
-        score_model(dev_acc, nlp, words, gold_tags)
-
-    print('POS: %6.2f %%' % (100*dev_acc.precision))
-
-
-if __name__ == '__main__':
-    plac.call(main)