Mirror of https://github.com/explosion/spaCy.git
	Remove old, outdated files in /bin
This commit is contained in:
		
parent 9c89e2cdef
commit 5025d709e0
@@ -1,93 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function

import plac
import joblib
from os import path
import os
import bz2
import ujson
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
import io

from spacy.en import English
from spacy.strings import StringStore
from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab


def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for line in file_:
            yield ujson.loads(line)


def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                    path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))


def parallelize(func, iterator, n_jobs):
    Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)


def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))


@plac.annotations(
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    tasks = []
    outputs = []
    for input_path in open(input_loc):
        input_path = input_path.strip()
        if not input_path:
            continue
        filename = input_path.split('/')[-1]
        output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
        outputs.append(output_path)
        if not path.exists(output_path) or not skip_existing:
            tasks.append((input_path, output_path))

    if tasks:
        parallelize(count_freqs, tasks, n_jobs)

    print("Merge")
    merge_counts(outputs, output_loc)


if __name__ == '__main__':
    plac.call(main)

@@ -1,89 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals

from xml.etree import cElementTree as ElementTree
import json
import re

import plac
from pathlib import Path
from os import path


escaped_tokens = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LSB-': '[',
    '-RSB-': ']',
    '-LCB-': '{',
    '-RCB-': '}',
}

def read_parses(parse_loc):
    offset = 0
    doc = []
    for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc

def _adjust_token_ids(parse, offset):
    output = []
    for line in parse.split('\n'):
        pieces = line.split()
        pieces[0] = str(int(pieces[0]) + offset)
        pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
        output.append('\t'.join(pieces))
    return '\n'.join(output)


def _fmt_doc(filename, paras):
    return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}


def _fmt_para(raw, sents):
    return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}


def _fmt_sent(sent):
    return {
        'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')],
        'brackets': []}


def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
    head = int(head) - 1
    id_ = int(id_) - 1
    head = (head - id_) if head != -1 else 0
    return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}


tags_re = re.compile(r'<[\w\?/][^>]+>')
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir()
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        #if 'answers' in str(genre_dir): continue
        parse_dir = genre_dir.joinpath('penntree')
        docs = []
        for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            source = source_loc.open().read().strip()
            if 'answers' in str(genre_dir):
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))


if __name__ == '__main__':
    plac.call(main)

@@ -1,32 +0,0 @@
import io
import plac

from spacy.en import English


def main(text_loc):
    with io.open(text_loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)

        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_

        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
    print ' '.join(output)


if __name__ == '__main__':
    plac.call(main)

@@ -1,157 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import io
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.gold import GoldParse

from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.scorer import Scorer
from spacy.tagger import Tagger

# Last updated for spaCy v0.97


def read_conll(file_):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        word, pos, head_idx, label = pieces
        head_idx = int(head_idx)
    elif len(pieces) == 15:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[8])-1
        label = pieces[10]
    else:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[6])-1
        label = pieces[7]
    if head_idx == 0:
        label = 'ROOT'
    return word, pos, head_idx, label


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples, make_projective=False)
    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )

                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                             scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15):
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)


if __name__ == '__main__':
    plac.call(main)

@@ -1,187 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import io
import random

import plac
import re

import spacy.util

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents

from spacy.scorer import Scorer

from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ['.', "'", "!", "?"]:
        return ''
    else:
        return c.lower()


def add_noise(orig, noise_level):
    if random.random() >= noise_level:
        return orig
    elif type(orig) == list:
        corrupted = [_corrupt(word, noise_level) for word in orig]
        corrupted = [w for w in corrupted if w]
        return corrupted
    else:
        return ''.join(_corrupt(c, noise_level) for c in orig)


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        tokens = nlp.tokenizer(raw_text)
    nlp.tagger(tokens)
    nlp.entity(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
        n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    with Language.train(model_dir, train_data,
            tagger_cfg, parser_cfg, entity_cfg) as trainer:
        loss = 0
        for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                                   augment_data=None)):
            for doc, gold in epoch:
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
                trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc):
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = io.open(out_loc, 'w', 'utf8')
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text)
            #gold = GoldParse(tokens, annot_tuples)
            #scorer.score(tokens, gold, verbose=False)
            for sent in tokens.sents:
                for t in sent:
                    if not t.is_space:
                        out_file.write(
                            '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                        )
                out_file.write('\n')


@plac.annotations(
    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
    L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        if n_sents > 0:
            gold_train = gold_train[:n_sents]
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)

@@ -1,201 +0,0 @@
from __future__ import unicode_literals, print_function
import plac
import json
import random
import pathlib

from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, TokenVectorEncoder
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model
from spacy.es import Spanish
from spacy.attrs import POS


from thinc.neural import Model


try:
    import cupy
    from thinc.neural.ops import CupyOps
except:
    cupy = None


def read_conllx(loc, n=0):
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        if lines:
            while lines[0].startswith('#'):
                lines.pop(0)
            tokens = []
            for line in lines:
                id_, word, lemma, pos, tag, morph, head, dep, _1, \
                _2 = line.split('\t')
                if '-' in id_ or '.' in id_:
                    continue
                try:
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head != '0' else id_
                    dep = 'ROOT' if dep == 'root' else dep #'unlabelled'
                    tag = pos+'__'+dep+'__'+morph
                    Spanish.Defaults.tag_map[tag] = {POS: pos}
                    tokens.append((id_, word, tag, head, dep, 'O'))
                except:
                    raise
            tuples = [list(t) for t in zip(*tokens)]
            yield (None, [[tuples, []]])
            i += 1
            if n >= 1 and i >= n:
                break


def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    scorer = Scorer()
    correct = 0.
    total = 0.
    for doc, gold in zip(Xs, ys):
        doc = Doc(vocab, words=[w.text for w in doc])
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
        for token, tag in zip(doc, gold.tags):
            if '_' in token.tag_:
                univ_guess, _ = token.tag_.split('_', 1)
            else:
                univ_guess = ''
            univ_truth, _ = tag.split('_', 1)
            correct += univ_guess == univ_truth
            total += 1
    return scorer


def organize_data(vocab, train_sents):
    Xs = []
    ys = []
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
            Xs.append(doc)
            ys.append(gold)
    return Xs, ys


def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab, width=64)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        nn_loss = [0.]
        def track_progress():
            with encoder.tagger.use_params(optimizer.averages):
                with parser.model.use_params(optimizer.averages):
                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)
        track_progress()
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 40
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
    nlp = LangClass(vocab=vocab, parser=parser)
    scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
    #nlp.end_training(model_dir)
    #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    #print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))


if __name__ == '__main__':
    import cProfile
    import pstats
    if 1:
        plac.call(main)
    else:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


    plac.call(main)
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,194 +0,0 @@
 | 
				
			||||||
"""Convert OntoNotes into a json format.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
doc: {
 | 
					 | 
				
			||||||
    id: string,
 | 
					 | 
				
			||||||
    paragraphs: [{
 | 
					 | 
				
			||||||
        raw: string,
 | 
					 | 
				
			||||||
        sents: [int],
 | 
					 | 
				
			||||||
        tokens: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            tag: string,
 | 
					 | 
				
			||||||
            head: int,
 | 
					 | 
				
			||||||
            dep: string}],
 | 
					 | 
				
			||||||
        ner: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            end: int,
 | 
					 | 
				
			||||||
            label: string}],
 | 
					 | 
				
			||||||
        brackets: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            end: int,
 | 
					 | 
				
			||||||
            label: string}]}]}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Consumes output of spacy/munge/align_raw.py
 | 
					 | 
				
			||||||
"""
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import json
 | 
					 | 
				
			||||||
from os import path
 | 
					 | 
				
			||||||
import os
 | 
					 | 
				
			||||||
import re
 | 
					 | 
				
			||||||
import io
 | 
					 | 
				
			||||||
from collections import defaultdict
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from spacy.munge import read_ptb
 | 
					 | 
				
			||||||
from spacy.munge import read_conll
 | 
					 | 
				
			||||||
from spacy.munge import read_ner
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _iter_raw_files(raw_loc):
 | 
					 | 
				
			||||||
    files = json.load(open(raw_loc))
 | 
					 | 
				
			||||||
    for f in files:
 | 
					 | 
				
			||||||
        yield f
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
 | 
					 | 
				
			||||||
    ptb_sents = read_ptb.split(ptb_text)
 | 
					 | 
				
			||||||
    dep_sents = read_conll.split(dep_text)
 | 
					 | 
				
			||||||
    if len(ptb_sents) != len(dep_sents):
 | 
					 | 
				
			||||||
        return None
 | 
					 | 
				
			||||||
    if ner_text is not None:
 | 
					 | 
				
			||||||
        ner_sents = read_ner.split(ner_text)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        ner_sents = [None] * len(ptb_sents)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    i = 0
 | 
					 | 
				
			||||||
    doc = {'id': file_id}
 | 
					 | 
				
			||||||
    if raw_paras is None:
 | 
					 | 
				
			||||||
        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
 | 
					 | 
				
			||||||
        #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
        #    doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        doc['paragraphs'] = []
 | 
					 | 
				
			||||||
        for raw_sents in raw_paras:
 | 
					 | 
				
			||||||
            para = format_para(
 | 
					 | 
				
			||||||
                        ' '.join(raw_sents).replace('<SEP>', ''),
 | 
					 | 
				
			||||||
                        ptb_sents[i:i+len(raw_sents)],
 | 
					 | 
				
			||||||
                        dep_sents[i:i+len(raw_sents)],
 | 
					 | 
				
			||||||
                        ner_sents[i:i+len(raw_sents)])
 | 
					 | 
				
			||||||
            if para['sentences']:
 | 
					 | 
				
			||||||
                doc['paragraphs'].append(para)
 | 
					 | 
				
			||||||
            i += len(raw_sents)
 | 
					 | 
				
			||||||
    return doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
    para = {'raw': raw_text, 'sentences': []}
 | 
					 | 
				
			||||||
    offset = 0
 | 
					 | 
				
			||||||
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
 | 
					 | 
				
			||||||
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        if deps and 'VERB' in [t['tag'] for t in deps]:
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
        if ner_text is not None:
 | 
					 | 
				
			||||||
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            ner = ['-' for _ in deps]
 | 
					 | 
				
			||||||
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        # Necessary because the ClearNLP converter deletes EDITED words.
 | 
					 | 
				
			||||||
        if len(ner) != len(deps):
 | 
					 | 
				
			||||||
            ner = ['-' for _ in deps]
 | 
					 | 
				
			||||||
        para['sentences'].append(format_sentence(deps, ner, brackets))
 | 
					 | 
				
			||||||
    return para
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_sentence(deps, ner, brackets):
 | 
					 | 
				
			||||||
    sent = {'tokens': [], 'brackets': []}
 | 
					 | 
				
			||||||
    for token_id, (token, token_ent) in enumerate(zip(deps, ner)):
 | 
					 | 
				
			||||||
        sent['tokens'].append(format_token(token_id, token, token_ent))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for label, start, end in brackets:
 | 
					 | 
				
			||||||
        if start != end:
 | 
            sent['brackets'].append({
                'label': label,
                'first': start,
                'last': (end-1)})
    return sent


def format_token(token_id, token, ner):
    assert token_id == token['id']
    # Store the head as an offset from the token's own index; a head of -1
    # (the root) becomes an offset of 0, i.e. the token attaches to itself.
    head = (token['head'] - token_id) if token['head'] != -1 else 0
    return {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': head,
        'dep': token['dep'],
        'ner': ner}


def read_file(*pieces):
    loc = path.join(*pieces)
    # Return None for missing annotation files, so callers can skip the document.
    if not path.exists(loc):
        return None
    else:
        return io.open(loc, 'r', encoding='utf8').read().strip()


def get_file_names(section_dir, subsection):
    filenames = []
    for fn in os.listdir(path.join(section_dir, subsection)):
        filenames.append(fn.rsplit('.', 1)[0])
    return list(sorted(set(filenames)))


def read_wsj_with_source(onto_dir, raw_dir):
    # Now do WSJ, with source alignment. Sections are numbered 00-24.
    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
    docs = {}
    for i in range(25):
        section = str(i) if i >= 10 else ('0' + str(i))
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
            if section == '00':
                j += 1
            if section == '04' and filename == '55':
                continue
            ptb = read_file(onto_dir, section, '%s.parse' % filename)
            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
            ner = read_file(onto_dir, section, '%s.name' % filename)
            if ptb is not None and dep is not None:
                docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
    return docs


def get_doc(onto_dir, file_path, wsj_docs):
    # Prefer the WSJ version with source alignment when we have one.
    filename = file_path.rsplit('/', 1)[1]
    if filename in wsj_docs:
        return wsj_docs[filename]
    else:
        ptb = read_file(onto_dir, file_path + '.parse')
        dep = read_file(onto_dir, file_path + '.parse.dep')
        ner = read_file(onto_dir, file_path + '.name')
        if ptb is not None and dep is not None:
            return format_doc(filename, None, ptb, dep, ner)
        else:
            return None


def read_ids(loc):
    return open(loc).read().strip().split('\n')


def main(onto_dir, raw_dir, out_dir):
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)

    for partition in ('train', 'test', 'development'):
        ids = read_ids(path.join(onto_dir, '%s.id' % partition))
        # Group documents by genre (the fourth path component) and write one
        # JSON file per genre and partition.
        docs_by_genre = defaultdict(list)
        for file_path in ids:
            doc = get_doc(onto_dir, file_path, wsj_docs)
            if doc is not None:
                genre = file_path.split('/')[3]
                docs_by_genre[genre].append(doc)
        part_dir = path.join(out_dir, partition)
        if not path.exists(part_dir):
            os.mkdir(part_dir)
        for genre, docs in sorted(docs_by_genre.items()):
            out_loc = path.join(part_dir, genre + '.json')
            with open(out_loc, 'w') as file_:
                json.dump(docs, file_, indent=4)


if __name__ == '__main__':
    plac.call(main)
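For reference, the per-token records built by format_token above store the head as an offset relative to the token's own index rather than as an absolute id. A minimal sketch with made-up values (not taken from OntoNotes):

# Hypothetical input: head 8 is an absolute index, so the stored offset is 8 - 7 = 1.
token = {'id': 7, 'word': 'Apple', 'tag': 'NNP', 'head': 8, 'dep': 'nsubj'}
record = format_token(7, token, 'U-ORG')
# record['head'] == 1; the other fields are copied over unchanged.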
@@ -1,13 +0,0 @@
"""Read a vector file, and prepare it as binary data, for easy consumption"""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from spacy.vocab import write_binary_vectors
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def main(in_loc, out_loc):
 | 
					 | 
				
			||||||
    write_binary_vectors(in_loc, out_loc)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if __name__ == '__main__':
 | 
					 | 
				
			||||||
    plac.call(main)
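Because plac.call(main) maps the two positional parameters of main straight onto command-line arguments, the script would be invoked roughly as follows; the script and file names here are placeholders, not paths from the repository:

# hypothetical invocation
python prepare_vectors.py input_vectors.txt output_vectors.bin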
@@ -1,175 +0,0 @@
#!/usr/bin/env python
# Train the English tagger on OntoNotes-style JSON data and report tagging,
# parsing and NER scores on the development set.
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import codecs
import random

import plac
import re

import spacy.util
from spacy.en import English

from spacy.tagger import Tagger

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def score_model(scorer, nlp, raw_text, annot_tuples):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        tokens = nlp.tokenizer(raw_text)
    nlp.tagger(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold)


def _merge_sents(sents):
    # Merge a document's sentences into one pseudo-sentence, offsetting token
    # ids, heads and bracket indices by the running token count i.
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                # Score with the current model before updating on this example.
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.end_training(model_dir)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = codecs.open(out_loc, 'w', 'utf8')
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    scorer = evaluate(English, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)
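Given the plac annotations above, a training-plus-evaluation run would be started roughly like this; the script name and corpus/model paths are placeholders, not paths from the repository:

# hypothetical invocation: -g keeps gold sentence boundaries, -i sets the iteration count
python train_english.py /corpora/onto-json/train /corpora/onto-json/development /models/en -g -i 15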
@@ -1,160 +0,0 @@
#!/usr/bin/env python
# Train and evaluate a German part-of-speech tagger on CoNLL09-formatted data.
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson

import plac
import cProfile
import pstats

import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore

from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS


def default_templates():
    return spacy.tagger.Tagger.default_templates()


def default_templates_without_clusters():
    return (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )


def make_tagger(vocab, templates):
    model = spacy.tagger.TaggerModel(templates)
    return spacy.tagger.Tagger(vocab, model)


def read_conll(file_):
    def sentences():
        words, tags = [], []
        for line in file_:
            line = line.strip()
            if line:
                # get columns 1 and 4 (FORM and POS in CoNLL09)
                word, tag = line.split('\t')[1::3][:2]
                words.append(word)
                tags.append(tag)
            elif words:
                yield words, tags
                words, tags = [], []
        if words:
            yield words, tags
    return [s for s in sentences()]


def score_model(score, nlp, words, gold_tags):
    tokens = nlp.tokenizer.tokens_from_list(words)
    assert len(tokens) == len(gold_tags)
    nlp.tagger(tokens)

    for token, gold_tag in zip(tokens, gold_tags):
        score.score_set(set([token.tag_]), set([gold_tag]))


def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    # make shuffling deterministic
    random.seed(seed)

    # set up directory for model
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())

    print("Itn.\ttrain acc %\tdev acc %")
    for itn in range(n_iter):
        # train on train set
        #train_acc = PRFScore()
        correct, total = 0., 0.
        for words, gold_tags in train_sents:
            tokens = nlp.tokenizer.tokens_from_list(words)
            correct += nlp.tagger.train(tokens, gold_tags)
            total += len(words)
        train_acc = correct / total

        # test on dev set
        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        random.shuffle(train_sents)
        print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))

    print('end training')
    nlp.end_training(model_dir)
    print('done')


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    # training
    if not eval_only:
        with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
             io.open(dev_loc, 'r', encoding='utf8') as devfile_:
            train_sents = read_conll(trainfile_)
            dev_sents = read_conll(devfile_)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)

    # testing
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
        nlp = German(data_dir=model_dir)

        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        print('POS: %6.2f %%' % (100*dev_acc.precision))


if __name__ == '__main__':
    plac.call(main)
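For clarity, here is a minimal, made-up CoNLL09-style fragment and what read_conll (defined above) extracts from it; only columns 1 (FORM) and 4 (POS) are read, and a blank line closes the sentence:

# hypothetical two-token sentence, with the unused columns padded with '_'
sample = io.StringIO(u'1\tDer\t_\t_\tART\t_\n'
                     u'2\tHund\t_\t_\tNN\t_\n'
                     u'\n')
print(read_conll(sample))  # [([u'Der', u'Hund'], [u'ART', u'NN'])]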