Mirror of https://github.com/explosion/spaCy.git
	Remove old, outdated files in /bin
This commit is contained in:
		
parent 9c89e2cdef
commit 5025d709e0
@@ -1,93 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function

import plac
import joblib
from os import path
import os
import bz2
import ujson
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
import io

from spacy.en import English
from spacy.strings import StringStore
from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab


def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for line in file_:
            yield ujson.loads(line)


def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                    path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))


def parallelize(func, iterator, n_jobs):
    Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)


def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))


@plac.annotations(
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    tasks = []
    outputs = []
    for input_path in open(input_loc):
        input_path = input_path.strip()
        if not input_path:
            continue
        filename = input_path.split('/')[-1]
        output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
        outputs.append(output_path)
        if not path.exists(output_path) or not skip_existing:
            tasks.append((input_path, output_path))

    if tasks:
        parallelize(count_freqs, tasks, n_jobs)

    print("Merge")
    merge_counts(outputs, output_loc)


if __name__ == '__main__':
    plac.call(main)

@@ -1,89 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals

from xml.etree import cElementTree as ElementTree
import json
import re

import plac
from pathlib import Path
from os import path


escaped_tokens = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LSB-': '[',
    '-RSB-': ']',
    '-LCB-': '{',
    '-RCB-': '}',
}

def read_parses(parse_loc):
    offset = 0
    doc = []
    for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc

def _adjust_token_ids(parse, offset):
    output = []
    for line in parse.split('\n'):
        pieces = line.split()
        pieces[0] = str(int(pieces[0]) + offset)
        pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
        output.append('\t'.join(pieces))
    return '\n'.join(output)


def _fmt_doc(filename, paras):
    return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}


def _fmt_para(raw, sents):
    return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}


def _fmt_sent(sent):
    return {
        'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')],
        'brackets': []}


def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
    head = int(head) - 1
    id_ = int(id_) - 1
    head = (head - id_) if head != -1 else 0
    return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}


tags_re = re.compile(r'<[\w\?/][^>]+>')
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir()
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        #if 'answers' in str(genre_dir): continue
        parse_dir = genre_dir.joinpath('penntree')
        docs = []
        for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            source = source_loc.open().read().strip()
            if 'answers' in str(genre_dir):
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))


if __name__ == '__main__':
    plac.call(main)

@@ -1,32 +0,0 @@
import io
import plac

from spacy.en import English


def main(text_loc):
    with io.open(text_loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)

        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_

        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
    print ' '.join(output)


if __name__ == '__main__':
    plac.call(main)

@@ -1,157 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import io
import random
import time
import gzip

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.gold import GoldParse

from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.scorer import Scorer
from spacy.tagger import Tagger

# Last updated for spaCy v0.97


def read_conll(file_):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents


def _parse_line(line):
    pieces = line.split()
    if len(pieces) == 4:
        word, pos, head_idx, label = pieces
        head_idx = int(head_idx)
    elif len(pieces) == 15:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[8])-1
        label = pieces[10]
    else:
        id_ = int(pieces[0].split('_')[-1])
        word = pieces[1]
        pos = pieces[4]
        head_idx = int(pieces[6])-1
        label = pieces[7]
    if head_idx == 0:
        label = 'ROOT'
    return word, pos, head_idx, label


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples, make_projective=False)
    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )

                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                             scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15):
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)


if __name__ == '__main__':
    plac.call(main)

@@ -1,187 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import io
import random

import plac
import re

import spacy.util

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents

from spacy.scorer import Scorer

from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ['.', "'", "!", "?"]:
        return ''
    else:
        return c.lower()


def add_noise(orig, noise_level):
    if random.random() >= noise_level:
        return orig
    elif type(orig) == list:
        corrupted = [_corrupt(word, noise_level) for word in orig]
        corrupted = [w for w in corrupted if w]
        return corrupted
    else:
        return ''.join(_corrupt(c, noise_level) for c in orig)


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        tokens = nlp.tokenizer(raw_text)
    nlp.tagger(tokens)
    nlp.entity(tokens)
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold, verbose=verbose)


def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
        n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    with Language.train(model_dir, train_data,
            tagger_cfg, parser_cfg, entity_cfg) as trainer:
        loss = 0
        for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                                   augment_data=None)):
            for doc, gold in epoch:
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
                trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc):
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = io.open(out_loc, 'w', 'utf8')
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text)
            #gold = GoldParse(tokens, annot_tuples)
            #scorer.score(tokens, gold, verbose=False)
            for sent in tokens.sents:
                for t in sent:
                    if not t.is_space:
                        out_file.write(
                            '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                        )
                out_file.write('\n')


@plac.annotations(
    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
    L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        if n_sents > 0:
            gold_train = gold_train[:n_sents]
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)

@@ -1,201 +0,0 @@
from __future__ import unicode_literals, print_function
import plac
import json
import random
import pathlib

from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, TokenVectorEncoder
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model
from spacy.es import Spanish
from spacy.attrs import POS


from thinc.neural import Model


try:
    import cupy
    from thinc.neural.ops import CupyOps
except:
    cupy = None


def read_conllx(loc, n=0):
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        if lines:
            while lines[0].startswith('#'):
                lines.pop(0)
            tokens = []
            for line in lines:
                id_, word, lemma, pos, tag, morph, head, dep, _1, \
                _2 = line.split('\t')
                if '-' in id_ or '.' in id_:
                    continue
                try:
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head != '0' else id_
                    dep = 'ROOT' if dep == 'root' else dep #'unlabelled'
                    tag = pos+'__'+dep+'__'+morph
                    Spanish.Defaults.tag_map[tag] = {POS: pos}
                    tokens.append((id_, word, tag, head, dep, 'O'))
                except:
                    raise
            tuples = [list(t) for t in zip(*tokens)]
            yield (None, [[tuples, []]])
            i += 1
            if n >= 1 and i >= n:
                break


def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    scorer = Scorer()
    correct = 0.
    total = 0.
    for doc, gold in zip(Xs, ys):
        doc = Doc(vocab, words=[w.text for w in doc])
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
        for token, tag in zip(doc, gold.tags):
            if '_' in token.tag_:
                univ_guess, _ = token.tag_.split('_', 1)
            else:
                univ_guess = ''
            univ_truth, _ = tag.split('_', 1)
            correct += univ_guess == univ_truth
            total += 1
    return scorer


def organize_data(vocab, train_sents):
    Xs = []
    ys = []
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
            Xs.append(doc)
            ys.append(gold)
    return Xs, ys


def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab, width=64)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        nn_loss = [0.]
        def track_progress():
            with encoder.tagger.use_params(optimizer.averages):
                with parser.model.use_params(optimizer.averages):
                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)
        track_progress()
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 40
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
    nlp = LangClass(vocab=vocab, parser=parser)
    scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
    #nlp.end_training(model_dir)
    #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    #print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))


if __name__ == '__main__':
    import cProfile
    import pstats
    if 1:
        plac.call(main)
    else:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


    plac.call(main)
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,194 +0,0 @@
 | 
				
			||||||
"""Convert OntoNotes into a json format.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
doc: {
 | 
					 | 
				
			||||||
    id: string,
 | 
					 | 
				
			||||||
    paragraphs: [{
 | 
					 | 
				
			||||||
        raw: string,
 | 
					 | 
				
			||||||
        sents: [int],
 | 
					 | 
				
			||||||
        tokens: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            tag: string,
 | 
					 | 
				
			||||||
            head: int,
 | 
					 | 
				
			||||||
            dep: string}],
 | 
					 | 
				
			||||||
        ner: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            end: int,
 | 
					 | 
				
			||||||
            label: string}],
 | 
					 | 
				
			||||||
        brackets: [{
 | 
					 | 
				
			||||||
            start: int,
 | 
					 | 
				
			||||||
            end: int,
 | 
					 | 
				
			||||||
            label: string}]}]}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Consumes output of spacy/munge/align_raw.py
 | 
					 | 
				
			||||||
"""
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import json
 | 
					 | 
				
			||||||
from os import path
 | 
					 | 
				
			||||||
import os
 | 
					 | 
				
			||||||
import re
 | 
					 | 
				
			||||||
import io
 | 
					 | 
				
			||||||
from collections import defaultdict
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from spacy.munge import read_ptb
 | 
					 | 
				
			||||||
from spacy.munge import read_conll
 | 
					 | 
				
			||||||
from spacy.munge import read_ner
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _iter_raw_files(raw_loc):
 | 
					 | 
				
			||||||
    files = json.load(open(raw_loc))
 | 
					 | 
				
			||||||
    for f in files:
 | 
					 | 
				
			||||||
        yield f
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
 | 
					 | 
				
			||||||
    ptb_sents = read_ptb.split(ptb_text)
 | 
					 | 
				
			||||||
    dep_sents = read_conll.split(dep_text)
 | 
					 | 
				
			||||||
    if len(ptb_sents) != len(dep_sents):
 | 
					 | 
				
			||||||
        return None
 | 
					 | 
				
			||||||
    if ner_text is not None:
 | 
					 | 
				
			||||||
        ner_sents = read_ner.split(ner_text)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        ner_sents = [None] * len(ptb_sents)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    i = 0
 | 
					 | 
				
			||||||
    doc = {'id': file_id}
 | 
					 | 
				
			||||||
    if raw_paras is None:
 | 
					 | 
				
			||||||
        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
 | 
					 | 
				
			||||||
        #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
        #    doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        doc['paragraphs'] = []
 | 
					 | 
				
			||||||
        for raw_sents in raw_paras:
 | 
					 | 
				
			||||||
            para = format_para(
 | 
					 | 
				
			||||||
                        ' '.join(raw_sents).replace('<SEP>', ''),
 | 
					 | 
				
			||||||
                        ptb_sents[i:i+len(raw_sents)],
 | 
					 | 
				
			||||||
                        dep_sents[i:i+len(raw_sents)],
 | 
					 | 
				
			||||||
                        ner_sents[i:i+len(raw_sents)])
 | 
					 | 
				
			||||||
            if para['sentences']:
 | 
					 | 
				
			||||||
                doc['paragraphs'].append(para)
 | 
					 | 
				
			||||||
            i += len(raw_sents)
 | 
					 | 
				
			||||||
    return doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
    para = {'raw': raw_text, 'sentences': []}
 | 
					 | 
				
			||||||
    offset = 0
 | 
					 | 
				
			||||||
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
 | 
					 | 
				
			||||||
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
 | 
					 | 
				
			||||||
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        if deps and 'VERB' in [t['tag'] for t in deps]:
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
        if ner_text is not None:
 | 
					 | 
				
			||||||
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            ner = ['-' for _ in deps]
 | 
					 | 
				
			||||||
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
 | 
					 | 
				
			||||||
        # Necessary because the ClearNLP converter deletes EDITED words.
 | 
					 | 
				
			||||||
        if len(ner) != len(deps):
 | 
					 | 
				
			||||||
            ner = ['-' for _ in deps]
 | 
					 | 
				
			||||||
        para['sentences'].append(format_sentence(deps, ner, brackets))
 | 
					 | 
				
			||||||
    return para
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def format_sentence(deps, ner, brackets):
 | 
					 | 
				
			||||||
    sent = {'tokens': [], 'brackets': []}
 | 
					 | 
				
			||||||
    for token_id, (token, token_ent) in enumerate(zip(deps, ner)):
 | 
					 | 
				
			||||||
        sent['tokens'].append(format_token(token_id, token, token_ent))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for label, start, end in brackets:
 | 
					 | 
				
			||||||
        if start != end:
 | 
            sent['brackets'].append({
                'label': label,
                'first': start,
                'last': (end-1)})
    return sent


def format_token(token_id, token, ner):
    assert token_id == token['id']
    # Store the head as an offset from the token's own index; a head of -1
    # (the root) becomes an offset of 0, i.e. the token attaches to itself.
    head = (token['head'] - token_id) if token['head'] != -1 else 0
    return {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': head,
        'dep': token['dep'],
        'ner': ner}


def read_file(*pieces):
    loc = path.join(*pieces)
    # Return None for missing annotation files, so callers can skip the document.
    if not path.exists(loc):
        return None
    else:
        return io.open(loc, 'r', encoding='utf8').read().strip()


def get_file_names(section_dir, subsection):
    filenames = []
    for fn in os.listdir(path.join(section_dir, subsection)):
        filenames.append(fn.rsplit('.', 1)[0])
    return list(sorted(set(filenames)))


def read_wsj_with_source(onto_dir, raw_dir):
    # Now do WSJ, with source alignment. Sections are numbered 00-24.
    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
    docs = {}
    for i in range(25):
        section = str(i) if i >= 10 else ('0' + str(i))
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
            if section == '00':
                j += 1
            if section == '04' and filename == '55':
                continue
            ptb = read_file(onto_dir, section, '%s.parse' % filename)
            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
            ner = read_file(onto_dir, section, '%s.name' % filename)
            if ptb is not None and dep is not None:
                docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
    return docs


def get_doc(onto_dir, file_path, wsj_docs):
    # Prefer the WSJ version with source alignment when we have one.
    filename = file_path.rsplit('/', 1)[1]
    if filename in wsj_docs:
        return wsj_docs[filename]
    else:
        ptb = read_file(onto_dir, file_path + '.parse')
        dep = read_file(onto_dir, file_path + '.parse.dep')
        ner = read_file(onto_dir, file_path + '.name')
        if ptb is not None and dep is not None:
            return format_doc(filename, None, ptb, dep, ner)
        else:
            return None


def read_ids(loc):
    return open(loc).read().strip().split('\n')


def main(onto_dir, raw_dir, out_dir):
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)

    for partition in ('train', 'test', 'development'):
        ids = read_ids(path.join(onto_dir, '%s.id' % partition))
        # Group documents by genre (the fourth path component) and write one
        # JSON file per genre and partition.
        docs_by_genre = defaultdict(list)
        for file_path in ids:
            doc = get_doc(onto_dir, file_path, wsj_docs)
            if doc is not None:
                genre = file_path.split('/')[3]
                docs_by_genre[genre].append(doc)
        part_dir = path.join(out_dir, partition)
        if not path.exists(part_dir):
            os.mkdir(part_dir)
        for genre, docs in sorted(docs_by_genre.items()):
            out_loc = path.join(part_dir, genre + '.json')
            with open(out_loc, 'w') as file_:
                json.dump(docs, file_, indent=4)


if __name__ == '__main__':
    plac.call(main)
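For reference, the per-token records built by format_token above store the head as an offset relative to the token's own index rather than as an absolute id. A minimal sketch with made-up values (not taken from OntoNotes):

# Hypothetical input: head 8 is an absolute index, so the stored offset is 8 - 7 = 1.
token = {'id': 7, 'word': 'Apple', 'tag': 'NNP', 'head': 8, 'dep': 'nsubj'}
record = format_token(7, token, 'U-ORG')
# record['head'] == 1; the other fields are copied over unchanged.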
@@ -1,13 +0,0 @@
"""Read a vector file, and prepare it as binary data, for easy consumption"""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from spacy.vocab import write_binary_vectors
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def main(in_loc, out_loc):
 | 
					 | 
				
			||||||
    write_binary_vectors(in_loc, out_loc)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if __name__ == '__main__':
 | 
					 | 
				
			||||||
    plac.call(main)
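Because plac.call(main) maps the two positional parameters of main straight onto command-line arguments, the script would be invoked roughly as follows; the script and file names here are placeholders, not paths from the repository:

# hypothetical invocation
python prepare_vectors.py input_vectors.txt output_vectors.bin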
@@ -1,175 +0,0 @@
#!/usr/bin/env python
# Train the English tagger on OntoNotes-style JSON data and report tagging,
# parsing and NER scores on the development set.
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import codecs
import random

import plac
import re

import spacy.util
from spacy.en import English

from spacy.tagger import Tagger

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def score_model(scorer, nlp, raw_text, annot_tuples):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        tokens = nlp.tokenizer(raw_text)
    nlp.tagger(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold)


def _merge_sents(sents):
    # Merge a document's sentences into one pseudo-sentence, offsetting token
    # ids, heads and bracket indices by the running token count i.
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                # Score with the current model before updating on this example.
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.end_training(model_dir)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = codecs.open(out_loc, 'w', 'utf8')
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    scorer = evaluate(English, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)
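Given the plac annotations above, a training-plus-evaluation run would be started roughly like this; the script name and corpus/model paths are placeholders, not paths from the repository:

# hypothetical invocation: -g keeps gold sentence boundaries, -i sets the iteration count
python train_english.py /corpora/onto-json/train /corpora/onto-json/development /models/en -g -i 15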
@@ -1,160 +0,0 @@
#!/usr/bin/env python
# Train and evaluate a German part-of-speech tagger on CoNLL09-formatted data.
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson

import plac
import cProfile
import pstats

import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore

from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS


def default_templates():
    return spacy.tagger.Tagger.default_templates()


def default_templates_without_clusters():
    return (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )


def make_tagger(vocab, templates):
    model = spacy.tagger.TaggerModel(templates)
    return spacy.tagger.Tagger(vocab, model)


def read_conll(file_):
    def sentences():
        words, tags = [], []
        for line in file_:
            line = line.strip()
            if line:
                # get columns 1 and 4 (FORM and POS in CoNLL09)
                word, tag = line.split('\t')[1::3][:2]
                words.append(word)
                tags.append(tag)
            elif words:
                yield words, tags
                words, tags = [], []
        if words:
            yield words, tags
    return [s for s in sentences()]


def score_model(score, nlp, words, gold_tags):
    tokens = nlp.tokenizer.tokens_from_list(words)
    assert len(tokens) == len(gold_tags)
    nlp.tagger(tokens)

    for token, gold_tag in zip(tokens, gold_tags):
        score.score_set(set([token.tag_]), set([gold_tag]))


def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    # make shuffling deterministic
    random.seed(seed)

    # set up directory for model
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())

    print("Itn.\ttrain acc %\tdev acc %")
    for itn in range(n_iter):
        # train on train set
        #train_acc = PRFScore()
        correct, total = 0., 0.
        for words, gold_tags in train_sents:
            tokens = nlp.tokenizer.tokens_from_list(words)
            correct += nlp.tagger.train(tokens, gold_tags)
            total += len(words)
        train_acc = correct / total

        # test on dev set
        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        random.shuffle(train_sents)
        print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))

    print('end training')
    nlp.end_training(model_dir)
    print('done')


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    # training
    if not eval_only:
        with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
             io.open(dev_loc, 'r', encoding='utf8') as devfile_:
            train_sents = read_conll(trainfile_)
            dev_sents = read_conll(devfile_)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)

    # testing
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
        nlp = German(data_dir=model_dir)

        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        print('POS: %6.2f %%' % (100*dev_acc.precision))


if __name__ == '__main__':
    plac.call(main)
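For clarity, here is a minimal, made-up CoNLL09-style fragment and what read_conll (defined above) extracts from it; only columns 1 (FORM) and 4 (POS) are read, and a blank line closes the sentence:

# hypothetical two-token sentence, with the unused columns padded with '_'
sample = io.StringIO(u'1\tDer\t_\t_\tART\t_\n'
                     u'2\tHund\t_\t_\tNN\t_\n'
                     u'\n')
print(read_conll(sample))  # [([u'Der', u'Hund'], [u'ART', u'NN'])]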