mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Remove old, outdated files in /bin
This commit is contained in:
parent
9c89e2cdef
commit
5025d709e0
|
@ -1,93 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import joblib
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
import bz2
|
|
||||||
import ujson
|
|
||||||
from preshed.counter import PreshCounter
|
|
||||||
from joblib import Parallel, delayed
|
|
||||||
import io
|
|
||||||
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.strings import StringStore
|
|
||||||
from spacy.attrs import ORTH
|
|
||||||
from spacy.tokenizer import Tokenizer
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def iter_comments(loc):
    """Yield one decoded JSON object per line of a bz2-compressed file.

    loc: path of the .bz2 archive; each line is handed to ujson.loads.
    """
    with bz2.BZ2File(loc) as archive:
        for raw_line in archive:
            comment = ujson.loads(raw_line)
            yield comment
|
|
||||||
|
|
||||||
|
|
||||||
def count_freqs(input_loc, output_loc):
    """Tokenize every comment body in `input_loc` and write token counts.

    input_loc: bz2 file read through iter_comments; each record must have
        a 'body' key.
    output_loc: destination text file; one '<freq><TAB><token>' line is
        written per counted token whose text is not pure whitespace.
    """
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    # The encoding must be passed by keyword: the third positional argument
    # of io.open is `buffering`, so passing 'utf8' there raises TypeError.
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
|
|
||||||
|
|
||||||
|
|
||||||
def parallelize(func, iterator, n_jobs):
    """Run `func(*item)` for every item of `iterator` on `n_jobs` joblib workers."""
    runner = Parallel(n_jobs=n_jobs)
    jobs = (delayed(func)(*args) for args in iterator)
    runner(jobs)
|
|
||||||
|
|
||||||
|
|
||||||
def merge_counts(locs, out_loc):
    """Sum several frequency files into one.

    locs: iterable of paths; every line holds a frequency and a word
        separated by a tab.
    out_loc: path of the merged file, written in the same line format.
    """
    strings = StringStore()
    totals = PreshCounter()
    for freq_path in locs:
        with io.open(freq_path, 'r', encoding='utf8') as freq_file:
            for row in freq_file:
                fields = row.strip().split('\t', 1)
                freq, word = fields
                totals.inc(strings[word], int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as merged:
        for key, total in totals:
            merged.write('%d\t%s\n' % (total, strings[key]))
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    """Count token frequencies for each listed input, then merge the counts.

    input_loc: text file naming one input path per line (blank lines are
        ignored).
    freqs_dir: directory receiving one per-input frequency file.
    output_loc: path of the merged frequency file.
    n_jobs: number of parallel workers used for counting.
    skip_existing: when true, inputs whose frequency file already exists
        are not recounted (they are still merged).
    """
    tasks = []
    outputs = []
    # Read the list inside a `with` so the handle is closed deterministically.
    with open(input_loc) as list_file:
        for input_path in list_file:
            input_path = input_path.strip()
            if not input_path:
                continue
            filename = input_path.split('/')[-1]
            output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
            outputs.append(output_path)
            if not path.exists(output_path) or not skip_existing:
                tasks.append((input_path, output_path))

    if tasks:
        parallelize(count_freqs, tasks, n_jobs)

    print("Merge")
    merge_counts(outputs, output_loc)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: plac invokes main with the command-line arguments.
    plac.call(main)
|
|
|
@ -1,89 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from xml.etree import cElementTree as ElementTree
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
|
|
||||||
# Bracket escape codes mapped to the literal bracket characters.
escaped_tokens = dict([
    ('-LRB-', '('),
    ('-RRB-', ')'),
    ('-LSB-', '['),
    ('-RSB-', ']'),
    ('-LCB-', '{'),
    ('-RCB-', '}'),
])
|
|
||||||
|
|
||||||
def read_parses(parse_loc):
    """Read the sentence parses stored next to `parse_loc`.

    The file `str(parse_loc) + '.dep'` is split on blank lines. Token ids
    in each sentence are shifted by the number of token lines seen so far,
    so ids keep increasing across the whole document.

    Returns a list with one adjusted parse string per sentence.
    """
    # Use a context manager so the handle is closed once the text is read.
    with open(str(parse_loc) + '.dep') as parse_file:
        text = parse_file.read()
    offset = 0
    doc = []
    for parse in text.strip().split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc
|
|
||||||
|
|
||||||
def _adjust_token_ids(parse, offset):
|
|
||||||
output = []
|
|
||||||
for line in parse.split('\n'):
|
|
||||||
pieces = line.split()
|
|
||||||
pieces[0] = str(int(pieces[0]) + offset)
|
|
||||||
pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
|
|
||||||
output.append('\t'.join(pieces))
|
|
||||||
return '\n'.join(output)
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_doc(filename, paras):
|
|
||||||
return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_para(raw, sents):
|
|
||||||
return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_sent(sent):
    """Build the sentence dict from a parse string with one token per line.

    Each line is split on whitespace and its columns are passed
    positionally to _fmt_token. 'brackets' is always an empty list.
    """
    token_rows = sent.strip().split('\n')
    tokens = []
    for row in token_rows:
        columns = row.split()
        tokens.append(_fmt_token(*columns))
    return {'tokens': tokens, 'brackets': []}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
|
|
||||||
head = int(head) - 1
|
|
||||||
id_ = int(id_) - 1
|
|
||||||
head = (head - id_) if head != -1 else 0
|
|
||||||
return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
|
|
||||||
|
|
||||||
|
|
||||||
# Matches a markup tag: '<', then a word character, '?' or '/', then one or
# more characters other than '>', then '>'.
tags_re = re.compile(r'<[\w\?/][^>]+>')
|
|
||||||
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    """Convert each genre directory under `ewtb_dir`/data to one JSON file.

    out_dir: directory for the '<genre>.json' outputs; created if missing.
    ewtb_dir: treebank root. Each genre directory is expected to hold
        'penntree' and 'source/source_original' sub-directories.
    """
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir()
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        parse_dir = genre_dir.joinpath('penntree')
        docs = []
        source_dir = genre_dir.joinpath('source').joinpath('source_original')
        for source_loc in source_dir.iterdir():
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            # Close every source file as soon as it has been read; the
            # loop may visit a large number of files.
            with source_loc.open() as source_file:
                source = source_file.read().strip()
            if 'answers' in str(genre_dir):
                # Markup tags are removed for the 'answers' genre only.
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: plac invokes main with the command-line arguments.
    plac.call(main)
|
|
|
@ -1,32 +0,0 @@
|
||||||
import io
|
|
||||||
import plac
|
|
||||||
|
|
||||||
from spacy.en import English
|
|
||||||
|
|
||||||
|
|
||||||
def main(text_loc):
    """Print the text of `text_loc` with entity spans wrapped in label tags.

    The file is split into paragraphs on blank lines. For each paragraph,
    a '<LABEL>' marker is emitted before the first token of every entity
    and a '</LABEL>' marker after its last token; all pieces are joined
    with single spaces and printed.
    """
    with io.open(text_loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)

        # Map token index -> label for entity starts and (exclusive) ends.
        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_

        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
        # Call form of print: the statement form is a SyntaxError on
        # Python 3, while a single parenthesised argument prints the same
        # on Python 2.
        print(' '.join(output))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: plac invokes main with the command-line arguments.
    plac.call(main)
|
|
|
@ -1,157 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.syntax.parser import Parser
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
|
|
||||||
# Last updated for spaCy v0.97
|
|
||||||
|
|
||||||
|
|
||||||
def read_conll(file_):
    """Read a standard CoNLL/MALT-style format.

    file_: open text file; sentences are separated by blank lines and
        every line is decoded by _parse_line.

    Returns a list with one `(None, [(annot, [])])` entry per sentence,
    where `annot` is `(ids, words, tags, heads, labels, ner)`. A negative
    head index is replaced by the token's own index, and every `ner`
    value is 'O'.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
|
|
||||||
pieces = line.split()
|
|
||||||
if len(pieces) == 4:
|
|
||||||
word, pos, head_idx, label = pieces
|
|
||||||
head_idx = int(head_idx)
|
|
||||||
elif len(pieces) == 15:
|
|
||||||
id_ = int(pieces[0].split('_')[-1])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[4]
|
|
||||||
head_idx = int(pieces[8])-1
|
|
||||||
label = pieces[10]
|
|
||||||
else:
|
|
||||||
id_ = int(pieces[0].split('_')[-1])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[4]
|
|
||||||
head_idx = int(pieces[6])-1
|
|
||||||
label = pieces[7]
|
|
||||||
if head_idx == 0:
|
|
||||||
label = 'ROOT'
|
|
||||||
return word, pos, head_idx, label
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Tag and parse the gold words, then add the result to `scorer`.

    The document is built from `annot_tuples[1]`; `raw_text` is not used.
    """
    words = annot_tuples[1]
    doc = nlp.tokenizer.tokens_from_list(words)
    nlp.tagger(doc)
    nlp.parser(doc)
    gold_parse = GoldParse(doc, annot_tuples, make_projective=False)
    punct = ('--', 'p', 'punct')
    scorer.score(doc, gold_parse, verbose=verbose, punct_labels=punct)
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    """Train a tagger and a dependency parser on `gold_tuples`.

    The 'deps' and 'pos' sub-directories of `model_dir` are deleted if
    present and recreated. Each of the `n_iter` passes scores the current
    model on every multi-word sentence before updating on it, shuffles
    `gold_tuples` in place and prints one progress row. `gold_preproc`
    and `force_gold` are accepted but not used here.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    # Start from empty model directories.
    for stale_dir in (dep_model_dir, pos_model_dir):
        if path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    parser_labels = ArcEager.get_labels(gold_tuples)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=parser_labels)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                words = annot_tuples[1]
                if len(words) == 1:
                    continue

                # Score first, so the figures reflect the model before
                # it is updated on this sentence.
                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(doc)
                gold = GoldParse(doc, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    message = (
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                    raise Exception(message)

                loss += nlp.parser.train(doc, gold)
                nlp.tagger.train(doc, gold.tags)
        random.shuffle(gold_tuples)
        row = (itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % row)
    print('end training')
    nlp.end_training(model_dir)
    print('done')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
    """Train on `train_loc` (unless `eval_only`) and score on `dev_loc`.

    train_loc, dev_loc: CoNLL-formatted files read with read_conll.
    model_dir: directory the model is written to and loaded from.
    n_iter: number of training iterations.
    eval_only: skip training and only evaluate the model in `model_dir`.
        It was annotated and read but absent from the signature, which
        made every run fail with NameError; it is added last with a
        default so existing positional calls keep working.
    """
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    # Read the development set inside `with` so the handle is closed.
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: plac invokes main with the command-line arguments.
    plac.call(main)
|
|
|
@ -1,187 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.gold import merge_sents
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.syntax.ner import BiluoPushDown
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.syntax.parser import Parser
|
|
||||||
from spacy.syntax.nonproj import PseudoProjectivity
|
|
||||||
|
|
||||||
|
|
||||||
def _corrupt(c, noise_level):
|
|
||||||
if random.random() >= noise_level:
|
|
||||||
return c
|
|
||||||
elif c == ' ':
|
|
||||||
return '\n'
|
|
||||||
elif c == '\n':
|
|
||||||
return ' '
|
|
||||||
elif c in ['.', "'", "!", "?"]:
|
|
||||||
return ''
|
|
||||||
else:
|
|
||||||
return c.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def add_noise(orig, noise_level):
    """Return `orig`, or a corrupted copy with probability `noise_level`.

    When corruption applies, a list has _corrupt applied to every item
    and the items that became empty are dropped; any other value is
    iterated, every element is passed through _corrupt, and the results
    are joined into one string.
    """
    untouched = random.random() >= noise_level
    if untouched:
        return orig
    if type(orig) is list:
        noisy_words = []
        for word in orig:
            noisy_words.append(_corrupt(word, noise_level))
        return [w for w in noisy_words if w]
    return ''.join(_corrupt(ch, noise_level) for ch in orig)
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Run tagger, entity recognizer and parser, then score against gold.

    When `raw_text` is None the document is built from the gold words in
    `annot_tuples[1]`; otherwise `raw_text` is tokenized.
    """
    use_gold_tokens = raw_text is None
    if use_gold_tokens:
        doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        doc = nlp.tokenizer(raw_text)
    nlp.tagger(doc)
    nlp.entity(doc)
    nlp.parser(doc)
    gold_parse = GoldParse(doc, annot_tuples)
    scorer.score(doc, gold_parse, verbose=verbose)
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
          n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    """Train through `Language.train`, printing dev scores after each epoch.

    Every epoch feeds each (doc, gold) pair to `trainer.update`, then
    evaluates on `dev_data` and prints one row with the parser model's
    `nr_weight`, `nr_active_feat` and the dev scores. `seed`, `n_sents`
    and `corruption_level` are accepted but not used here.
    """
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    training = Language.train(model_dir, train_data,
                              tagger_cfg, parser_cfg, entity_cfg)
    with training as trainer:
        loss = 0
        epochs = trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                augment_data=None)
        for itn, epoch in enumerate(epochs):
            for doc, gold in epoch:
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            row = format_str.format(itn, trainer.nlp.parser.model.nr_weight,
                                    trainer.nlp.parser.model.nr_active_feat,
                                    **dev_scores.scores)
            print(row)
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Load the model in `model_dir` and score it on `gold_tuples`.

    With `gold_preproc` the gold tokens of each sentence are used;
    otherwise the sentences of a document are merged and the raw text is
    run through the full pipeline. `cand_preproc` is accepted but not
    used here.

    Returns the populated Scorer.
    """
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        # For German the lemmatizer is replaced by one that returns the
        # string itself.
        nlp.vocab.morphology.lemmatizer = lambda string, pos: {string}
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is not None:
                doc = nlp(raw_text)
            else:
                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(doc)
                nlp.parser(doc)
                nlp.entity(doc)
            gold = GoldParse.from_annot_tuples(doc, annot_tuples)
            scorer.score(doc, gold, verbose=verbose)
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the development data and write the predictions to `out_loc`.

    One line is written per non-space token: index, text, tag, head text
    and dependency label, separated by tabs. A blank line follows every
    sentence.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # The encoding must be passed by keyword (the third positional
    # argument of io.open is `buffering`), and `with` guarantees the file
    # is flushed and closed.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            # `merge_sents` is the helper this file imports from
            # spacy.gold; the `_merge_sents` name used before is not
            # defined anywhere.
            sents = merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (
                                    t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
    L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    """Train (unless `eval_only`), optionally dump parses, then evaluate.

    The three config dicts are snapshots of the local namespace, so they
    carry every command-line option. After evaluation the token, tag,
    attachment and entity scores are printed.
    """
    # These snapshots must stay first and in this order: their contents
    # depend on exactly which locals exist when each one is taken.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        training_set = list(read_json_file(train_loc))
        development_set = list(read_json_file(dev_loc))
        if n_sents > 0:
            training_set = training_set[:n_sents]
        train(lang, training_set, development_set, model_dir,
              tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc,
              corruption_level=corruption_level, n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    dev_gold = list(read_json_file(dev_loc))
    scorer = evaluate(lang, dev_gold, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    # Each score is read immediately before it is printed, in report order.
    for heading, attribute in report:
        print(heading, getattr(scorer, attribute))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: plac invokes main with the command-line arguments.
    plac.call(main)
|
|
|
@ -1,201 +0,0 @@
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
import random
|
|
||||||
import pathlib
|
|
||||||
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.syntax.nonproj import PseudoProjectivity
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.pipeline import DependencyParser, TokenVectorEncoder
|
|
||||||
from spacy.syntax.parser import get_templates
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
|
|
||||||
import spacy.attrs
|
|
||||||
import io
|
|
||||||
from thinc.neural.ops import CupyOps
|
|
||||||
from thinc.neural import Model
|
|
||||||
from spacy.es import Spanish
|
|
||||||
from spacy.attrs import POS
|
|
||||||
|
|
||||||
|
|
||||||
from thinc.neural import Model
|
|
||||||
|
|
||||||
|
|
||||||
# cupy is optional: when it (or thinc's CupyOps) cannot be imported, the
# name `cupy` is bound to None instead.
try:
    import cupy
    from thinc.neural.ops import CupyOps
except Exception:
    # `Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit still propagate while any import-time failure is
    # tolerated as before.
    cupy = None
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllx(loc, n=0):
    """Yield the sentences of a 10-column, tab-separated CoNLL file.

    loc: path of the file; sentences are separated by blank lines and
        leading lines starting with '#' are ignored.
    n: when >= 1, stop after `n` sentences; otherwise read everything.

    Each item is `(None, [[tuples, []]])`, where `tuples` holds six
    parallel lists: ids, words, tags, heads, deps and ner ('O' for every
    token). Ids and heads are 0-based and a head of '0' points the token
    at itself. Lines whose id contains '-' or '.' are skipped. The tag is
    'pos__dep__morph', and each tag is also registered in
    `Spanish.Defaults.tag_map`.
    """
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        # The bounds check avoids an IndexError on a block that consists
        # only of comment lines.
        while lines and lines[0].startswith('#'):
            lines.pop(0)
        # An empty file or a comment-only block leaves nothing to parse.
        if not lines or lines == ['']:
            continue
        tokens = []
        for line in lines:
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split('\t')
            if '-' in id_ or '.' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos+'__'+dep+'__'+morph
            Spanish.Defaults.tag_map[tag] = {POS: pos}
            tokens.append((id_, word, tag, head, dep, 'O'))
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    """Re-parse every document in `Xs` and score it against `ys`.

    A fresh Doc is built from each document's words, run through
    `encoder` and `parser`, deprojectivized and scored. The coarse-tag
    tallies are accumulated but are not part of the return value.

    Returns the populated Scorer.
    """
    scorer = Scorer()
    n_right = 0.
    n_seen = 0.
    for gold_doc, gold in zip(Xs, ys):
        words = [w.text for w in gold_doc]
        doc = Doc(vocab, words=words)
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
        for token, tag in zip(doc, gold.tags):
            univ_guess = ''
            if '_' in token.tag_:
                univ_guess, _ = token.tag_.split('_', 1)
            # The gold tag is unpacked unconditionally.
            univ_truth, _ = tag.split('_', 1)
            n_right += univ_guess == univ_truth
            n_seen += 1
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def organize_data(vocab, train_sents):
    """Turn annotated sentences into parallel lists of Doc and GoldParse.

    Returns `(docs, golds)`, with one entry per sentence in each list.
    """
    docs, golds = [], []
    for _, doc_sents in train_sents:
        for annotations, _ in doc_sents:
            ids, words, tags, heads, deps, ner = annotations
            sentence_doc = Doc(vocab, words=words)
            docs.append(sentence_doc)
            golds.append(
                GoldParse(sentence_doc, tags=tags, heads=heads, deps=deps))
    return docs, golds
|
|
||||||
|
|
||||||
|
|
||||||
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a token encoder and dependency parser on CoNLL data.

    lang_name: language code passed to spacy.util.get_lang_class.
    train_loc, dev_loc: files read with read_conllx.
    model_dir: output directory; 'deps', 'pos' and 'vocab'
        sub-directories are created when missing.
    clusters_loc: optional file of 'cluster word freq' lines used to set
        lexeme clusters; malformed lines are skipped.
    """
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                # The cluster string is reversed and read as base 2.
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab, width=64)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        nn_loss = [0.]

        def track_progress():
            # Score on the dev set using the optimizer's averaged params.
            with encoder.tagger.use_params(optimizer.averages):
                with parser.model.use_params(optimizer.averages):
                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)

        track_progress()
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 40
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
    nlp = LangClass(vocab=vocab, parser=parser)
    # score_model takes separate document and gold lists; passing the raw
    # read_conllx generator omitted a required argument. The dev data
    # organized above is reused instead.
    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
    # `itn` only ever existed inside track_progress; derive the same
    # counter here so the final row can be printed.
    itn = len(nn_loss)
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
    # NOTE: saving through nlp.end_training(model_dir) was already
    # disabled in this script.
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    import cProfile
    import pstats

    # Set to True to run under cProfile and print a timing report. This
    # replaces the hard-coded `if 1:` switch, and main now runs exactly
    # once: the extra trailing plac.call(main) is dropped.
    PROFILE = False
    if PROFILE:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        plac.call(main)
|
|
|
@ -1,194 +0,0 @@
|
||||||
"""Convert OntoNotes into a json format.
|
|
||||||
|
|
||||||
doc: {
|
|
||||||
id: string,
|
|
||||||
paragraphs: [{
|
|
||||||
raw: string,
|
|
||||||
sents: [int],
|
|
||||||
tokens: [{
|
|
||||||
start: int,
|
|
||||||
tag: string,
|
|
||||||
head: int,
|
|
||||||
dep: string}],
|
|
||||||
ner: [{
|
|
||||||
start: int,
|
|
||||||
end: int,
|
|
||||||
label: string}],
|
|
||||||
brackets: [{
|
|
||||||
start: int,
|
|
||||||
end: int,
|
|
||||||
label: string}]}]}
|
|
||||||
|
|
||||||
Consumes output of spacy/munge/align_raw.py
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import io
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from spacy.munge import read_ptb
|
|
||||||
from spacy.munge import read_conll
|
|
||||||
from spacy.munge import read_ner
|
|
||||||
|
|
||||||
|
|
||||||
def _iter_raw_files(raw_loc):
|
|
||||||
files = json.load(open(raw_loc))
|
|
||||||
for f in files:
|
|
||||||
yield f
|
|
||||||
|
|
||||||
|
|
||||||
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    """Assemble the document dict for one file.

    Returns None when the bracketed and dependency texts split into a
    different number of sentences. Without `raw_paras`, all sentences
    form a single paragraph whose raw text is None. Otherwise the
    sentences are dealt out to paragraphs according to how many raw
    sentences each holds, and paragraphs left with no sentences are
    dropped.
    """
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    if len(ptb_sents) != len(dep_sents):
        return None
    if ner_text is None:
        ner_sents = [None] * len(ptb_sents)
    else:
        ner_sents = read_ner.split(ner_text)

    doc = {'id': file_id}
    if raw_paras is None:
        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
        return doc

    doc['paragraphs'] = []
    start = 0
    for raw_sents in raw_paras:
        stop = start + len(raw_sents)
        raw_text = ' '.join(raw_sents).replace('<SEP>', '')
        para = format_para(
            raw_text,
            ptb_sents[start:stop],
            dep_sents[start:stop],
            ner_sents[start:stop])
        if para['sentences']:
            doc['paragraphs'].append(para)
        start = stop
    return doc
|
|
||||||
|
|
||||||
|
|
||||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build one paragraph dict: {'raw': raw_text, 'sentences': [...]}.

    The three sentence lists must have equal length. A sentence whose parsed
    dependency tokens include the tag 'VERB' is skipped. NER labels fall back
    to '-' for every token when no NER text is given, or when the NER and
    dependency token counts disagree.
    """
    para = {'raw': raw_text, 'sentences': []}
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        if deps and any(t['tag'] == 'VERB' for t in deps):
            continue
        if ner_text is not None:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        else:
            ner = ['-' for _ in deps]
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-' for _ in deps]
        para['sentences'].append(format_sentence(deps, ner, brackets))
    return para
|
|
||||||
|
|
||||||
|
|
||||||
def format_sentence(deps, ner, brackets):
    """Return {'tokens': [...], 'brackets': [...]} for one sentence.

    Tokens are formatted pairwise from deps and ner. Each (label, start, end)
    bracket with start != end becomes {'label', 'first', 'last'}, where
    'last' is end - 1; brackets with start == end are omitted.
    """
    tokens = [format_token(index, token, entity)
              for index, (token, entity) in enumerate(zip(deps, ner))]
    spans = [{'label': label, 'first': start, 'last': (end - 1)}
             for label, start, end in brackets
             if start != end]
    return {'tokens': tokens, 'brackets': spans}
|
|
||||||
|
|
||||||
|
|
||||||
def format_token(token_id, token, ner):
    """Convert one parsed token dict into the output token dict.

    'head' is stored as an offset relative to token_id; a source head of -1
    is written as 0. token['id'] must equal token_id.
    """
    assert token_id == token['id']
    if token['head'] == -1:
        head = 0
    else:
        head = token['head'] - token_id
    formatted = {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': head,
        'dep': token['dep'],
        'ner': ner}
    return formatted
|
|
||||||
|
|
||||||
|
|
||||||
def read_file(*pieces):
    """Read the UTF-8 file at path.join(*pieces) and return its stripped text.

    Returns None when the path does not exist. The file is opened in a
    with-block so the handle is closed as soon as the text has been read.
    """
    loc = path.join(*pieces)
    if not path.exists(loc):
        return None
    with io.open(loc, 'r', encoding='utf8') as file_:
        return file_.read().strip()
|
|
||||||
|
|
||||||
|
|
||||||
def get_file_names(section_dir, subsection):
    """Return the sorted, de-duplicated entry names of a subsection directory.

    Each directory entry has the text after its last '.' removed, so entries
    that differ only in their final extension collapse to one name.
    """
    listing = os.listdir(path.join(section_dir, subsection))
    stems = {entry.rsplit('.', 1)[0] for entry in listing}
    return sorted(stems)
|
|
||||||
|
|
||||||
|
|
||||||
def read_wsj_with_source(onto_dir, raw_dir):
    """Build a {filename: doc} dict for WSJ sections 00-24 with raw text.

    For every section, the entries of raw_dir/wsj<section>.json are unpacked
    as (filename, raw_paras) and paired with the .parse, .parse.dep and .name
    files found under onto_dir/data/english/annotations/nw/wsj/<section>.
    An entry is skipped when its .parse or .parse.dep file is missing; the
    .name file is optional. Entry '55' of section '04' is always skipped.
    """
    # Now do WSJ, with source alignment
    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
    docs = {}
    for i in range(25):
        section = '%02d' % i
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        for filename, raw_paras in _iter_raw_files(raw_loc):
            if section == '04' and filename == '55':
                continue
            ptb = read_file(onto_dir, section, '%s.parse' % filename)
            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
            ner = read_file(onto_dir, section, '%s.name' % filename)
            if ptb is not None and dep is not None:
                docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def get_doc(onto_dir, file_path, wsj_docs):
    """Return the formatted document for file_path, or None if unavailable.

    The last '/'-separated component of file_path is the document name. If
    that name is a key of wsj_docs, the stored document is returned as-is.
    Otherwise the .parse, .parse.dep and .name files are read relative to
    onto_dir and formatted with no raw paragraphs; None is returned when the
    .parse or .parse.dep file is missing.
    """
    # [-1] rather than [1]: an id without any '/' is its own filename
    # instead of raising IndexError.
    filename = file_path.rsplit('/', 1)[-1]
    if filename in wsj_docs:
        return wsj_docs[filename]
    ptb = read_file(onto_dir, file_path + '.parse')
    dep = read_file(onto_dir, file_path + '.parse.dep')
    ner = read_file(onto_dir, file_path + '.name')
    if ptb is not None and dep is not None:
        return format_doc(filename, None, ptb, dep, ner)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def read_ids(loc):
    """Return the lines of the file at loc as a list of strings.

    Surrounding whitespace of the whole file is stripped before splitting on
    newlines, so an empty file yields ['']. The file is closed on return.
    """
    with open(loc) as file_:
        return file_.read().strip().split('\n')
|
|
||||||
|
|
||||||
|
|
||||||
def main(onto_dir, raw_dir, out_dir):
    """Write one JSON file per genre for each of the three partitions.

    Ids are read from onto_dir/<partition>.id. The genre of a document is the
    fourth '/'-separated component of its id. Output goes to
    out_dir/<partition>/<genre>.json; the partition directory is created if
    it does not exist yet.
    """
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)

    for partition in ('train', 'test', 'development'):
        id_loc = path.join(onto_dir, '%s.id' % partition)
        docs_by_genre = defaultdict(list)
        for file_path in read_ids(id_loc):
            doc = get_doc(onto_dir, file_path, wsj_docs)
            if doc is None:
                continue
            docs_by_genre[file_path.split('/')[3]].append(doc)

        part_dir = path.join(out_dir, partition)
        if not path.exists(part_dir):
            os.mkdir(part_dir)

        for genre in sorted(docs_by_genre):
            out_loc = path.join(part_dir, genre + '.json')
            with open(out_loc, 'w') as file_:
                json.dump(docs_by_genre[genre], file_, indent=4)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,13 +0,0 @@
|
||||||
"""Read a vector file, and prepare it as binary data, for easy consumption"""
|
|
||||||
|
|
||||||
import plac
|
|
||||||
|
|
||||||
from spacy.vocab import write_binary_vectors
|
|
||||||
|
|
||||||
|
|
||||||
def main(in_loc, out_loc):
    """Command-line entry point: pass both locations to write_binary_vectors."""
    write_binary_vectors(in_loc, out_loc)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,175 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples):
    """Tag one example and feed the result to scorer.

    Tokens come from raw_text when it is given, otherwise from the word list
    at annot_tuples[1]. Only the tagger is run before scoring.
    """
    tokenizer = nlp.tokenizer
    if raw_text is not None:
        tokens = tokenizer(raw_text)
    else:
        tokens = tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold)
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_sents(sents):
|
|
||||||
m_deps = [[], [], [], [], [], []]
|
|
||||||
m_brackets = []
|
|
||||||
i = 0
|
|
||||||
for (ids, words, tags, heads, labels, ner), brackets in sents:
|
|
||||||
m_deps[0].extend(id_ + i for id_ in ids)
|
|
||||||
m_deps[1].extend(words)
|
|
||||||
m_deps[2].extend(tags)
|
|
||||||
m_deps[3].extend(head + i for head in heads)
|
|
||||||
m_deps[4].extend(labels)
|
|
||||||
m_deps[5].extend(ner)
|
|
||||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
|
||||||
i += len(ids)
|
|
||||||
return [(m_deps, m_brackets)]
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train a blank tagger for n_iter passes and finish via nlp.end_training.

    Each example is scored with the current model before the tagger is
    updated on it, and one tab-separated progress line is printed per pass.
    With gold_preproc the gold word lists are used as tokens; otherwise the
    sentences of a document are merged and its raw text is tokenized.
    gold_tuples is shuffled in place after every pass.

    feat_set, seed, corruption_level, beam_width, verbose and
    use_orig_arc_eager are accepted but not read by this function.
    """
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if not gold_preproc:
                sents = _merge_sents(sents)
            else:
                raw_text = None
            for annot_tuples, _ in sents:
                words, gold_tags = annot_tuples[1], annot_tuples[2]
                # Score first, so the figures reflect the model before
                # it is updated on this example.
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is not None:
                    tokens = nlp.tokenizer(raw_text)
                else:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        row = (itn, loss, scorer.uas, scorer.ents_f,
               scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % row)
    nlp.end_training(model_dir)
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    """Run the model stored in model_dir over gold_tuples; return the Scorer.

    With gold_preproc the gold word lists are used as tokens and the tagger,
    entity recognizer and parser are applied one after another; otherwise the
    sentences of a document are merged and its raw text is run through the
    full pipeline. A non-None beam_width is written to the parser config.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width

    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, _ in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples),
                         verbose=verbose)
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the documents read from dev_loc, write them out, return a Scorer.

    One tab-separated line (orth, tag, head orth, dep) is written to out_loc
    per token, encoded as UTF-8. The sentences of each document are merged
    before processing. A non-None beam_width is written to the parser config.

    The output file is opened in a with-block so that it is flushed and
    closed even when parsing or scoring raises.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    """Train (unless eval_only) and then evaluate the English model.

    After evaluation on dev_loc the token, tag, attachment and NER scores are
    printed one per line.
    """
    if not eval_only:
        feat_set = 'debug' if debug else 'basic'
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set=feat_set,
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    # NOTE: out_loc is accepted but currently unused; the write_parses call
    # that consumed it is disabled.
    gold_dev = list(read_json_file(dev_loc))
    scorer = evaluate(English, gold_dev,
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label, getattr(scorer, attr))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,160 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.de import German
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.scorer import PRFScore
|
|
||||||
|
|
||||||
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
|
|
||||||
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
|
|
||||||
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
|
|
||||||
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
|
|
||||||
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
|
|
||||||
|
|
||||||
|
|
||||||
def default_templates():
    """Return the templates given by spacy.tagger.Tagger.default_templates()."""
    templates = spacy.tagger.Tagger.default_templates()
    return templates
|
|
||||||
|
|
||||||
def default_templates_without_clusters():
    """Return a tuple of feature templates that uses no *_cluster atoms.

    Every template is a tuple of context-field constants imported from
    spacy.tagger.
    """
    templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
    return templates
|
|
||||||
|
|
||||||
|
|
||||||
def make_tagger(vocab, templates):
    """Return a Tagger for vocab backed by a new TaggerModel(templates)."""
    return spacy.tagger.Tagger(vocab, spacy.tagger.TaggerModel(templates))
|
|
||||||
|
|
||||||
|
|
||||||
def read_conll(file_):
    """Read tab-separated CoNLL lines into a list of (words, tags) pairs.

    Sentences are separated by blank (whitespace-only) lines. The word is
    taken from column 1 and the tag from column 4 (0-based) of each line.
    """
    sentences = []
    words, tags = [], []
    for raw_line in file_:
        line = raw_line.strip()
        if not line:
            # Blank line: close the current sentence, if one is open.
            if words:
                sentences.append((words, tags))
                words, tags = [], []
            continue
        word, tag = line.split('\t')[1::3][:2]  # get column 1 and 4 (CoNLL09)
        words.append(word)
        tags.append(tag)
    if words:
        sentences.append((words, tags))
    return sentences
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(score, nlp, words, gold_tags):
    """Tag words and record each predicted tag against its gold tag.

    One score.score_set call is made per token, with the predicted and gold
    tag each wrapped in a one-element set.
    """
    tokens = nlp.tokenizer.tokens_from_list(words)
    assert(len(tokens) == len(gold_tags))
    nlp.tagger(tokens)

    for position, token in enumerate(tokens):
        score.score_set({token.tag_}, {gold_tags[position]})
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    """Train a tagger for n_iter passes, printing train and dev accuracy.

    model_dir/pos is deleted if present and recreated empty. After each pass
    the tagger is scored on dev_sents and train_sents is shuffled in place;
    random is seeded with seed first. Training finishes through
    nlp.end_training(model_dir).
    """
    # Seed first so the per-pass shuffles are reproducible.
    random.seed(seed)

    # Start from an empty model_dir/pos directory.
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())

    print("Itn.\ttrain acc %\tdev acc %")
    for itn in range(n_iter):
        # One pass over the training set.
        n_correct, n_words = 0., 0.
        for words, gold_tags in train_sents:
            tokens = nlp.tokenizer.tokens_from_list(words)
            n_correct += nlp.tagger.train(tokens, gold_tags)
            n_words += len(words)
        train_acc = n_correct / n_words

        # Score the development set with the current weights.
        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        random.shuffle(train_sents)
        row = (itn, 100 * train_acc, 100 * dev_acc.precision)
        print('%d:\t%6.2f\t%6.2f' % row)

    print('end training')
    nlp.end_training(model_dir)
    print('done')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    """Train (unless eval_only) a German tagger, then print dev-set accuracy.

    The development file is read again after training and scored with a
    model loaded fresh from model_dir.
    """
    # training
    if not eval_only:
        with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
                io.open(dev_loc, 'r', encoding='utf8') as devfile_:
            train_sents = read_conll(trainfile_)
            dev_sents = read_conll(devfile_)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)

    # testing
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
    nlp = German(data_dir=model_dir)

    dev_acc = PRFScore()
    for words, gold_tags in dev_sents:
        score_model(dev_acc, nlp, words, gold_tags)

    accuracy_pct = 100 * dev_acc.precision
    print('POS: %6.2f %%' % (accuracy_pct))


if __name__ == '__main__':
    plac.call(main)
|
|
Loading…
Reference in New Issue
Block a user