Working NN, but very messy. Relies on BLIS.

Matthew Honnibal 2016-07-20 16:28:02 +02:00
parent 7c2f1a673b
commit de7c6c48d8
13 changed files with 683 additions and 213 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import print_function
from __future__ import division from __future__ import division
from __future__ import unicode_literals from __future__ import unicode_literals
@ -9,6 +10,8 @@ import io
import random import random
import time import time
import gzip import gzip
import re
import numpy
import plac import plac
import cProfile import cProfile
@ -20,23 +23,29 @@ from spacy.gold import GoldParse
from spacy.syntax.util import Config from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer from spacy.scorer import Scorer
from spacy.tagger import Tagger from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf
# Last updated for spaCy v0.97 # Last updated for spaCy v0.97
def read_conll(file_): def read_conll(file_, n=0):
"""Read a standard CoNLL/MALT-style format""" """Read a standard CoNLL/MALT-style format"""
sents = [] text = file_.read().strip()
for sent_str in file_.read().strip().split('\n\n'): sent_strs = re.split(r'\n\s*\n', text)
for sent_id, sent_str in enumerate(sent_strs):
if not sent_str.strip():
continue
ids = [] ids = []
words = [] words = []
heads = [] heads = []
labels = [] labels = []
tags = [] tags = []
for i, line in enumerate(sent_str.split('\n')): for i, line in enumerate(sent_str.strip().split('\n')):
word, pos_string, head_idx, label = _parse_line(line) word, pos_string, head_idx, label = _parse_line(line)
words.append(word) words.append(word)
if head_idx < 0: if head_idx < 0:
@ -45,10 +54,10 @@ def read_conll(file_):
heads.append(head_idx) heads.append(head_idx)
labels.append(label) labels.append(label)
tags.append(pos_string) tags.append(pos_string)
text = ' '.join(words)
annot = (ids, words, tags, heads, labels, ['O'] * len(ids)) annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
sents.append((None, [(annot, [])])) yield (None, [(annot, None)])
return sents if n and sent_id >= n:
break
def _parse_line(line): def _parse_line(line):
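Note on the read_conll rewrite above: it now streams sentences instead of building a list, splits on blank lines with a whitespace-tolerant regex, and stops after n sentences when a limit is given. A minimal standalone sketch of that pattern (illustrative names, not the spaCy API):

import re

def iter_sent_blocks(text, n=0):
    # Split on blank lines, tolerating stray whitespace between sentences,
    # mirroring the re.split(r'\n\s*\n', ...) change above.
    for sent_id, block in enumerate(re.split(r'\n\s*\n', text.strip())):
        if not block.strip():
            continue
        yield block.strip().split('\n')
        if n and sent_id >= n:
            break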
@ -68,21 +77,33 @@ def _parse_line(line):
pos = pieces[4] pos = pieces[4]
head_idx = int(pieces[6])-1 head_idx = int(pieces[6])-1
label = pieces[7] label = pieces[7]
if head_idx == 0: if head_idx < 0:
label = 'ROOT' label = 'ROOT'
return word, pos, head_idx, label return word, pos, head_idx, label
def print_words(strings, words, embeddings):
ids = {strings[word]: word for word in words}
vectors = {}
for key, values in embeddings[5]:
if key in ids:
vectors[strings[key]] = values
for word in words:
if word in vectors:
print(word, vectors[word])
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens) nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
nlp.parser(tokens) nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=False) gold = GoldParse(tokens, annot_tuples, make_projective=False)
scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct')) scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
gold_preproc=False, force_gold=False): learn_rate=0.001, update_step='sgd_cm',
batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
dep_model_dir = path.join(model_dir, 'deps') dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos') pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir): if path.exists(dep_model_dir):
@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
os.mkdir(dep_model_dir) os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir) os.mkdir(pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, if feat_set != 'neural':
labels=ArcEager.get_labels(gold_tuples)) Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples))
else:
feat_groups = [
(pf.core_words, 8),
(pf.core_tags, 4),
(pf.core_labels, 4),
(pf.core_shapes, 4),
([f[0] for f in pf.valencies], 2)
]
slots = []
vector_widths = []
feat_set = []
input_length = 0
for i, (feat_group, width) in enumerate(feat_groups):
feat_set.extend((f,) for f in feat_group)
slots += [i] * len(feat_group)
vector_widths.append(width)
input_length += width * len(feat_group)
hidden_layers = [128] * 5
rho = 1e-4
Config.write(dep_model_dir, 'config',
model='neural',
seed=seed,
labels=ArcEager.get_labels(gold_tuples),
feat_set=feat_set,
vector_widths=vector_widths,
slots=slots,
hidden_layers=hidden_layers,
update_step=update_step,
batch_norm=batch_norm,
eta=learn_rate,
mu=0.9,
ensemble_size=1,
rho=rho)
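The 'neural' branch above turns each feature group into unary templates, assigns every feature in a group to the same embedding slot, and sets the network's input width to the sum of width * group size. A quick check of that bookkeeping; the group sizes come from the core_* tuples added to _parse_features later in this commit (12 words, 12 tags, 9 labels, 12 shapes, 5 valency features), the widths from the config written above:

feat_groups = [('words', 12, 8), ('tags', 12, 4), ('labels', 9, 4),
               ('shapes', 12, 4), ('valencies', 5, 2)]

slots, vector_widths, input_length = [], [], 0
for i, (name, n_feats, width) in enumerate(feat_groups):
    slots += [i] * n_feats            # every feature in a group shares one slot
    vector_widths.append(width)       # one embedding table (and width) per slot
    input_length += width * n_feats   # concatenated embeddings feed the first layer

print(vector_widths, input_length)    # [8, 4, 4, 4, 2], 12*8 + 12*4 + 9*4 + 12*4 + 5*2 = 238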
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
for word in nlp.vocab:
word.norm = word.orth
words = list(nlp.vocab)
top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
norms = numpy.ndarray(shape=(10000,), dtype='float32')
for i in range(10000):
if i >= 400 and words[i].has_vector:
top5k[i] = words[i].vector
norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
else:
# Make these way off values, to make big distance.
top5k[i] = 100.0
norms[i] = 100.0
print("Setting vectors")
for word in words[10000:]:
if word.has_vector:
cosines = numpy.dot(top5k, word.vector)
cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
most_similar = words[numpy.argmax(cosines)]
word.norm = most_similar.norm
else:
word.norm = word.shape
print(nlp.parser.model.widths)
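The block above keeps vectors and norms for the first 10,000 vocabulary entries (skipping the 400 most frequent and anything without a vector), then maps every rarer word's norm onto its most cosine-similar frequent neighbour, backing off to the word's shape when no vector exists, so the embedding table only ever sees a bounded set of word keys. A standalone numpy sketch of that nearest-neighbour lookup (toy data, not the spaCy vocab):

import numpy

rng = numpy.random.RandomState(0)
top_vecs = rng.normal(size=(10000, 300)).astype('float32')     # frequent-word vectors
top_norms = numpy.sqrt((top_vecs ** 2).sum(axis=1))            # their L2 norms

def nearest_frequent(vector):
    # Cosine similarity against every frequent-word vector; argmax wins.
    cosines = top_vecs.dot(vector) / (top_norms * numpy.sqrt((vector ** 2).sum()))
    return int(numpy.argmax(cosines))

rare_vector = rng.normal(size=300).astype('float32')
print(nearest_frequent(rare_vector))   # row index of the closest frequent word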
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
last_score = 0.0
nr_trimmed = 0
eg_seen = 0
loss = 0
for itn in range(n_iter): for itn in range(n_iter):
scorer = Scorer() random.shuffle(gold_tuples)
loss = 0
for _, sents in gold_tuples: for _, sents in gold_tuples:
for annot_tuples, _ in sents: for annot_tuples, _ in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, None, annot_tuples, verbose=False)
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens) nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
gold = GoldParse(tokens, annot_tuples, make_projective=True) gold = GoldParse(tokens, annot_tuples)
if not gold.is_projective:
raise Exception(
"Non-projective sentence in training, after we should "
"have enforced projectivity: %s" % annot_tuples
)
loss += nlp.parser.train(tokens, gold) loss += nlp.parser.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags) eg_seen += 1
random.shuffle(gold_tuples) if eg_seen % 10000 == 0:
print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer = Scorer()
scorer.tags_acc, scorer.token_acc)) with io.open(dev_loc, 'r', encoding='utf8') as file_:
print('end training') for _, sents in read_conll(file_):
for annot_tuples, _ in sents:
score_model(scorer, nlp, None, annot_tuples)
train_scorer = Scorer()
for _, sents in gold_tuples[:1000]:
for annot_tuples, _ in sents:
score_model(train_scorer, nlp, None, annot_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
train_scorer.uas, scorer.uas,
nlp.parser.model.mem.size))
loss = 0
if feat_set != 'basic':
nlp.parser.model.eta *= 0.99
threshold = 0.05 * (1.05 ** itn)
nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
nlp.end_training(model_dir) nlp.end_training(model_dir)
print('done') return nlp
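Two schedules run per iteration in the non-basic branch of train() above: the learning rate decays geometrically (eta *= 0.99) while the threshold passed to sparsify_embeddings grows geometrically (0.05 * 1.05**itn), so embedding pruning gets more aggressive as training settles. The shape of both curves, as a quick standalone check:

eta = 0.001
for itn in range(15):
    eta *= 0.99                          # nlp.parser.model.eta *= 0.99
    threshold = 0.05 * (1.05 ** itn)     # pruning threshold for this iteration
    print('%d\teta=%.6f\tprune<%.4f' % (itn, eta, threshold))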
@plac.annotations( @plac.annotations(
train_loc=("Location of CoNLL 09 formatted training file"), train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"), dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"), model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int), n_iter=("Number of training iterations", "option", "i", int),
batch_norm=("Use batch normalization and residual connections", "flag", "b"),
update_step=("Update step", "option", "u", str),
learn_rate=("Learn rate", "option", "e", float),
neural=("Use neural network?", "flag", "N")
) )
def main(train_loc, dev_loc, model_dir, n_iter=15): def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
learn_rate=0.001, update_step='sgd_cm'):
with io.open(train_loc, 'r', encoding='utf8') as file_: with io.open(train_loc, 'r', encoding='utf8') as file_:
train_sents = read_conll(file_) train_sents = list(read_conll(file_))
if not eval_only: # preprocess training data here before ArcEager.get_labels() is called
train(English, train_sents, model_dir, n_iter=n_iter) train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
nlp = English(data_dir=model_dir)
dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8')) nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
feat_set='neural' if neural else 'basic',
batch_norm=batch_norm,
learn_rate=learn_rate,
update_step=update_step)
scorer = Scorer() scorer = Scorer()
for _, sents in dev_sents: with io.open(dev_loc, 'r', encoding='utf8') as file_:
for annot_tuples, _ in sents: for _, sents in read_conll(file_):
score_model(scorer, nlp, None, annot_tuples) for annot_tuples, _ in sents:
print('TOK', 100-scorer.token_acc) score_model(scorer, nlp, None, annot_tuples)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc) print('POS', scorer.tags_acc)
print('UAS', scorer.uas) print('UAS', scorer.uas)
print('LAS', scorer.las) print('LAS', scorer.las)
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) plac.call(main)

View File

@ -23,7 +23,8 @@ from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger from spacy.tagger import Tagger
from spacy.syntax.parser import Parser from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.syntax.nonproj import PseudoProjectivity from spacy.syntax.nonproj import PseudoProjectivity
@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples), labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width,projectivize=pseudoprojective) beam_width=beam_width,projectivize=pseudoprojective)
#feat_set, slots = get_templates('neural')
#vector_widths = [10, 10, 10]
#hidden_layers = [100, 100, 100]
#update_step = 'adam'
#eta = 0.001
#rho = 1e-4
#Config.write(dep_model_dir, 'config', model='neural',
# seed=seed, labels=ArcEager.get_labels(gold_tuples),
# feat_set=feat_set,
# vector_widths=vector_widths,
# slots=slots,
# hidden_layers=hidden_layers,
# update_step=update_step,
# eta=eta,
# rho=rho)
Config.write(ner_model_dir, 'config', features='ner', seed=seed, Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=BiluoPushDown.get_labels(gold_tuples), labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0) beam_width=0)
@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
print(nlp.parser.model.widths)
for raw_text, sents in gold_tuples:
for annot_tuples, ctnt in sents:
for word in annot_tuples[1]:
_ = nlp.vocab[word]
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter): for itn in range(n_iter):
scorer = Scorer() scorer = Scorer()
@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
if not eval_only: if not eval_only:
gold_train = list(read_json_file(train_loc)) gold_train = list(read_json_file(train_loc))
train(lang, gold_train, model_dir, train(lang, gold_train, model_dir,
feat_set='basic' if not debug else 'debug', feat_set='neural' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents, gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter, corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose,pseudoprojective=pseudoprojective) verbose=verbose,pseudoprojective=pseudoprojective)
if out_loc: if out_loc:
write_parses(lang, dev_loc, model_dir, out_loc) write_parses(lang, dev_loc, model_dir, out_loc)
print(model_dir)
scorer = evaluate(lang, list(read_json_file(dev_loc)), scorer = evaluate(lang, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose) model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc) print('TOK', scorer.token_acc)

View File

@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import get_templates from spacy.syntax.parser import get_templates
from spacy.scorer import Scorer from spacy.scorer import Scorer
import spacy.attrs import spacy.attrs
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax._parse_features import *
from spacy.language import Language from spacy.language import Language
from spacy.tagger import W_orth
TAGGER_TEMPLATES = (
(W_orth,),
)
try: try:
from codecs import open from codecs import open
except ImportError: except ImportError:
pass pass
features = [
(S2W,),
(S1W, ),
(S1rW,),
(S0lW, ),
(S0l2W, ),
(S0W, ),
(S0r2W, ),
(S0rW, ),
(N0l2W, ),
(N0lW, ),
(N0W, ),
(N1W, ),
(N2W, )
]
slots = [0] * len(features)
features += [
(S2p,),
(S1p, ),
(S1rp,),
(S0lp,),
(S0l2p,),
(S0p, ),
(S0r2p, ),
(S0rp, ),
(N0l2p, ),
(N0lp, ),
(N0p, ),
(N1p, ),
(N2p, )
]
slots += [1] * (len(features) - len(slots))
features += [
(S2L,),
(S1L,),
(S1rL,),
(S0lL,),
(S0l2L,),
(S0L,),
(S0rL,),
(S0r2L,),
(N0l2L,),
(N0lL,),
]
slots += [2] * (len(features) - len(slots))
#
#features += [(S2p, S1p), (S1p, S0p)]
#slots += [3, 3]
#features += [(S0p, N0p)]
#slots += [4]
# (S0l2p, S0l2L, S0lp, S0l2L),
# (N0l2p, N0l2L, N0lp, N0lL),
# (S1p, S1rp, S1rL),
# (S0p, S0rp, S0rL),
#)
class TreebankParser(object): class TreebankParser(object):
@staticmethod @staticmethod
def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0): def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
hidden_layers=(300, 300),
feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
dep_model_dir = path.join(model_dir, 'deps') dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos') pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir): if path.exists(dep_model_dir):
@ -43,15 +105,16 @@ class TreebankParser(object):
os.mkdir(dep_model_dir) os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir) os.mkdir(pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
labels=labels) seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)
@classmethod @classmethod
def from_dir(cls, tag_map, model_dir): def from_dir(cls, tag_map, model_dir):
vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs()) vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0 vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
tokenizer = Tokenizer(vocab, {}, None, None, None) tokenizer = Tokenizer(vocab, {}, None, None, None)
tagger = Tagger.blank(vocab, TAGGER_TEMPLATES) tagger = Tagger.blank(vocab, Tagger.default_templates())
cfg = Config.read(path.join(model_dir, 'deps'), 'config') cfg = Config.read(path.join(model_dir, 'deps'), 'config')
parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager) parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
@ -64,22 +127,14 @@ class TreebankParser(object):
self.parser = parser self.parser = parser
def train(self, words, tags, heads, deps): def train(self, words, tags, heads, deps):
tokens = self.tokenizer.tokens_from_list(list(words))
self.tagger.train(tokens, tags)
tokens = self.tokenizer.tokens_from_list(list(words)) tokens = self.tokenizer.tokens_from_list(list(words))
ids = range(len(words)) ids = range(len(words))
ner = ['O'] * len(words) ner = ['O'] * len(words)
gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)), gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
make_projective=False) self.tagger.tag_from_strings(tokens, tags)
self.tagger(tokens) loss = self.parser.train(tokens, gold)
if gold.is_projective: PseudoProjectivity.deprojectivize(tokens)
try: return loss
self.parser.train(tokens, gold)
except:
for id_, word, head, dep in zip(ids, words, heads, deps):
print(id_, word, head, dep)
raise
def __call__(self, words, tags=None): def __call__(self, words, tags=None):
tokens = self.tokenizer.tokens_from_list(list(words)) tokens = self.tokenizer.tokens_from_list(list(words))
@ -88,6 +143,7 @@ class TreebankParser(object):
else: else:
self.tagger.tag_from_strings(tokens, tags) self.tagger.tag_from_strings(tokens, tags)
self.parser(tokens) self.parser(tokens)
PseudoProjectivity.deprojectivize(tokens)
return tokens return tokens
def end_training(self, data_dir): def end_training(self, data_dir):
@ -101,8 +157,6 @@ class TreebankParser(object):
self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin')) self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
def read_conllx(loc): def read_conllx(loc):
with open(loc, 'r', 'utf8') as file_: with open(loc, 'r', 'utf8') as file_:
text = file_.read() text = file_.read()
@ -119,8 +173,8 @@ def read_conllx(loc):
id_ = int(id_) - 1 id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_ head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep dep = 'ROOT' if dep == 'root' else dep
tokens.append((id_, word, tag, head, dep, 'O')) tokens.append([id_, word, tag, head, dep, 'O'])
tuples = zip(*tokens) tuples = [list(el) for el in zip(*tokens)]
yield (None, [(tuples, [])]) yield (None, [(tuples, [])])
@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
return scorer return scorer
def main(train_loc, dev_loc, model_dir, tag_map_loc): @plac.annotations(
n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
with open(tag_map_loc) as file_: with open(tag_map_loc) as file_:
tag_map = json.loads(file_.read()) tag_map = json.loads(file_.read())
train_sents = list(read_conllx(train_loc)) train_sents = list(read_conllx(train_loc))
labels = ArcEager.get_labels(train_sents) train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
templates = get_templates('basic') dev_sents = list(read_conllx(dev_loc))
TreebankParser.setup_model_dir(model_dir, labels, templates) labels = ArcEager.get_labels(train_sents)
TreebankParser.setup_model_dir(model_dir, labels,
feat_set=features, vector_widths=(10,10,10,30,30), slots=slots,
hidden_layers=(100,100,100), update_step='adam')
nlp = TreebankParser.from_dir(tag_map, model_dir) nlp = TreebankParser.from_dir(tag_map, model_dir)
nlp.parser.model.rho = 1e-4
print(nlp.parser.model.widths)
for itn in range(15): for itn in range(n_iter):
loss = 0.0
for _, doc_sents in train_sents: for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents: for (ids, words, tags, heads, deps, ner), _ in doc_sents:
nlp.train(words, tags, heads, deps) loss += nlp.train(words, tags, heads, deps)
random.shuffle(train_sents) random.shuffle(train_sents)
scorer = score_model(nlp, read_conllx(dev_loc)) scorer = score_model(nlp, dev_sents)
print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
print(nlp.parser.model.mem.size)
nlp.end_training(model_dir) nlp.end_training(model_dir)
scorer = score_model(nlp, read_conllx(dev_loc)) scorer = score_model(nlp, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -51,6 +51,7 @@ MOD_NAMES = [
'spacy.syntax._state', 'spacy.syntax._state',
'spacy.tokenizer', 'spacy.tokenizer',
'spacy.syntax.parser', 'spacy.syntax.parser',
'spacy.syntax.beam_parser',
'spacy.syntax.nonproj', 'spacy.syntax.nonproj',
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',
'spacy.syntax.arc_eager', 'spacy.syntax.arc_eager',
@ -73,7 +74,8 @@ MOD_NAMES = [
compile_options = { compile_options = {
'msvc': ['/Ox', '/EHsc'], 'msvc': ['/Ox', '/EHsc'],
'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'], 'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
'-I/Users/matt/blis/include/blis']
} }
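This is the BLIS dependency the commit message mentions: the extra include path is hard-coded to a local '/Users/matt/blis/include/blis' directory. A hedged, portable sketch of the same idea using an environment variable (BLIS_INCLUDE_DIR is an assumption for illustration, not an existing spaCy setting):

import os

extra_args = ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
blis_include = os.environ.get('BLIS_INCLUDE_DIR')
if blis_include:
    extra_args.append('-I' + blis_include)   # falls back cleanly when unset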

View File

@ -1,3 +1,4 @@
# cython: profile=True
import numpy import numpy
import io import io
import json import json
@ -264,13 +265,3 @@ cdef class GoldParse:
def is_punct_label(label): def is_punct_label(label):
return label == 'P' or label.lower() == 'punct' return label == 'P' or label.lower() == 'punct'

View File

@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[11] = 0 context[11] = 0
context[12] = 0 context[12] = 0
else: else:
context[0] = token.lex.orth context[0] = token.lex.norm
context[1] = token.lemma context[1] = token.lex.norm
context[2] = token.tag context[2] = token.tag
context[3] = token.lex.cluster context[3] = token.lex.cluster
# We've read in the string little-endian, so now we can take & (2**n)-1 # We've read in the string little-endian, so now we can take & (2**n)-1
@ -366,27 +366,26 @@ trigrams = (
words = ( words = (
S2w, S2W,
S1w, S1W,
S1rw, S1rW,
S0lw, S0lW,
S0l2w, S0l2W,
S0w, S0W,
S0r2w, S0r2W,
S0rw, S0rW,
N0lw, N0lW,
N0l2w, N0l2W,
N0w, N0W,
N1w, N1W,
N2w, N2W,
P1w, P1W,
P2w P2W
) )
tags = ( tags = (
S2p, S2p,
S1p, S1p,
S1rp,
S0lp, S0lp,
S0l2p, S0l2p,
S0p, S0p,
@ -404,7 +403,6 @@ tags = (
labels = ( labels = (
S2L, S2L,
S1L, S1L,
S1rL,
S0lL, S0lL,
S0l2L, S0l2L,
S0L, S0L,
@ -412,9 +410,88 @@ labels = (
S0rL, S0rL,
N0lL, N0lL,
N0l2L, N0l2L,
N0L,
N1L,
N2L,
P1L,
P2L
) )
core_words = (
S2w,
S1w,
S0lw,
S0l2w,
S0w,
S0rw,
S0r2w,
N0lw,
N0l2w,
N0w,
N1w,
N2w,
)
core_shapes = (
S2_shape,
S1_shape,
S0l_shape,
S0l2_shape,
S0_shape,
S0r_shape,
S0r2_shape,
N0l_shape,
N0l2_shape,
N0_shape,
N1_shape,
N2_shape,
)
core_clusters = (
S2c,
S1c,
S0lc,
S0l2c,
S0c,
S0rc,
S0r2c,
N0lc,
N0l2c,
N0c,
N1c,
N2c,
)
core_tags = (
S2p,
S1p,
S0lp,
S0l2p,
S0p,
S0r2p,
S0rp,
N0lp,
N0l2p,
N0p,
N1p,
N2p,
)
core_labels = (
S2L,
S1L,
S0lL,
S0l2L,
S0L,
S0r2L,
S0rL,
N0lL,
N0l2L,
)
valencies = (
(N0lv,),
(S0lv,),
(S0rv,),
(S1lv,),
(S1rv,),
)

View File

@ -1,6 +1,9 @@
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t, uint64_t
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
@ -201,6 +204,21 @@ cdef cppclass StateC:
else: else:
return this.length - this._b_i return this.length - this._b_i
uint64_t hash() nogil const:
cdef TokenC[11] sig
sig[0] = this.S_(2)[0]
sig[1] = this.S_(1)[0]
sig[2] = this.R_(this.S(1), 1)[0]
sig[3] = this.L_(this.S(0), 1)[0]
sig[4] = this.L_(this.S(0), 2)[0]
sig[5] = this.S_(0)[0]
sig[6] = this.R_(this.S(0), 2)[0]
sig[7] = this.R_(this.S(0), 1)[0]
sig[8] = this.B_(0)[0]
sig[9] = this.E_(0)[0]
sig[10] = this.E_(1)[0]
return hash64(sig, sizeof(sig), this._s_i)
void push() nogil: void push() nogil:
if this.B(0) != -1: if this.B(0) != -1:
this._stack[this._s_i] = this.B(0) this._stack[this._s_i] = this.B(0)
@ -290,6 +308,8 @@ cdef cppclass StateC:
memcpy(this._stack, src._stack, this.length * sizeof(int)) memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int)) memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity)) memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i this._b_i = src._b_i
this._s_i = src._s_i this._s_i = src._s_i
this._e_i = src._e_i this._e_i = src._e_i
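The new hash() above gives each parser state a 64-bit signature by running hash64 over the raw TokenC structs at the positions the features look at (S2, S1, the relevant children of S1 and S0, S0, B0 and the two most recent entities), salted with the stack depth, so equivalent states can be detected cheaply. A very loose Python analogue of the idea, with plain tuples standing in for the structs:

def state_key(stack, buffer_, entities):
    # Hash a fixed-size signature of the salient parts of the state, salted
    # with the stack depth, so equivalent states collide and can share work.
    sig = (tuple(stack[-3:]), tuple(buffer_[:1]), tuple(entities[-2:]), len(stack))
    return hash(sig)

print(state_key(stack=[4, 7, 9], buffer_=[10, 11], entities=[(2, 3)]))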

View File

@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
else: else:
is_valid[i] = False is_valid[i] = False
costs[i] = 9000 costs[i] = 9000
assert n_gold >= 1 if n_gold < 1:
for annot in gold.orig_annot:
print(annot)
print([move_costs[i] for i in range(N_MOVES)])
print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
raise Exception("No gold moves")

View File

@ -10,7 +10,7 @@ def english_noun_chunks(doc):
for i, word in enumerate(doc): for i, word in enumerate(doc):
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
yield word.left_edge.i, word.i+1, np_label yield word.left_edge.i, word.i+1, np_label
elif word.pos == NOUN and word.dep == conj: elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
head = word.head head = word.head
while head.dep == conj and head.head.i < head.i: while head.dep == conj and head.head.i < head.i:
head = head.head head = head.head

View File

@ -1,25 +1,37 @@
from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.neural.nn cimport NeuralNet from thinc.neural.nn cimport NeuralNet
from thinc.linear.features cimport ConjunctionExtracter
from thinc.base cimport Model from thinc.base cimport Model
from thinc.extra.eg cimport Example from thinc.extra.eg cimport Example
from thinc.typedefs cimport weight_t
from thinc.structs cimport FeatureC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem from .arc_eager cimport TransitionSystem
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..structs cimport TokenC from ..structs cimport TokenC
from thinc.structs cimport ExampleC from thinc.structs cimport NeuralNetC, ExampleC
from ._state cimport StateC from ._state cimport StateC
cdef class ParserNeuralNet(NeuralNet): cdef class ParserNeuralNet(NeuralNet):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil cdef ConjunctionExtracter extracter
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
cdef class ParserPerceptron(AveragedPerceptron): cdef class ParserPerceptron(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
cdef object _models
cdef NeuralNetC** _models_c
cdef int** _masks
cdef int _nr_model
cdef class Parser: cdef class Parser:
cdef readonly ParserNeuralNet model cdef readonly Model model
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef int _projectivize cdef int _projectivize

View File

@ -1,4 +1,5 @@
# cython: infer_types=True # cython: infer_types=True
# cython: profile=True
""" """
MALT-style dependency parser MALT-style dependency parser
""" """
@ -18,13 +19,14 @@ import shutil
import json import json
import sys import sys
from .nonproj import PseudoProjectivity from .nonproj import PseudoProjectivity
import random
from cymem.cymem cimport Pool, Address from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, ExampleC from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
from preshed.maps cimport MapStruct from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.structs cimport FeatureC from thinc.structs cimport FeatureC
@ -61,8 +63,10 @@ def get_templates(name):
return pf.ner return pf.ner
elif name == 'debug': elif name == 'debug':
return pf.unigrams return pf.unigrams
elif name.startswith('embed'): elif name.startswith('neural'):
return (pf.words, pf.tags, pf.labels) features = pf.words + pf.tags + pf.labels
slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
return ([(f,) for f in features], slots)
else: else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \ return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams) pf.tree_shape + pf.trigrams)
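get_templates('neural') above now returns every word, tag and label atom as a unary template plus a parallel slots list (0 for words, 1 for tags, 2 for labels), so each feature knows which embedding table it indexes. The same construction in plain Python, with toy tuples standing in for pf.words / pf.tags / pf.labels:

words = ('S0w', 'N0w', 'N1w')    # stand-ins for the real template tuples
tags = ('S0p', 'N0p')
labels = ('S0L',)

features = words + tags + labels
slots = [0] * len(words) + [1] * len(tags) + [2] * len(labels)
templates = [(f,) for f in features]
print(templates)   # [('S0w',), ('N0w',), ('N1w',), ('S0p',), ('N0p',), ('S0L',)]
print(slots)       # [0, 0, 0, 1, 1, 2]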
@ -73,72 +77,238 @@ def ParserFactory(transition_system):
cdef class ParserPerceptron(AveragedPerceptron): cdef class ParserPerceptron(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: @property
def widths(self):
return (self.extracter.nr_templ,)
def update(self, Example eg):
'''Does regression on negative cost. Sort of cute?'''
self.time += 1
cdef weight_t loss = 0.0
best = eg.best
for clas in range(eg.c.nr_class):
if not eg.c.is_valid[clas]:
continue
if eg.c.scores[clas] < eg.c.scores[best]:
continue
loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
step = d_loss * 0.001
for feat in eg.c.features[:eg.c.nr_feat]:
self.update_weight(feat.key, clas, feat.value * step)
return int(loss)
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
state = <const StateC*>_state
fill_context(eg.atoms, state) fill_context(eg.atoms, state)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
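The new ParserPerceptron.update above ("regression on negative cost") replaces the usual averaged-perceptron update: every valid class scoring at least as high as eg.best is regressed toward minus its cost with a squared-error gradient and a fixed 0.001 step, applied per feature as value * step. The same arithmetic on plain lists (best is passed in, playing the role of eg.best):

def regression_on_negative_cost(scores, costs, is_valid, best, lr=0.001):
    # Returns (loss, per-class step); only valid classes scoring at least as
    # high as `best` are touched, each pushed toward -cost.
    loss = 0.0
    steps = [0.0] * len(scores)
    for clas, (score, cost, valid) in enumerate(zip(scores, costs, is_valid)):
        if not valid or score < scores[best]:
            continue
        loss += (-cost - score) ** 2
        steps[clas] = 2 * (-cost - score) * lr   # applied as feat.value * step
    return loss, steps

print(regression_on_negative_cost([1.0, 2.0, 0.5], [1.0, 0.0, 2.0],
                                  [True, True, True], best=1))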
cdef class ParserNeuralNet(NeuralNet): cdef class ParserNeuralNet(NeuralNet):
def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50, def __init__(self, shape, **kwargs):
tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0): vector_widths = [4] * 57
#input_length = 3 * word_width + 5 * tag_width + 3 * dep_width slots = [0, 1, 2, 3] # S0
input_length = 12 * word_width + 7 * dep_width slots += [4, 5, 6, 7] # S1
widths = [input_length] + [hidden_width] * depth + [nr_class] slots += [8, 9, 10, 11] # S2
#vector_widths = [word_width, tag_width, dep_width] slots += [12, 13, 14, 15] # S3+
#slots = [0] * 3 + [1] * 5 + [2] * 3 slots += [16, 17, 18, 19] # B0
vector_widths = [word_width, dep_width] slots += [20, 21, 22, 23] # B1
slots = [0] * 12 + [1] * 7 slots += [24, 25, 26, 27] # B2
NeuralNet.__init__( slots += [28, 29, 30, 31] # B3+
self, slots += [32, 33, 34, 35] * 2 # S0l, S0r
widths, slots += [36, 37, 38, 39] * 2 # B0l, B0r
embed=(vector_widths, slots), slots += [40, 41, 42, 43] * 2 # S1l, S1r
eta=eta, slots += [44, 45, 46, 47] * 2 # S2l, S2r
rho=rho, slots += [48, 49, 50, 51, 52]
update_step=update_step) slots += [53, 54, 55, 56]
input_length = sum(vector_widths[slot] for slot in slots)
widths = [input_length] + shape[3:]
NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)
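In the neural model's __init__ above, every slot gets a 4-wide embedding table (vector_widths = [4] * 57), the slots list enumerates which table each input position reads from, and the first layer's width is simply sum(vector_widths[slot] for slot in slots), with the hidden layers and class count coming from shape[3:]. A check of that arithmetic:

vector_widths = [4] * 57
slots = []
slots += [0, 1, 2, 3]            # S0
slots += [4, 5, 6, 7]            # S1
slots += [8, 9, 10, 11]          # S2
slots += [12, 13, 14, 15]        # S3+
slots += [16, 17, 18, 19]        # B0
slots += [20, 21, 22, 23]        # B1
slots += [24, 25, 26, 27]        # B2
slots += [28, 29, 30, 31]        # B3+
slots += [32, 33, 34, 35] * 2    # S0l, S0r
slots += [36, 37, 38, 39] * 2    # B0l, B0r
slots += [40, 41, 42, 43] * 2    # S1l, S1r
slots += [44, 45, 46, 47] * 2    # S2l, S2r
slots += [48, 49, 50, 51, 52]    # remaining slots (uncommented in the original)
slots += [53, 54, 55, 56]
input_length = sum(vector_widths[slot] for slot in slots)
print(len(slots), input_length)  # 73 input positions, 73 * 4 = 292 inputs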
@property @property
def nr_feat(self): def nr_feat(self):
#return 3+5+3 return 2000
return 12+7
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
memset(eg.features, 0, 2000 * sizeof(FeatureC))
state = <const StateC*>_state
fill_context(eg.atoms, state) fill_context(eg.atoms, state)
eg.nr_feat = 12 + 7 feats = eg.features
for j in range(eg.nr_feat):
eg.features[j].value = 1.0
eg.features[j].i = j
#eg.features[0].key = eg.atoms[S0w]
#eg.features[1].key = eg.atoms[S1w]
#eg.features[2].key = eg.atoms[N0w]
eg.features[0].key = eg.atoms[S2W] feats = _add_token(feats, 0, state.S_(0), 1.0)
eg.features[1].key = eg.atoms[S1W] feats = _add_token(feats, 4, state.S_(1), 1.0)
eg.features[2].key = eg.atoms[S0lW] feats = _add_token(feats, 8, state.S_(2), 1.0)
eg.features[3].key = eg.atoms[S0l2W] # Rest of the stack, with exponential decay
eg.features[4].key = eg.atoms[S0W] for i in range(3, state.stack_depth()):
eg.features[5].key = eg.atoms[S0r2W] feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
eg.features[6].key = eg.atoms[S0rW] feats = _add_token(feats, 16, state.B_(0), 1.0)
eg.features[7].key = eg.atoms[N0lW] feats = _add_token(feats, 20, state.B_(1), 1.0)
eg.features[8].key = eg.atoms[N0l2W] feats = _add_token(feats, 24, state.B_(2), 1.0)
eg.features[9].key = eg.atoms[N0W] # Rest of the buffer, with exponential decay
eg.features[10].key = eg.atoms[N1W] for i in range(3, min(8, state.buffer_length())):
eg.features[11].key = eg.atoms[N2W] feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
feats = _add_subtree(feats, 32, state, state.S(0))
feats = _add_subtree(feats, 40, state, state.B(0))
feats = _add_subtree(feats, 48, state, state.S(1))
feats = _add_subtree(feats, 56, state, state.S(2))
feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
state.R_(state.S(0), 2))
feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
state.L_(state.S(0), 2))
eg.nr_feat = feats - eg.features
eg.features[12].key = eg.atoms[S2L]
eg.features[13].key = eg.atoms[S1L] cdef inline FeatureC* _add_token(FeatureC* feats,
eg.features[14].key = eg.atoms[S0l2L] int slot, const TokenC* token, weight_t value) nogil:
eg.features[15].key = eg.atoms[S0lL] # Word
eg.features[16].key = eg.atoms[S0L] feats.i = slot
eg.features[17].key = eg.atoms[S0r2L] feats.key = token.lex.norm
eg.features[18].key = eg.atoms[S0rL] feats.value = value
feats += 1
# POS tag
feats.i = slot+1
feats.key = token.tag
feats.value = value
feats += 1
# Dependency label
feats.i = slot+2
feats.key = token.dep
feats.value = value
feats += 1
# Word, label, tag
feats.i = slot+3
cdef uint64_t key[3]
key[0] = token.lex.cluster
key[1] = token.tag
key[2] = token.dep
feats.key = hash64(key, sizeof(key), 0)
feats.value = value
feats += 1
return feats
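Each _add_token call above emits four (slot, key, value) features per token: the word's norm, its tag, its dependency label, and a conjunction key made by hashing the packed (cluster, tag, dep) array with hash64. A Python analogue of that feature layout (the hash function here is a stand-in for murmurhash's hash64):

import struct

def conjunction_key(cluster, tag, dep, seed=0):
    # Stand-in for hash64(key, sizeof(key), 0) over the packed uint64 triple.
    packed = struct.pack('<QQQ', cluster, tag, dep)
    return hash((packed, seed)) & (2 ** 64 - 1)

def token_features(slot, norm, tag, dep, cluster, value=1.0):
    # Four features per token, filling slots slot..slot+3 as _add_token does.
    return [(slot,     norm,                               value),
            (slot + 1, tag,                                value),
            (slot + 2, dep,                                value),
            (slot + 3, conjunction_key(cluster, tag, dep), value)]

print(token_features(0, norm=1234, tag=17, dep=5, cluster=88))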
cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
value = 1.0
for i in range(state.n_R(t)):
feats = _add_token(feats, slot, state.R_(t, i+1), value)
value *= 0.5
slot += 4
value = 1.0
for i in range(state.n_L(t)):
feats = _add_token(feats, slot, state.L_(t, i+1), value)
value *= 0.5
return feats
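Both set_featuresC and _add_subtree above fade out distant context: stack and buffer items beyond the top three are weighted 1.0 * 0.5**(i-2), and each further left or right child in a subtree halves the value again. The decay schedule by itself:

def decayed_weights(depth, keep=3, base=0.5):
    # Items 0..keep-1 get weight 1.0; deeper items decay geometrically,
    # matching the 0.5**(i-2) factor for S3+/B3+ and the value *= 0.5
    # halving per extra child in _add_subtree.
    return [1.0 if i < keep else base ** (i - (keep - 1)) for i in range(depth)]

print(decayed_weights(7))   # [1.0, 1.0, 1.0, 0.5, 0.25, 0.125, 0.0625]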
cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
const TokenC* t1, const TokenC* t2) nogil:
cdef uint64_t[2] key
key[0] = t1.tag
key[1] = t2.tag
feat.i = slot
feat.key = hash64(key, sizeof(key), slot)
feat.value = 1.0
return feat+1
cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
cdef uint64_t[3] key
key[0] = t1.tag
key[1] = t2.tag
key[2] = t3.tag
feat.i = slot
feat.key = hash64(key, sizeof(key), slot)
feat.value = 1.0
return feat+1
cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
self._masks = <int**>self.mem.alloc(sizeof(int*), n)
self._models = []
cdef ParserNeuralNet model
threshold = 1.5 / n
self._nr_model = n
for i in range(n):
self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
for j in range(self.nr_feat):
self._masks[i][j] = random.random() < threshold
# We have to pass our pool here, because the embedding table passes
# it around.
model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
self._models_c[i] = &model.c
self._models.append(model)
property eta:
def __get__(self):
return self._models[0].eta
def __set__(self, weight_t value):
for model in self._models:
model.eta = value
def sparsify_embeddings(self, penalty):
p = 0.0
for model in self._models:
p += model.sparsify_embeddings(penalty)
return p / len(self._models)
cdef void set_scoresC(self, weight_t* scores, const void* _feats,
int nr_feat, int is_sparse) nogil:
nr_class = self.c.widths[self.c.nr_layer-1]
sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
feats = <const FeatureC*>_feats
for i in range(self._nr_model):
for j in range(nr_feat):
sub_feats[j] = feats[j]
sub_feats[j].value *= self._masks[i][j]
self.c = self._models_c[i][0]
self.c.weights = self._models_c[i].weights
self.c.gradient = self._models_c[i].gradient
ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
for j in range(nr_class):
scores[j] += sub_scores[j]
sub_scores[j] = 0.0
for j in range(nr_class):
scores[j] /= self._nr_model
free(sub_feats)
free(sub_scores)
def update(self, Example eg):
if eg.cost == 0:
return 0.0
loss = 0.0
full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
cdef ParserNeuralNet model
for i, model in enumerate(self._models):
for j in range(eg.nr_feat):
eg.c.features[j].value *= self._masks[i][j]
loss += model.update(eg)
memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
free(full_feats)
return loss
def end_training(self):
for model in self._models:
model.end_training()
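ParserNeuralNetEnsemble above is a feature-bagging ensemble: it trains n copies of the network, each with a binary feature mask drawn with probability 1.5/n per feature position, zeroes masked features both when scoring and when updating, and averages the sub-models' scores. A compact sketch of the scheme, with plain callables standing in for the Cython models:

import random

class MaskedEnsemble(object):
    def __init__(self, make_model, nr_feat, n=5, seed=0):
        rng = random.Random(seed)
        threshold = 1.5 / n
        self.models = [make_model() for _ in range(n)]
        # One binary mask per sub-model, drawn per feature position.
        self.masks = [[rng.random() < threshold for _ in range(nr_feat)]
                      for _ in range(n)]

    def score(self, features):
        # features: list of (slot, key, value); masked by position, like
        # sub_feats[j].value *= masks[i][j] above, then scores are averaged.
        totals = None
        for model, mask in zip(self.models, self.masks):
            masked = [(s, k, v if mask[j] else 0.0)
                      for j, (s, k, v) in enumerate(features)]
            scores = model(masked)
            totals = scores if totals is None else [a + b for a, b in zip(totals, scores)]
        return [s / len(self.models) for s in totals]

# Toy usage: each "model" just sums the surviving feature values into one class.
ensemble = MaskedEnsemble(lambda: (lambda feats: [sum(v for _, _, v in feats)]),
                          nr_feat=3, n=4)
print(ensemble.score([(0, 10, 1.0), (1, 11, 1.0), (2, 12, 1.0)]))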
cdef class Parser: cdef class Parser:
def __init__(self, StringStore strings, transition_system, ParserNeuralNet model, def __init__(self, StringStore strings, transition_system, model):
int projectivize = 0):
self.moves = transition_system self.moves = transition_system
self.model = model self.model = model
self._projectivize = projectivize
@classmethod @classmethod
def from_dir(cls, model_dir, strings, transition_system): def from_dir(cls, model_dir, strings, transition_system):
@ -148,16 +318,24 @@ cdef class Parser:
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
cfg = Config.read(model_dir, 'config') cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels) moves = transition_system(strings, cfg.labels)
model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
depth=cfg.depth, word_width=cfg.word_width,
tag_width=cfg.tag_width, dep_width=cfg.dep_width,
update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho)
project = cfg.projectivize if hasattr(cfg,'projectivize') else False if cfg.get('model') == 'neural':
shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
shape.extend(cfg.hidden_layers)
shape.append(moves.n_moves)
if cfg.get('ensemble_size') >= 2:
model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho,
n=cfg.ensemble_size)
else:
model = ParserNeuralNet(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho)
else:
model = ParserPerceptron(get_templates(cfg.feat_set))
if path.exists(path.join(model_dir, 'model')): if path.exists(path.join(model_dir, 'model')):
model.load(path.join(model_dir, 'model')) model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model, project) return cls(strings, moves, model)
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, vocab): def load(cls, pkg_or_str_or_file, vocab):
@ -253,18 +431,18 @@ cdef class Parser:
widths=self.model.widths, widths=self.model.widths,
nr_atom=CONTEXT_SIZE, nr_atom=CONTEXT_SIZE,
nr_feat=self.model.nr_feat) nr_feat=self.model.nr_feat)
cdef weight_t loss = 0 loss = 0
cdef Transition action cdef Transition action
while not stcls.is_final(): while not stcls.is_final():
self.model.set_featuresC(eg.c, stcls.c) self.model.set_featuresC(eg.c, stcls.c)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
# Sets eg.c.scores, which Example uses to calculate eg.guess assert guess >= 0
self.model.updateC(eg.c) action = self.moves.c[guess]
action = self.moves.c[eg.guess]
action.do(stcls.c, action.label) action.do(stcls.c, action.label)
loss += eg.loss
loss += self.model.update(eg)
eg.reset() eg.reset()
return loss return loss
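The reworked training loop above separates scoring from updating: fill the features, score them, let the transition system mark valid moves and their costs, take the arg-max over valid scores as the guess, apply that move, then call model.update(eg) and accumulate the returned loss. In outline, as Python-level pseudocode over stand-in objects rather than the Cython classes:

def train_greedy(model, moves, state, gold):
    # `model` extracts features, scores them and updates itself; `moves`
    # validates and costs transitions against the gold parse and applies them.
    loss = 0.0
    while not state.is_final():
        features = model.extract_features(state)           # set_featuresC
        scores = model.score(features)                      # set_scoresC
        is_valid, costs = moves.costs(state, gold)          # set_costs
        guess = max((i for i, ok in enumerate(is_valid) if ok),
                    key=lambda i: scores[i])                # arg_max_if_true
        moves.apply(state, guess)                           # action.do(...)
        loss += model.update(features, scores, costs, is_valid)
    return loss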

View File

@ -7,7 +7,7 @@ from .vocab cimport Vocab
cdef class TaggerModel(AveragedPerceptron): cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except * cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil
cdef class Tagger: cdef class Tagger:

View File

@ -71,13 +71,13 @@ cpdef enum:
cdef class TaggerModel(AveragedPerceptron): cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
token = <const TokenC*>_token
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) _fill_from_token(&eg.atoms[P2_orth], token - 2)
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) _fill_from_token(&eg.atoms[P1_orth], token - 1)
_fill_from_token(&eg.atoms[W_orth], &tokens[i]) _fill_from_token(&eg.atoms[W_orth], token)
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1]) _fill_from_token(&eg.atoms[N1_orth], token + 1)
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2]) _fill_from_token(&eg.atoms[N2_orth], token + 2)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
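The tagger's feature filler above now receives a pointer to the current token and reads its neighbours by pointer arithmetic (token - 2 through token + 2) instead of indexing the array with i. The same two-token context window in Python terms, with explicit padding at the sentence edges:

def context_window(tokens, i, radius=2, pad=None):
    # tokens[i-2:i+3] with padding, the Python equivalent of reading
    # token-2 .. token+2 around a pointer.
    window = []
    for offset in range(-radius, radius + 1):
        j = i + offset
        window.append(tokens[j] if 0 <= j < len(tokens) else pad)
    return window

print(context_window(['I', 'saw', 'the', 'cat'], 1))
# [None, 'I', 'saw', 'the', 'cat']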
@ -153,7 +153,7 @@ cdef class Tagger:
@classmethod @classmethod
def from_package(cls, pkg, vocab): def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
# templates = cls.default_templates() #templates = cls.default_templates()
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates()) templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
model = TaggerModel(templates) model = TaggerModel(templates)
@ -202,12 +202,13 @@ cdef class Tagger:
nr_feat=self.model.nr_feat) nr_feat=self.model.nr_feat)
for i in range(tokens.length): for i in range(tokens.length):
if tokens.c[i].pos == 0: if tokens.c[i].pos == 0:
self.model.set_featuresC(eg.c, tokens.c, i) self.model.set_featuresC(eg.c, &tokens.c[i])
self.model.set_scoresC(eg.c.scores, self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat, 1) eg.c.features, eg.c.nr_feat, 1)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
self.vocab.morphology.assign_tag(&tokens.c[i], guess) self.vocab.morphology.assign_tag(&tokens.c[i], guess)
eg.fill_scores(0, eg.c.nr_class) eg.fill_scores(0, eg.c.nr_class)
eg.reset()
tokens.is_tagged = True tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
@ -231,18 +232,15 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags, nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat) nr_feat=self.model.nr_feat)
for i in range(tokens.length): for i in range(tokens.length):
self.model.set_featuresC(eg.c, tokens.c, i) self.model.set_featuresC(eg.c, &tokens.c[i])
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
self.model.set_scoresC(eg.c.scores, self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat, 1) eg.c.features, eg.c.nr_feat, 1)
self.model.updateC(eg.c)
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess) self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
self.model.update(eg)
correct += eg.cost == 0 correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1 self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class) eg.reset()
eg.fill_costs(0, eg.c.nr_class)
tokens.is_tagged = True tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
return correct return correct