Working NN, but very messy. Relies on BLIS.

Matthew Honnibal 2016-07-20 16:28:02 +02:00
parent 7c2f1a673b
commit de7c6c48d8
13 changed files with 683 additions and 213 deletions

View File

@@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
@@ -9,6 +10,8 @@ import io
import random
import time
import gzip
import re
import numpy
import plac
import cProfile
@@ -20,23 +23,29 @@ from spacy.gold import GoldParse
from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf
# Last updated for spaCy v0.97
def read_conll(file_):
def read_conll(file_, n=0):
"""Read a standard CoNLL/MALT-style format"""
sents = []
for sent_str in file_.read().strip().split('\n\n'):
text = file_.read().strip()
sent_strs = re.split(r'\n\s*\n', text)
for sent_id, sent_str in enumerate(sent_strs):
if not sent_str.strip():
continue
ids = []
words = []
heads = []
labels = []
tags = []
for i, line in enumerate(sent_str.split('\n')):
for i, line in enumerate(sent_str.strip().split('\n')):
word, pos_string, head_idx, label = _parse_line(line)
words.append(word)
if head_idx < 0:
@@ -45,10 +54,10 @@ def read_conll(file_):
heads.append(head_idx)
labels.append(label)
tags.append(pos_string)
text = ' '.join(words)
annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
sents.append((None, [(annot, [])]))
return sents
yield (None, [(annot, None)])
if n and sent_id >= n:
break
def _parse_line(line):
@@ -68,21 +77,33 @@ def _parse_line(line):
pos = pieces[4]
head_idx = int(pieces[6])-1
label = pieces[7]
if head_idx == 0:
if head_idx < 0:
label = 'ROOT'
return word, pos, head_idx, label
def print_words(strings, words, embeddings):
ids = {strings[word]: word for word in words}
vectors = {}
for key, values in embeddings[5]:
if key in ids:
vectors[strings[key]] = values
for word in words:
if word in vectors:
print(word, vectors[word])
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=False)
scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
gold_preproc=False, force_gold=False):
def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
learn_rate=0.001, update_step='sgd_cm',
batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
@@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples))
if feat_set != 'neural':
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples))
else:
feat_groups = [
(pf.core_words, 8),
(pf.core_tags, 4),
(pf.core_labels, 4),
(pf.core_shapes, 4),
([f[0] for f in pf.valencies], 2)
]
slots = []
vector_widths = []
feat_set = []
input_length = 0
for i, (feat_group, width) in enumerate(feat_groups):
feat_set.extend((f,) for f in feat_group)
slots += [i] * len(feat_group)
vector_widths.append(width)
input_length += width * len(feat_group)
hidden_layers = [128] * 5
rho = 1e-4
Config.write(dep_model_dir, 'config',
model='neural',
seed=seed,
labels=ArcEager.get_labels(gold_tuples),
feat_set=feat_set,
vector_widths=vector_widths,
slots=slots,
hidden_layers=hidden_layers,
update_step=update_step,
batch_norm=batch_norm,
eta=learn_rate,
mu=0.9,
ensemble_size=1,
rho=rho)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
for word in nlp.vocab:
word.norm = word.orth
words = list(nlp.vocab)
top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
norms = numpy.ndarray(shape=(10000,), dtype='float32')
for i in range(10000):
if i >= 400 and words[i].has_vector:
top5k[i] = words[i].vector
norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
else:
# Make these way off values, to make big distance.
top5k[i] = 100.0
norms[i] = 100.0
print("Setting vectors")
for word in words[10000:]:
if word.has_vector:
cosines = numpy.dot(top5k, word.vector)
cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
most_similar = words[numpy.argmax(cosines)]
word.norm = most_similar.norm
else:
word.norm = word.shape
print(nlp.parser.model.widths)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
last_score = 0.0
nr_trimmed = 0
eg_seen = 0
loss = 0
for itn in range(n_iter):
scorer = Scorer()
loss = 0
random.shuffle(gold_tuples)
for _, sents in gold_tuples:
for annot_tuples, _ in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, None, annot_tuples, verbose=False)
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=True)
if not gold.is_projective:
raise Exception(
"Non-projective sentence in training, after we should "
"have enforced projectivity: %s" % annot_tuples
)
nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
gold = GoldParse(tokens, annot_tuples)
loss += nlp.parser.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
scorer.tags_acc, scorer.token_acc))
print('end training')
eg_seen += 1
if eg_seen % 10000 == 0:
scorer = Scorer()
with io.open(dev_loc, 'r', encoding='utf8') as file_:
for _, sents in read_conll(file_):
for annot_tuples, _ in sents:
score_model(scorer, nlp, None, annot_tuples)
train_scorer = Scorer()
for _, sents in gold_tuples[:1000]:
for annot_tuples, _ in sents:
score_model(train_scorer, nlp, None, annot_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
train_scorer.uas, scorer.uas,
nlp.parser.model.mem.size))
loss = 0
if feat_set != 'basic':
nlp.parser.model.eta *= 0.99
threshold = 0.05 * (1.05 ** itn)
nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
nlp.end_training(model_dir)
print('done')
return nlp
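
The vector-backoff loop in train() above gives every word outside the first 10,000 vocabulary entries the norm of its most similar frequent neighbour, using cosine similarity over the word vectors. A minimal numpy sketch of that lookup, assuming nothing about spaCy's Vocab (the anchors/query names are illustrative only):

    import numpy

    def nearest_anchor(anchors, query):
        # anchors: (N, D) matrix of frequent-word vectors; query: (D,) rare-word vector.
        # Returns the row index of the anchor with the highest cosine similarity.
        anchor_norms = numpy.sqrt((anchors ** 2).sum(axis=1))
        query_norm = numpy.sqrt((query ** 2).sum())
        cosines = anchors.dot(query) / (anchor_norms * query_norm)
        return int(numpy.argmax(cosines))

    # Toy usage: back a rare word's norm off to its closest frequent neighbour.
    anchors = numpy.random.rand(10000, 300).astype('float32')
    rare_vector = numpy.random.rand(300).astype('float32')
    print(nearest_anchor(anchors, rare_vector))
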
@plac.annotations(
train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int),
batch_norm=("Use batch normalization and residual connections", "flag", "b"),
update_step=("Update step", "option", "u", str),
learn_rate=("Learn rate", "option", "e", float),
neural=("Use neural network?", "flag", "N")
)
def main(train_loc, dev_loc, model_dir, n_iter=15):
def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
learn_rate=0.001, update_step='sgd_cm'):
with io.open(train_loc, 'r', encoding='utf8') as file_:
train_sents = read_conll(file_)
if not eval_only:
train(English, train_sents, model_dir, n_iter=n_iter)
nlp = English(data_dir=model_dir)
dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
train_sents = list(read_conll(file_))
# preprocess training data here before ArcEager.get_labels() is called
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
feat_set='neural' if neural else 'basic',
batch_norm=batch_norm,
learn_rate=learn_rate,
update_step=update_step)
scorer = Scorer()
for _, sents in dev_sents:
for annot_tuples, _ in sents:
score_model(scorer, nlp, None, annot_tuples)
print('TOK', 100-scorer.token_acc)
with io.open(dev_loc, 'r', encoding='utf8') as file_:
for _, sents in read_conll(file_):
for annot_tuples, _ in sents:
score_model(scorer, nlp, None, annot_tuples)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
if __name__ == '__main__':
plac.call(main)

View File

@@ -23,7 +23,8 @@ from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.syntax.nonproj import PseudoProjectivity
@@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width,projectivize=pseudoprojective)
#feat_set, slots = get_templates('neural')
#vector_widths = [10, 10, 10]
#hidden_layers = [100, 100, 100]
#update_step = 'adam'
#eta = 0.001
#rho = 1e-4
#Config.write(dep_model_dir, 'config', model='neural',
# seed=seed, labels=ArcEager.get_labels(gold_tuples),
# feat_set=feat_set,
# vector_widths=vector_widths,
# slots=slots,
# hidden_layers=hidden_layers,
# update_step=update_step,
# eta=eta,
# rho=rho)
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0)
@@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
print(nlp.parser.model.widths)
for raw_text, sents in gold_tuples:
for annot_tuples, ctnt in sents:
for word in annot_tuples[1]:
_ = nlp.vocab[word]
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
@@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(lang, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
feat_set='neural' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose,pseudoprojective=pseudoprojective)
if out_loc:
write_parses(lang, dev_loc, model_dir, out_loc)
print(model_dir)
scorer = evaluate(lang, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)

View File

@@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import get_templates
from spacy.scorer import Scorer
import spacy.attrs
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax._parse_features import *
from spacy.language import Language
from spacy.tagger import W_orth
TAGGER_TEMPLATES = (
(W_orth,),
)
try:
from codecs import open
except ImportError:
pass
features = [
(S2W,),
(S1W, ),
(S1rW,),
(S0lW, ),
(S0l2W, ),
(S0W, ),
(S0r2W, ),
(S0rW, ),
(N0l2W, ),
(N0lW, ),
(N0W, ),
(N1W, ),
(N2W, )
]
slots = [0] * len(features)
features += [
(S2p,),
(S1p, ),
(S1rp,),
(S0lp,),
(S0l2p,),
(S0p, ),
(S0r2p, ),
(S0rp, ),
(N0l2p, ),
(N0lp, ),
(N0p, ),
(N1p, ),
(N2p, )
]
slots += [1] * (len(features) - len(slots))
features += [
(S2L,),
(S1L,),
(S1rL,),
(S0lL,),
(S0l2L,),
(S0L,),
(S0rL,),
(S0r2L,),
(N0l2L,),
(N0lL,),
]
slots += [2] * (len(features) - len(slots))
#
#features += [(S2p, S1p), (S1p, S0p)]
#slots += [3, 3]
#features += [(S0p, N0p)]
#slots += [4]
# (S0l2p, S0l2L, S0lp, S0l2L),
# (N0l2p, N0l2L, N0lp, N0lL),
# (S1p, S1rp, S1rL),
# (S0p, S0rp, S0rL),
#)
class TreebankParser(object):
@staticmethod
def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
hidden_layers=(300, 300),
feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
@@ -43,15 +105,16 @@ class TreebankParser(object):
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=labels)
Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)
@classmethod
def from_dir(cls, tag_map, model_dir):
vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
tokenizer = Tokenizer(vocab, {}, None, None, None)
tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
tagger = Tagger.blank(vocab, Tagger.default_templates())
cfg = Config.read(path.join(model_dir, 'deps'), 'config')
parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
@@ -64,22 +127,14 @@ class TreebankParser(object):
self.parser = parser
def train(self, words, tags, heads, deps):
tokens = self.tokenizer.tokens_from_list(list(words))
self.tagger.train(tokens, tags)
tokens = self.tokenizer.tokens_from_list(list(words))
ids = range(len(words))
ner = ['O'] * len(words)
gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
make_projective=False)
self.tagger(tokens)
if gold.is_projective:
try:
self.parser.train(tokens, gold)
except:
for id_, word, head, dep in zip(ids, words, heads, deps):
print(id_, word, head, dep)
raise
gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
self.tagger.tag_from_strings(tokens, tags)
loss = self.parser.train(tokens, gold)
PseudoProjectivity.deprojectivize(tokens)
return loss
def __call__(self, words, tags=None):
tokens = self.tokenizer.tokens_from_list(list(words))
@@ -88,6 +143,7 @@ class TreebankParser(object):
else:
self.tagger.tag_from_strings(tokens, tags)
self.parser(tokens)
PseudoProjectivity.deprojectivize(tokens)
return tokens
def end_training(self, data_dir):
@@ -101,8 +157,6 @@ class TreebankParser(object):
self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
def read_conllx(loc):
with open(loc, 'r', 'utf8') as file_:
text = file_.read()
@@ -119,8 +173,8 @@ def read_conllx(loc):
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep
tokens.append((id_, word, tag, head, dep, 'O'))
tuples = zip(*tokens)
tokens.append([id_, word, tag, head, dep, 'O'])
tuples = [list(el) for el in zip(*tokens)]
yield (None, [(tuples, [])])
@@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
return scorer
def main(train_loc, dev_loc, model_dir, tag_map_loc):
@plac.annotations(
n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
with open(tag_map_loc) as file_:
tag_map = json.loads(file_.read())
train_sents = list(read_conllx(train_loc))
labels = ArcEager.get_labels(train_sents)
templates = get_templates('basic')
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
dev_sents = list(read_conllx(dev_loc))
TreebankParser.setup_model_dir(model_dir, labels, templates)
labels = ArcEager.get_labels(train_sents)
TreebankParser.setup_model_dir(model_dir, labels,
feat_set=features, vector_widths=(10,10,10,30,30), slots=slots,
hidden_layers=(100,100,100), update_step='adam')
nlp = TreebankParser.from_dir(tag_map, model_dir)
nlp.parser.model.rho = 1e-4
print(nlp.parser.model.widths)
for itn in range(15):
for itn in range(n_iter):
loss = 0.0
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
nlp.train(words, tags, heads, deps)
loss += nlp.train(words, tags, heads, deps)
random.shuffle(train_sents)
scorer = score_model(nlp, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
scorer = score_model(nlp, dev_sents)
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
print(nlp.parser.model.mem.size)
nlp.end_training(model_dir)
scorer = score_model(nlp, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))
if __name__ == '__main__':

View File

@@ -51,6 +51,7 @@ MOD_NAMES = [
'spacy.syntax._state',
'spacy.tokenizer',
'spacy.syntax.parser',
'spacy.syntax.beam_parser',
'spacy.syntax.nonproj',
'spacy.syntax.transition_system',
'spacy.syntax.arc_eager',
@@ -73,7 +74,8 @@ MOD_NAMES = [
compile_options = {
'msvc': ['/Ox', '/EHsc'],
'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
'-I/Users/matt/blis/include/blis']
}
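
The extra '-I/Users/matt/blis/include/blis' flag hard-wires one machine's BLIS checkout into the build. A hedged sketch of reading the include path from an environment variable instead; the BLIS_DIR name and the fallback default are assumptions, not part of this commit:

    import os

    # Assumed convention: BLIS_DIR points at the root of a BLIS installation.
    blis_dir = os.environ.get('BLIS_DIR', os.path.expanduser('~/blis'))
    blis_include = '-I' + os.path.join(blis_dir, 'include', 'blis')

    compile_options = {
        'msvc': ['/Ox', '/EHsc'],
        'mingw32': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
        'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function', blis_include],
    }
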

View File

@@ -1,3 +1,4 @@
# cython: profile=True
import numpy
import io
import json
@@ -264,13 +265,3 @@ cdef class GoldParse:
def is_punct_label(label):
return label == 'P' or label.lower() == 'punct'

View File

@@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
context[11] = 0
context[12] = 0
else:
context[0] = token.lex.orth
context[1] = token.lemma
context[0] = token.lex.norm
context[1] = token.lex.norm
context[2] = token.tag
context[3] = token.lex.cluster
# We've read in the string little-endian, so now we can take & (2**n)-1
@@ -366,27 +366,26 @@ trigrams = (
words = (
S2w,
S1w,
S1rw,
S0lw,
S0l2w,
S0w,
S0r2w,
S0rw,
N0lw,
N0l2w,
N0w,
N1w,
N2w,
P1w,
P2w
S2W,
S1W,
S1rW,
S0lW,
S0l2W,
S0W,
S0r2W,
S0rW,
N0lW,
N0l2W,
N0W,
N1W,
N2W,
P1W,
P2W
)
tags = (
S2p,
S1p,
S1rp,
S0lp,
S0l2p,
S0p,
@@ -404,7 +403,6 @@ tags = (
labels = (
S2L,
S1L,
S1rL,
S0lL,
S0l2L,
S0L,
@@ -412,9 +410,88 @@ labels = (
S0rL,
N0lL,
N0l2L,
N0L,
N1L,
N2L,
P1L,
P2L
)
core_words = (
S2w,
S1w,
S0lw,
S0l2w,
S0w,
S0rw,
S0r2w,
N0lw,
N0l2w,
N0w,
N1w,
N2w,
)
core_shapes = (
S2_shape,
S1_shape,
S0l_shape,
S0l2_shape,
S0_shape,
S0r_shape,
S0r2_shape,
N0l_shape,
N0l2_shape,
N0_shape,
N1_shape,
N2_shape,
)
core_clusters = (
S2c,
S1c,
S0lc,
S0l2c,
S0c,
S0rc,
S0r2c,
N0lc,
N0l2c,
N0c,
N1c,
N2c,
)
core_tags = (
S2p,
S1p,
S0lp,
S0l2p,
S0p,
S0r2p,
S0rp,
N0lp,
N0l2p,
N0p,
N1p,
N2p,
)
core_labels = (
S2L,
S1L,
S0lL,
S0l2L,
S0L,
S0r2L,
S0rL,
N0lL,
N0l2L,
)
valencies = (
(N0lv,),
(S0lv,),
(S0rv,),
(S1lv,),
(S1rv,),
)

View File

@@ -1,6 +1,9 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
@@ -201,6 +204,21 @@ cdef cppclass StateC:
else:
return this.length - this._b_i
uint64_t hash() nogil const:
cdef TokenC[11] sig
sig[0] = this.S_(2)[0]
sig[1] = this.S_(1)[0]
sig[2] = this.R_(this.S(1), 1)[0]
sig[3] = this.L_(this.S(0), 1)[0]
sig[4] = this.L_(this.S(0), 2)[0]
sig[5] = this.S_(0)[0]
sig[6] = this.R_(this.S(0), 2)[0]
sig[7] = this.R_(this.S(0), 1)[0]
sig[8] = this.B_(0)[0]
sig[9] = this.E_(0)[0]
sig[10] = this.E_(1)[0]
return hash64(sig, sizeof(sig), this._s_i)
void push() nogil:
if this.B(0) != -1:
this._stack[this._s_i] = this.B(0)
@@ -290,6 +308,8 @@ cdef cppclass StateC:
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i

View File

@@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
else:
is_valid[i] = False
costs[i] = 9000
assert n_gold >= 1
if n_gold < 1:
for annot in gold.orig_annot:
print(annot)
print([move_costs[i] for i in range(N_MOVES)])
print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
raise Exception("No gold moves")

View File

@@ -10,7 +10,7 @@ def english_noun_chunks(doc):
for i, word in enumerate(doc):
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
yield word.left_edge.i, word.i+1, np_label
elif word.pos == NOUN and word.dep == conj:
elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head

View File

@@ -1,25 +1,37 @@
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.neural.nn cimport NeuralNet
from thinc.linear.features cimport ConjunctionExtracter
from thinc.base cimport Model
from thinc.extra.eg cimport Example
from thinc.typedefs cimport weight_t
from thinc.structs cimport FeatureC
from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from thinc.structs cimport ExampleC
from thinc.structs cimport NeuralNetC, ExampleC
from ._state cimport StateC
cdef class ParserNeuralNet(NeuralNet):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
cdef ConjunctionExtracter extracter
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
cdef class ParserPerceptron(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
cdef object _models
cdef NeuralNetC** _models_c
cdef int** _masks
cdef int _nr_model
cdef class Parser:
cdef readonly ParserNeuralNet model
cdef readonly Model model
cdef readonly TransitionSystem moves
cdef int _projectivize

View File

@@ -1,4 +1,5 @@
# cython: infer_types=True
# cython: profile=True
"""
MALT-style dependency parser
"""
@@ -18,13 +19,14 @@ import shutil
import json
import sys
from .nonproj import PseudoProjectivity
import random
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, ExampleC
from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.structs cimport FeatureC
@@ -61,8 +63,10 @@ def get_templates(name):
return pf.ner
elif name == 'debug':
return pf.unigrams
elif name.startswith('embed'):
return (pf.words, pf.tags, pf.labels)
elif name.startswith('neural'):
features = pf.words + pf.tags + pf.labels
slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
return ([(f,) for f in features], slots)
else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams)
@@ -73,72 +77,238 @@ def ParserFactory(transition_system):
cdef class ParserPerceptron(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
@property
def widths(self):
return (self.extracter.nr_templ,)
def update(self, Example eg):
'''Does regression on negative cost. Sort of cute?'''
self.time += 1
cdef weight_t loss = 0.0
best = eg.best
for clas in range(eg.c.nr_class):
if not eg.c.is_valid[clas]:
continue
if eg.c.scores[clas] < eg.c.scores[best]:
continue
loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
step = d_loss * 0.001
for feat in eg.c.features[:eg.c.nr_feat]:
self.update_weight(feat.key, clas, feat.value * step)
return int(loss)
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
state = <const StateC*>_state
fill_context(eg.atoms, state)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
cdef class ParserNeuralNet(NeuralNet):
def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50,
tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0):
#input_length = 3 * word_width + 5 * tag_width + 3 * dep_width
input_length = 12 * word_width + 7 * dep_width
widths = [input_length] + [hidden_width] * depth + [nr_class]
#vector_widths = [word_width, tag_width, dep_width]
#slots = [0] * 3 + [1] * 5 + [2] * 3
vector_widths = [word_width, dep_width]
slots = [0] * 12 + [1] * 7
NeuralNet.__init__(
self,
widths,
embed=(vector_widths, slots),
eta=eta,
rho=rho,
update_step=update_step)
def __init__(self, shape, **kwargs):
vector_widths = [4] * 57
slots = [0, 1, 2, 3] # S0
slots += [4, 5, 6, 7] # S1
slots += [8, 9, 10, 11] # S2
slots += [12, 13, 14, 15] # S3+
slots += [16, 17, 18, 19] # B0
slots += [20, 21, 22, 23] # B1
slots += [24, 25, 26, 27] # B2
slots += [28, 29, 30, 31] # B3+
slots += [32, 33, 34, 35] * 2 # S0l, S0r
slots += [36, 37, 38, 39] * 2 # B0l, B0r
slots += [40, 41, 42, 43] * 2 # S1l, S1r
slots += [44, 45, 46, 47] * 2 # S2l, S2r
slots += [48, 49, 50, 51, 52]
slots += [53, 54, 55, 56]
input_length = sum(vector_widths[slot] for slot in slots)
widths = [input_length] + shape[3:]
NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)
@property
def nr_feat(self):
#return 3+5+3
return 12+7
return 2000
cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
memset(eg.features, 0, 2000 * sizeof(FeatureC))
state = <const StateC*>_state
fill_context(eg.atoms, state)
eg.nr_feat = 12 + 7
for j in range(eg.nr_feat):
eg.features[j].value = 1.0
eg.features[j].i = j
#eg.features[0].key = eg.atoms[S0w]
#eg.features[1].key = eg.atoms[S1w]
#eg.features[2].key = eg.atoms[N0w]
feats = eg.features
eg.features[0].key = eg.atoms[S2W]
eg.features[1].key = eg.atoms[S1W]
eg.features[2].key = eg.atoms[S0lW]
eg.features[3].key = eg.atoms[S0l2W]
eg.features[4].key = eg.atoms[S0W]
eg.features[5].key = eg.atoms[S0r2W]
eg.features[6].key = eg.atoms[S0rW]
eg.features[7].key = eg.atoms[N0lW]
eg.features[8].key = eg.atoms[N0l2W]
eg.features[9].key = eg.atoms[N0W]
eg.features[10].key = eg.atoms[N1W]
eg.features[11].key = eg.atoms[N2W]
feats = _add_token(feats, 0, state.S_(0), 1.0)
feats = _add_token(feats, 4, state.S_(1), 1.0)
feats = _add_token(feats, 8, state.S_(2), 1.0)
# Rest of the stack, with exponential decay
for i in range(3, state.stack_depth()):
feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
feats = _add_token(feats, 16, state.B_(0), 1.0)
feats = _add_token(feats, 20, state.B_(1), 1.0)
feats = _add_token(feats, 24, state.B_(2), 1.0)
# Rest of the buffer, with exponential decay
for i in range(3, min(8, state.buffer_length())):
feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
feats = _add_subtree(feats, 32, state, state.S(0))
feats = _add_subtree(feats, 40, state, state.B(0))
feats = _add_subtree(feats, 48, state, state.S(1))
feats = _add_subtree(feats, 56, state, state.S(2))
feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
state.R_(state.S(0), 2))
feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
state.L_(state.S(0), 2))
eg.nr_feat = feats - eg.features
eg.features[12].key = eg.atoms[S2L]
eg.features[13].key = eg.atoms[S1L]
eg.features[14].key = eg.atoms[S0l2L]
eg.features[15].key = eg.atoms[S0lL]
eg.features[16].key = eg.atoms[S0L]
eg.features[17].key = eg.atoms[S0r2L]
eg.features[18].key = eg.atoms[S0rL]
cdef inline FeatureC* _add_token(FeatureC* feats,
int slot, const TokenC* token, weight_t value) nogil:
# Word
feats.i = slot
feats.key = token.lex.norm
feats.value = value
feats += 1
# POS tag
feats.i = slot+1
feats.key = token.tag
feats.value = value
feats += 1
# Dependency label
feats.i = slot+2
feats.key = token.dep
feats.value = value
feats += 1
# Word, label, tag
feats.i = slot+3
cdef uint64_t key[3]
key[0] = token.lex.cluster
key[1] = token.tag
key[2] = token.dep
feats.key = hash64(key, sizeof(key), 0)
feats.value = value
feats += 1
return feats
cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
value = 1.0
for i in range(state.n_R(t)):
feats = _add_token(feats, slot, state.R_(t, i+1), value)
value *= 0.5
slot += 4
value = 1.0
for i in range(state.n_L(t)):
feats = _add_token(feats, slot, state.L_(t, i+1), value)
value *= 0.5
return feats
cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
const TokenC* t1, const TokenC* t2) nogil:
cdef uint64_t[2] key
key[0] = t1.tag
key[1] = t2.tag
feat.i = slot
feat.key = hash64(key, sizeof(key), slot)
feat.value = 1.0
return feat+1
cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
cdef uint64_t[3] key
key[0] = t1.tag
key[1] = t2.tag
key[2] = t3.tag
feat.i = slot
feat.key = hash64(key, sizeof(key), slot)
feat.value = 1.0
return feat+1
cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
self._masks = <int**>self.mem.alloc(sizeof(int*), n)
self._models = []
cdef ParserNeuralNet model
threshold = 1.5 / n
self._nr_model = n
for i in range(n):
self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
for j in range(self.nr_feat):
self._masks[i][j] = random.random() < threshold
# We have to pass our pool here, because the embedding table passes
# it around.
model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
self._models_c[i] = &model.c
self._models.append(model)
property eta:
def __get__(self):
return self._models[0].eta
def __set__(self, weight_t value):
for model in self._models:
model.eta = value
def sparsify_embeddings(self, penalty):
p = 0.0
for model in self._models:
p += model.sparsify_embeddings(penalty)
return p / len(self._models)
cdef void set_scoresC(self, weight_t* scores, const void* _feats,
int nr_feat, int is_sparse) nogil:
nr_class = self.c.widths[self.c.nr_layer-1]
sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
feats = <const FeatureC*>_feats
for i in range(self._nr_model):
for j in range(nr_feat):
sub_feats[j] = feats[j]
sub_feats[j].value *= self._masks[i][j]
self.c = self._models_c[i][0]
self.c.weights = self._models_c[i].weights
self.c.gradient = self._models_c[i].gradient
ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
for j in range(nr_class):
scores[j] += sub_scores[j]
sub_scores[j] = 0.0
for j in range(nr_class):
scores[j] /= self._nr_model
free(sub_feats)
free(sub_scores)
def update(self, Example eg):
if eg.cost == 0:
return 0.0
loss = 0.0
full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
cdef ParserNeuralNet model
for i, model in enumerate(self._models):
for j in range(eg.nr_feat):
eg.c.features[j].value *= self._masks[i][j]
loss += model.update(eg)
memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
free(full_feats)
return loss
def end_training(self):
for model in self._models:
model.end_training()
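
ParserNeuralNetEnsemble above averages the class scores of several networks, each of which sees only a random subset of the features (the per-model _masks arrays, drawn with probability 1.5/n). A minimal numpy sketch of that masked-bagging idea, with a hypothetical score callable standing in for a model's forward pass:

    import numpy

    def ensemble_scores(models, masks, features):
        # models: callables mapping a feature vector to class scores.
        # masks:  one 0/1 vector per model, the same length as `features`.
        # Each model scores its own masked copy of the features; scores are averaged.
        total = None
        for score, mask in zip(models, masks):
            scores = score(features * mask)
            total = scores if total is None else total + scores
        return total / len(models)

    # Toy usage: two linear "models" over 5 features and 3 classes.
    rng = numpy.random.RandomState(0)
    weights = [rng.rand(3, 5), rng.rand(3, 5)]
    models = [lambda f, W=W: W.dot(f) for W in weights]
    masks = [(rng.rand(5) < 0.5).astype('float32') for _ in weights]
    print(ensemble_scores(models, masks, numpy.ones(5, dtype='float32')))
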
cdef class Parser:
def __init__(self, StringStore strings, transition_system, ParserNeuralNet model,
int projectivize = 0):
def __init__(self, StringStore strings, transition_system, model):
self.moves = transition_system
self.model = model
self._projectivize = projectivize
@classmethod
def from_dir(cls, model_dir, strings, transition_system):
@@ -148,16 +318,24 @@ cdef class Parser:
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels)
model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
depth=cfg.depth, word_width=cfg.word_width,
tag_width=cfg.tag_width, dep_width=cfg.dep_width,
update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho)
project = cfg.projectivize if hasattr(cfg,'projectivize') else False
if cfg.get('model') == 'neural':
shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
shape.extend(cfg.hidden_layers)
shape.append(moves.n_moves)
if cfg.get('ensemble_size') >= 2:
model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho,
n=cfg.ensemble_size)
else:
model = ParserNeuralNet(shape, update_step=cfg.update_step,
eta=cfg.eta, rho=cfg.rho)
else:
model = ParserPerceptron(get_templates(cfg.feat_set))
if path.exists(path.join(model_dir, 'model')):
model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model, project)
return cls(strings, moves, model)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
@@ -253,18 +431,18 @@ cdef class Parser:
widths=self.model.widths,
nr_atom=CONTEXT_SIZE,
nr_feat=self.model.nr_feat)
cdef weight_t loss = 0
loss = 0
cdef Transition action
while not stcls.is_final():
self.model.set_featuresC(eg.c, stcls.c)
self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
# Sets eg.c.scores, which Example uses to calculate eg.guess
self.model.updateC(eg.c)
action = self.moves.c[eg.guess]
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
assert guess >= 0
action = self.moves.c[guess]
action.do(stcls.c, action.label)
loss += eg.loss
loss += self.model.update(eg)
eg.reset()
return loss

View File

@@ -7,7 +7,7 @@ from .vocab cimport Vocab
cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil
cdef class Tagger:

View File

@@ -71,13 +71,13 @@ cpdef enum:
cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
token = <const TokenC*>_token
_fill_from_token(&eg.atoms[P2_orth], token - 2)
_fill_from_token(&eg.atoms[P1_orth], token - 1)
_fill_from_token(&eg.atoms[W_orth], token)
_fill_from_token(&eg.atoms[N1_orth], token + 1)
_fill_from_token(&eg.atoms[N2_orth], token + 2)
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
@@ -153,7 +153,7 @@ cdef class Tagger:
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
# templates = cls.default_templates()
#templates = cls.default_templates()
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
model = TaggerModel(templates)
@@ -202,12 +202,13 @@ cdef class Tagger:
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
if tokens.c[i].pos == 0:
self.model.set_featuresC(eg.c, tokens.c, i)
self.model.set_featuresC(eg.c, &tokens.c[i])
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat, 1)
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
self.vocab.morphology.assign_tag(&tokens.c[i], guess)
eg.fill_scores(0, eg.c.nr_class)
eg.reset()
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
@@ -231,18 +232,15 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
self.model.set_featuresC(eg.c, tokens.c, i)
self.model.set_featuresC(eg.c, &tokens.c[i])
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat, 1)
self.model.updateC(eg.c)
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
self.model.update(eg)
correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class)
eg.fill_costs(0, eg.c.nr_class)
eg.reset()
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
return correct