mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

Working NN, but very messy. Relies on BLIS.

This commit is contained in:
parent 7c2f1a673b
commit de7c6c48d8
@@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

@@ -9,6 +10,8 @@ import io
import random
import time
import gzip
import re
import numpy

import plac
import cProfile

@@ -20,23 +23,29 @@ from spacy.gold import GoldParse

from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.syntax import _parse_features as pf

# Last updated for spaCy v0.97


def read_conll(file_):
def read_conll(file_, n=0):
    """Read a standard CoNLL/MALT-style format"""
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
    text = file_.read().strip()
    sent_strs = re.split(r'\n\s*\n', text)
    for sent_id, sent_str in enumerate(sent_strs):
        if not sent_str.strip():
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
        for i, line in enumerate(sent_str.strip().split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:

@@ -45,10 +54,10 @@ def read_conll(file_):
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents
        yield (None, [(annot, None)])
        if n and sent_id >= n:
            break
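
For orientation, this is the nesting each yielded item carries. A hypothetical two-token example, with made-up tags and heads rather than anything from the commit:

# (raw_text, [((ids, words, tags, heads, labels, ner), brackets), ...])
doc = (None, [(([0, 1], ['Hello', 'world'], ['UH', 'NN'],
                [1, 1], ['intj', 'ROOT'], ['O', 'O']), None)])
raw_text, sents = doc
(annot_tuples, _), = sents
assert annot_tuples[1] == ['Hello', 'world']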

def _parse_line(line):

@@ -68,21 +77,33 @@ def _parse_line(line):
    pos = pieces[4]
    head_idx = int(pieces[6])-1
    label = pieces[7]
    if head_idx == 0:
    if head_idx < 0:
        label = 'ROOT'
    return word, pos, head_idx, label


def print_words(strings, words, embeddings):
    ids = {strings[word]: word for word in words}
    vectors = {}
    for key, values in embeddings[5]:
        if key in ids:
            vectors[strings[key]] = values
    for word in words:
        if word in vectors:
            print(word, vectors[word])


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
    nlp.parser(tokens)
    gold = GoldParse(tokens, annot_tuples, make_projective=False)
    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
          learn_rate=0.001, update_step='sgd_cm',
          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):

@@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))
    if feat_set != 'neural':
        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=ArcEager.get_labels(gold_tuples))
    else:
        feat_groups = [
            (pf.core_words, 8),
            (pf.core_tags, 4),
            (pf.core_labels, 4),
            (pf.core_shapes, 4),
            ([f[0] for f in pf.valencies], 2)
        ]
        slots = []
        vector_widths = []
        feat_set = []
        input_length = 0
        for i, (feat_group, width) in enumerate(feat_groups):
            feat_set.extend((f,) for f in feat_group)
            slots += [i] * len(feat_group)
            vector_widths.append(width)
            input_length += width * len(feat_group)
        hidden_layers = [128] * 5
        rho = 1e-4
        Config.write(dep_model_dir, 'config',
                     model='neural',
                     seed=seed,
                     labels=ArcEager.get_labels(gold_tuples),
                     feat_set=feat_set,
                     vector_widths=vector_widths,
                     slots=slots,
                     hidden_layers=hidden_layers,
                     update_step=update_step,
                     batch_norm=batch_norm,
                     eta=learn_rate,
                     mu=0.9,
                     ensemble_size=1,
                     rho=rho)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    for word in nlp.vocab:
        word.norm = word.orth
    words = list(nlp.vocab)
    top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
    norms = numpy.ndarray(shape=(10000,), dtype='float32')
    for i in range(10000):
        if i >= 400 and words[i].has_vector:
            top5k[i] = words[i].vector
            norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
        else:
            # Make these way off values, to make big distance.
            top5k[i] = 100.0
            norms[i] = 100.0
    print("Setting vectors")
    for word in words[10000:]:
        if word.has_vector:
            cosines = numpy.dot(top5k, word.vector)
            cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
            most_similar = words[numpy.argmax(cosines)]
            word.norm = most_similar.norm
        else:
            word.norm = word.shape
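
The loop above maps each rare word's norm to its most cosine-similar frequent word, so the embedding table only needs the frequent vocabulary. A minimal standalone sketch of that backoff, assuming a plain numpy matrix of frequent-word vectors:

import numpy

def nearest_frequent(freq_vectors, rare_vector):
    # Cosine similarity against every frequent word, as in the loop above.
    norms = numpy.sqrt((freq_vectors ** 2).sum(axis=1))
    cosines = freq_vectors.dot(rare_vector)
    cosines /= norms * numpy.sqrt((rare_vector ** 2).sum())
    return int(numpy.argmax(cosines))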

    print(nlp.parser.model.widths)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
    last_score = 0.0
    nr_trimmed = 0
    eg_seen = 0
    loss = 0
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        random.shuffle(gold_tuples)
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )

                nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
                gold = GoldParse(tokens, annot_tuples)
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                             scorer.tags_acc, scorer.token_acc))
    print('end training')
                eg_seen += 1
                if eg_seen % 10000 == 0:
                    scorer = Scorer()
                    with io.open(dev_loc, 'r', encoding='utf8') as file_:
                        for _, sents in read_conll(file_):
                            for annot_tuples, _ in sents:
                                score_model(scorer, nlp, None, annot_tuples)
                    train_scorer = Scorer()
                    for _, sents in gold_tuples[:1000]:
                        for annot_tuples, _ in sents:
                            score_model(train_scorer, nlp, None, annot_tuples)
                    print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
                                                             train_scorer.uas, scorer.uas,
                                                             nlp.parser.model.mem.size))
                    loss = 0
        if feat_set != 'basic':
            nlp.parser.model.eta *= 0.99
            threshold = 0.05 * (1.05 ** itn)
            nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
    nlp.end_training(model_dir)
    print('done')
    return nlp


@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
    update_step=("Update step", "option", "u", str),
    learn_rate=("Learn rate", "option", "e", float),
    neural=("Use neural network?", "flag", "N")
)
def main(train_loc, dev_loc, model_dir, n_iter=15):
def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
         learn_rate=0.001, update_step='sgd_cm'):
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
        train_sents = list(read_conll(file_))
    # preprocess training data here before ArcEager.get_labels() is called
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
                feat_set='neural' if neural else 'basic',
                batch_norm=batch_norm,
                learn_rate=learn_rate,
                update_step=update_step)
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        for _, sents in read_conll(file_):
            for annot_tuples, _ in sents:
                score_model(scorer, nlp, None, annot_tuples)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)


if __name__ == '__main__':
    plac.call(main)

@@ -23,7 +23,8 @@ from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.parser import Parser, get_templates
from spacy.syntax.beam_parser import BeamParser
from spacy.syntax.nonproj import PseudoProjectivity

@@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width, projectivize=pseudoprojective)
    #feat_set, slots = get_templates('neural')
    #vector_widths = [10, 10, 10]
    #hidden_layers = [100, 100, 100]
    #update_step = 'adam'
    #eta = 0.001
    #rho = 1e-4
    #Config.write(dep_model_dir, 'config', model='neural',
    #             seed=seed, labels=ArcEager.get_labels(gold_tuples),
    #             feat_set=feat_set,
    #             vector_widths=vector_widths,
    #             slots=slots,
    #             hidden_layers=hidden_layers,
    #             update_step=update_step,
    #             eta=eta,
    #             rho=rho)

    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

@@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print(nlp.parser.model.widths)
    for raw_text, sents in gold_tuples:
        for annot_tuples, ctnt in sents:
            for word in annot_tuples[1]:
                _ = nlp.vocab[word]
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()

@@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(lang, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              feat_set='neural' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose, pseudoprojective=pseudoprojective)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    print(model_dir)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)

@@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import get_templates
from spacy.scorer import Scorer
import spacy.attrs
from spacy.syntax.nonproj import PseudoProjectivity

from spacy.syntax._parse_features import *

from spacy.language import Language

from spacy.tagger import W_orth

TAGGER_TEMPLATES = (
    (W_orth,),
)

try:
    from codecs import open
except ImportError:
    pass


features = [
    (S2W,),
    (S1W,),
    (S1rW,),
    (S0lW,),
    (S0l2W,),
    (S0W,),
    (S0r2W,),
    (S0rW,),
    (N0l2W,),
    (N0lW,),
    (N0W,),
    (N1W,),
    (N2W,)
]

slots = [0] * len(features)

features += [
    (S2p,),
    (S1p,),
    (S1rp,),
    (S0lp,),
    (S0l2p,),
    (S0p,),
    (S0r2p,),
    (S0rp,),
    (N0l2p,),
    (N0lp,),
    (N0p,),
    (N1p,),
    (N2p,)
]

slots += [1] * (len(features) - len(slots))

features += [
    (S2L,),
    (S1L,),
    (S1rL,),
    (S0lL,),
    (S0l2L,),
    (S0L,),
    (S0rL,),
    (S0r2L,),
    (N0l2L,),
    (N0lL,),
]
slots += [2] * (len(features) - len(slots))
#
#features += [(S2p, S1p), (S1p, S0p)]
#slots += [3, 3]
#features += [(S0p, N0p)]
#slots += [4]
#    (S0l2p, S0l2L, S0lp, S0l2L),
#    (N0l2p, N0l2L, N0lp, N0lL),
#    (S1p, S1rp, S1rL),
#    (S0p, S0rp, S0rL),
#)
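
Each feature template is assigned to an embedding slot, and each slot draws from one vector table; the network's input width is the sum of the slot widths. A quick sketch of that bookkeeping (the assert mirrors the 13 word, 13 tag and 10 label templates above, with hypothetical 10-wide tables):

def embed_input_length(vector_widths, slots):
    # One embedded vector per slot; their concatenation feeds the first layer.
    return sum(vector_widths[slot] for slot in slots)

assert embed_input_length([10, 10, 10], [0] * 13 + [1] * 13 + [2] * 10) == 360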


class TreebankParser(object):
    @staticmethod
    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
    def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
                        hidden_layers=(300, 300),
                        feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        if path.exists(dep_model_dir):

@@ -43,15 +105,16 @@ class TreebankParser(object):
        os.mkdir(dep_model_dir)
        os.mkdir(pos_model_dir)

        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=labels)
        Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
                     seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
                     hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)

    @classmethod
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
        vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
        tagger = Tagger.blank(vocab, Tagger.default_templates())

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)

@@ -64,22 +127,14 @@ class TreebankParser(object):
        self.parser = parser

    def train(self, words, tags, heads, deps):
        tokens = self.tokenizer.tokens_from_list(list(words))
        self.tagger.train(tokens, tags)

        tokens = self.tokenizer.tokens_from_list(list(words))
        ids = range(len(words))
        ner = ['O'] * len(words)
        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
                         make_projective=False)
        self.tagger(tokens)
        if gold.is_projective:
            try:
                self.parser.train(tokens, gold)
            except:
                for id_, word, head, dep in zip(ids, words, heads, deps):
                    print(id_, word, head, dep)
                raise
        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
        self.tagger.tag_from_strings(tokens, tags)
        loss = self.parser.train(tokens, gold)
        PseudoProjectivity.deprojectivize(tokens)
        return loss

    def __call__(self, words, tags=None):
        tokens = self.tokenizer.tokens_from_list(list(words))

@@ -88,6 +143,7 @@ class TreebankParser(object):
        else:
            self.tagger.tag_from_strings(tokens, tags)
        self.parser(tokens)
        PseudoProjectivity.deprojectivize(tokens)
        return tokens

    def end_training(self, data_dir):

@@ -101,8 +157,6 @@ class TreebankParser(object):
        self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))


def read_conllx(loc):
    with open(loc, 'r', 'utf8') as file_:
        text = file_.read()

@@ -119,8 +173,8 @@ def read_conllx(loc):
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tokens.append((id_, word, tag, head, dep, 'O'))
        tuples = zip(*tokens)
            tokens.append([id_, word, tag, head, dep, 'O'])
        tuples = [list(el) for el in zip(*tokens)]
        yield (None, [(tuples, [])])

@@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
    return scorer


def main(train_loc, dev_loc, model_dir, tag_map_loc):
@plac.annotations(
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    labels = ArcEager.get_labels(train_sents)
    templates = get_templates('basic')
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
    dev_sents = list(read_conllx(dev_loc))

    TreebankParser.setup_model_dir(model_dir, labels, templates)
    labels = ArcEager.get_labels(train_sents)

    TreebankParser.setup_model_dir(model_dir, labels,
        feat_set=features, vector_widths=(10, 10, 10, 30, 30), slots=slots,
        hidden_layers=(100, 100, 100), update_step='adam')

    nlp = TreebankParser.from_dir(tag_map, model_dir)
    nlp.parser.model.rho = 1e-4
    print(nlp.parser.model.widths)

    for itn in range(15):
    for itn in range(n_iter):
        loss = 0.0
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                nlp.train(words, tags, heads, deps)
                loss += nlp.train(words, tags, heads, deps)
        random.shuffle(train_sents)
        scorer = score_model(nlp, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
        scorer = score_model(nlp, dev_sents)
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
        print(nlp.parser.model.mem.size)
    nlp.end_training(model_dir)
    scorer = score_model(nlp, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
    print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))


if __name__ == '__main__':

setup.py

@@ -51,6 +51,7 @@ MOD_NAMES = [
    'spacy.syntax._state',
    'spacy.tokenizer',
    'spacy.syntax.parser',
    'spacy.syntax.beam_parser',
    'spacy.syntax.nonproj',
    'spacy.syntax.transition_system',
    'spacy.syntax.arc_eager',

@@ -73,7 +74,8 @@ MOD_NAMES = [
compile_options = {
    'msvc': ['/Ox', '/EHsc'],
    'mingw32': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
    'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
    'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
              '-I/Users/matt/blis/include/blis']
}

@@ -1,3 +1,4 @@
# cython: profile=True
import numpy
import io
import json

@@ -264,13 +265,3 @@ cdef class GoldParse:

def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'

@@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[11] = 0
        context[12] = 0
    else:
        context[0] = token.lex.orth
        context[1] = token.lemma
        context[0] = token.lex.norm
        context[1] = token.lex.norm
        context[2] = token.tag
        context[3] = token.lex.cluster
        # We've read in the string little-endian, so now we can take & (2**n)-1

@@ -366,27 +366,26 @@ trigrams = (


words = (
    S2w,
    S1w,
    S1rw,
    S0lw,
    S0l2w,
    S0w,
    S0r2w,
    S0rw,
    N0lw,
    N0l2w,
    N0w,
    N1w,
    N2w,
    P1w,
    P2w
    S2W,
    S1W,
    S1rW,
    S0lW,
    S0l2W,
    S0W,
    S0r2W,
    S0rW,
    N0lW,
    N0l2W,
    N0W,
    N1W,
    N2W,
    P1W,
    P2W
)

tags = (
    S2p,
    S1p,
    S1rp,
    S0lp,
    S0l2p,
    S0p,

@@ -404,7 +403,6 @@ tags = (
labels = (
    S2L,
    S1L,
    S1rL,
    S0lL,
    S0l2L,
    S0L,

@@ -412,9 +410,88 @@ labels = (
    S0rL,
    N0lL,
    N0l2L,
    N0L,
    N1L,
    N2L,
    P1L,
    P2L
)

core_words = (
    S2w,
    S1w,
    S0lw,
    S0l2w,
    S0w,
    S0rw,
    S0r2w,
    N0lw,
    N0l2w,
    N0w,
    N1w,
    N2w,
)


core_shapes = (
    S2_shape,
    S1_shape,
    S0l_shape,
    S0l2_shape,
    S0_shape,
    S0r_shape,
    S0r2_shape,
    N0l_shape,
    N0l2_shape,
    N0_shape,
    N1_shape,
    N2_shape,
)


core_clusters = (
    S2c,
    S1c,
    S0lc,
    S0l2c,
    S0c,
    S0rc,
    S0r2c,
    N0lc,
    N0l2c,
    N0c,
    N1c,
    N2c,
)


core_tags = (
    S2p,
    S1p,
    S0lp,
    S0l2p,
    S0p,
    S0r2p,
    S0rp,
    N0lp,
    N0l2p,
    N0p,
    N1p,
    N2p,
)

core_labels = (
    S2L,
    S1L,
    S0lL,
    S0l2L,
    S0L,
    S0r2L,
    S0rL,
    N0lL,
    N0l2L,
)

valencies = (
    (N0lv,),
    (S0lv,),
    (S0rv,),
    (S1lv,),
    (S1rv,),
)

@@ -1,6 +1,9 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t

from murmurhash.mrmr cimport hash64

from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme

@@ -201,6 +204,21 @@ cdef cppclass StateC:
        else:
            return this.length - this._b_i

    uint64_t hash() nogil const:
        cdef TokenC[11] sig
        sig[0] = this.S_(2)[0]
        sig[1] = this.S_(1)[0]
        sig[2] = this.R_(this.S(1), 1)[0]
        sig[3] = this.L_(this.S(0), 1)[0]
        sig[4] = this.L_(this.S(0), 2)[0]
        sig[5] = this.S_(0)[0]
        sig[6] = this.R_(this.S(0), 2)[0]
        sig[7] = this.R_(this.S(0), 1)[0]
        sig[8] = this.B_(0)[0]
        sig[9] = this.E_(0)[0]
        sig[10] = this.E_(1)[0]
        return hash64(sig, sizeof(sig), this._s_i)

    void push() nogil:
        if this.B(0) != -1:
            this._stack[this._s_i] = this.B(0)

@@ -290,6 +308,8 @@ cdef cppclass StateC:
        memcpy(this._stack, src._stack, this.length * sizeof(int))
        memcpy(this._buffer, src._buffer, this.length * sizeof(int))
        memcpy(this._ents, src._ents, this.length * sizeof(Entity))
        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
        this.length = src.length
        this._b_i = src._b_i
        this._s_i = src._s_i
        this._e_i = src._e_i

@@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
            else:
                is_valid[i] = False
                costs[i] = 9000
        assert n_gold >= 1
        if n_gold < 1:
            for annot in gold.orig_annot:
                print(annot)
            print([move_costs[i] for i in range(N_MOVES)])
            print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
            print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
            print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
            raise Exception("No gold moves")

@@ -10,7 +10,7 @@ def english_noun_chunks(doc):
    for i, word in enumerate(doc):
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            yield word.left_edge.i, word.i+1, np_label
        elif word.pos == NOUN and word.dep == conj:
        elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head

@@ -1,25 +1,37 @@
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.neural.nn cimport NeuralNet
from thinc.linear.features cimport ConjunctionExtracter
from thinc.base cimport Model
from thinc.extra.eg cimport Example
from thinc.typedefs cimport weight_t
from thinc.structs cimport FeatureC

from .stateclass cimport StateClass
from .arc_eager cimport TransitionSystem
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from thinc.structs cimport ExampleC
from thinc.structs cimport NeuralNetC, ExampleC
from ._state cimport StateC


cdef class ParserNeuralNet(NeuralNet):
    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
    cdef ConjunctionExtracter extracter
    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil


cdef class ParserPerceptron(AveragedPerceptron):
    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil


cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
    cdef object _models
    cdef NeuralNetC** _models_c
    cdef int** _masks
    cdef int _nr_model


cdef class Parser:
    cdef readonly ParserNeuralNet model
    cdef readonly Model model
    cdef readonly TransitionSystem moves
    cdef int _projectivize

@@ -1,4 +1,5 @@
# cython: infer_types=True
# cython: profile=True
"""
MALT-style dependency parser
"""

@@ -18,13 +19,14 @@ import shutil
import json
import sys
from .nonproj import PseudoProjectivity
import random

from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, ExampleC
from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.structs cimport FeatureC

@@ -61,8 +63,10 @@ def get_templates(name):
        return pf.ner
    elif name == 'debug':
        return pf.unigrams
    elif name.startswith('embed'):
        return (pf.words, pf.tags, pf.labels)
    elif name.startswith('neural'):
        features = pf.words + pf.tags + pf.labels
        slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
        return ([(f,) for f in features], slots)
    else:
        return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                pf.tree_shape + pf.trigrams)

@@ -73,72 +77,238 @@ def ParserFactory(transition_system):


cdef class ParserPerceptron(AveragedPerceptron):
    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
    @property
    def widths(self):
        return (self.extracter.nr_templ,)

    def update(self, Example eg):
        '''Does regression on negative cost. Sort of cute?'''
        self.time += 1
        cdef weight_t loss = 0.0
        best = eg.best
        for clas in range(eg.c.nr_class):
            if not eg.c.is_valid[clas]:
                continue
            if eg.c.scores[clas] < eg.c.scores[best]:
                continue
            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
            step = d_loss * 0.001
            for feat in eg.c.features[:eg.c.nr_feat]:
                self.update_weight(feat.key, clas, feat.value * step)
        return int(loss)

    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
        state = <const StateC*>_state
        fill_context(eg.atoms, state)
        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
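
The update above regresses each competitive action's score toward minus its cost, rather than taking a classic perceptron step. A plain-Python sketch of the same rule, with a dict standing in for the sparse weights table (all names here are illustrative):

def negative_cost_update(scores, costs, is_valid, feats, weights, lr=0.001):
    # weights: e.g. collections.defaultdict(float), keyed by (feature_key, action)
    best = max((c for c in range(len(scores)) if is_valid[c]),
               key=scores.__getitem__)
    loss = 0.0
    for clas in range(len(scores)):
        if not is_valid[clas] or scores[clas] < scores[best]:
            continue
        residual = -costs[clas] - scores[clas]  # target score is -cost
        loss += residual ** 2
        for key, value in feats:  # feats: [(feature_key, value), ...]
            weights[(key, clas)] += value * 2 * residual * lr
    return loss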


cdef class ParserNeuralNet(NeuralNet):
    def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50,
                 tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0):
        #input_length = 3 * word_width + 5 * tag_width + 3 * dep_width
        input_length = 12 * word_width + 7 * dep_width
        widths = [input_length] + [hidden_width] * depth + [nr_class]
        #vector_widths = [word_width, tag_width, dep_width]
        #slots = [0] * 3 + [1] * 5 + [2] * 3
        vector_widths = [word_width, dep_width]
        slots = [0] * 12 + [1] * 7
        NeuralNet.__init__(
            self,
            widths,
            embed=(vector_widths, slots),
            eta=eta,
            rho=rho,
            update_step=update_step)
    def __init__(self, shape, **kwargs):
        vector_widths = [4] * 57
        slots = [0, 1, 2, 3]            # S0
        slots += [4, 5, 6, 7]           # S1
        slots += [8, 9, 10, 11]         # S2
        slots += [12, 13, 14, 15]       # S3+
        slots += [16, 17, 18, 19]       # B0
        slots += [20, 21, 22, 23]       # B1
        slots += [24, 25, 26, 27]       # B2
        slots += [28, 29, 30, 31]       # B3+
        slots += [32, 33, 34, 35] * 2   # S0l, S0r
        slots += [36, 37, 38, 39] * 2   # B0l, B0r
        slots += [40, 41, 42, 43] * 2   # S1l, S1r
        slots += [44, 45, 46, 47] * 2   # S2l, S2r
        slots += [48, 49, 50, 51, 52]
        slots += [53, 54, 55, 56]
        input_length = sum(vector_widths[slot] for slot in slots)
        widths = [input_length] + shape[3:]

        NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)

    @property
    def nr_feat(self):
        #return 3+5+3
        return 12+7
        return 2000

    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
        memset(eg.features, 0, 2000 * sizeof(FeatureC))
        state = <const StateC*>_state
        fill_context(eg.atoms, state)
        eg.nr_feat = 12 + 7
        for j in range(eg.nr_feat):
            eg.features[j].value = 1.0
            eg.features[j].i = j
        #eg.features[0].key = eg.atoms[S0w]
        #eg.features[1].key = eg.atoms[S1w]
        #eg.features[2].key = eg.atoms[N0w]
        feats = eg.features

        eg.features[0].key = eg.atoms[S2W]
        eg.features[1].key = eg.atoms[S1W]
        eg.features[2].key = eg.atoms[S0lW]
        eg.features[3].key = eg.atoms[S0l2W]
        eg.features[4].key = eg.atoms[S0W]
        eg.features[5].key = eg.atoms[S0r2W]
        eg.features[6].key = eg.atoms[S0rW]
        eg.features[7].key = eg.atoms[N0lW]
        eg.features[8].key = eg.atoms[N0l2W]
        eg.features[9].key = eg.atoms[N0W]
        eg.features[10].key = eg.atoms[N1W]
        eg.features[11].key = eg.atoms[N2W]
        feats = _add_token(feats, 0, state.S_(0), 1.0)
        feats = _add_token(feats, 4, state.S_(1), 1.0)
        feats = _add_token(feats, 8, state.S_(2), 1.0)
        # Rest of the stack, with exponential decay
        for i in range(3, state.stack_depth()):
            feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
        feats = _add_token(feats, 16, state.B_(0), 1.0)
        feats = _add_token(feats, 20, state.B_(1), 1.0)
        feats = _add_token(feats, 24, state.B_(2), 1.0)
        # Rest of the buffer, with exponential decay
        for i in range(3, min(8, state.buffer_length())):
            feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
        feats = _add_subtree(feats, 32, state, state.S(0))
        feats = _add_subtree(feats, 40, state, state.B(0))
        feats = _add_subtree(feats, 48, state, state.S(1))
        feats = _add_subtree(feats, 56, state, state.S(2))
        feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
        feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
        feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
        feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
        feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
        feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
        feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
        feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
                                 state.R_(state.S(0), 2))
        feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
                                 state.L_(state.S(0), 2))
        eg.nr_feat = feats - eg.features

        eg.features[12].key = eg.atoms[S2L]
        eg.features[13].key = eg.atoms[S1L]
        eg.features[14].key = eg.atoms[S0l2L]
        eg.features[15].key = eg.atoms[S0lL]
        eg.features[16].key = eg.atoms[S0L]
        eg.features[17].key = eg.atoms[S0r2L]
        eg.features[18].key = eg.atoms[S0rL]
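
Beyond the third stack or buffer item, tokens share a single slot and their feature values are halved per extra step, so depth is summarized cheaply. The decay schedule in isolation:

def decayed_weights(n_items, start=3):
    # Item 3 gets 0.5, item 4 gets 0.25, ... mirroring 1.0 * 0.5**(i-2) above.
    return [0.5 ** (i - 2) for i in range(start, n_items)]

assert decayed_weights(6) == [0.5, 0.25, 0.125]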

cdef inline FeatureC* _add_token(FeatureC* feats,
        int slot, const TokenC* token, weight_t value) nogil:
    # Word
    feats.i = slot
    feats.key = token.lex.norm
    feats.value = value
    feats += 1
    # POS tag
    feats.i = slot+1
    feats.key = token.tag
    feats.value = value
    feats += 1
    # Dependency label
    feats.i = slot+2
    feats.key = token.dep
    feats.value = value
    feats += 1
    # Word, label, tag
    feats.i = slot+3
    cdef uint64_t key[3]
    key[0] = token.lex.cluster
    key[1] = token.tag
    key[2] = token.dep
    feats.key = hash64(key, sizeof(key), 0)
    feats.value = value
    feats += 1
    return feats


cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
    value = 1.0
    for i in range(state.n_R(t)):
        feats = _add_token(feats, slot, state.R_(t, i+1), value)
        value *= 0.5
    slot += 4
    value = 1.0
    for i in range(state.n_L(t)):
        feats = _add_token(feats, slot, state.L_(t, i+1), value)
        value *= 0.5
    return feats


cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
        const TokenC* t1, const TokenC* t2) nogil:
    cdef uint64_t[2] key
    key[0] = t1.tag
    key[1] = t2.tag
    feat.i = slot
    feat.key = hash64(key, sizeof(key), slot)
    feat.value = 1.0
    return feat+1


cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
        const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
    cdef uint64_t[3] key
    key[0] = t1.tag
    key[1] = t2.tag
    key[2] = t3.tag
    feat.i = slot
    feat.key = hash64(key, sizeof(key), slot)
    feat.value = 1.0
    return feat+1
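
The bigram and trigram helpers pack the POS tags into a fixed-width buffer and hash it, seeding with the slot id so identical tag sequences in different slots get distinct keys. An equivalent in Python, assuming the mmh3 MurmurHash binding in place of murmurhash.mrmr.hash64:

import struct
import mmh3  # assumed binding; hash64 returns a pair of signed 64-bit ints

def pos_ngram_key(tags, slot):
    buf = struct.pack('<%dQ' % len(tags), *tags)
    return mmh3.hash64(buf, seed=slot)[0]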

cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
    def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
        ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
        self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
        self._masks = <int**>self.mem.alloc(sizeof(int*), n)
        self._models = []
        cdef ParserNeuralNet model
        threshold = 1.5 / n
        self._nr_model = n
        for i in range(n):
            self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
            for j in range(self.nr_feat):
                self._masks[i][j] = random.random() < threshold
            # We have to pass our pool here, because the embedding table passes
            # it around.
            model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
            self._models_c[i] = &model.c
            self._models.append(model)

    property eta:
        def __get__(self):
            return self._models[0].eta

        def __set__(self, weight_t value):
            for model in self._models:
                model.eta = value

    def sparsify_embeddings(self, penalty):
        p = 0.0
        for model in self._models:
            p += model.sparsify_embeddings(penalty)
        return p / len(self._models)

    cdef void set_scoresC(self, weight_t* scores, const void* _feats,
            int nr_feat, int is_sparse) nogil:
        nr_class = self.c.widths[self.c.nr_layer-1]
        sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
        sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
        feats = <const FeatureC*>_feats
        for i in range(self._nr_model):
            for j in range(nr_feat):
                sub_feats[j] = feats[j]
                sub_feats[j].value *= self._masks[i][j]
            self.c = self._models_c[i][0]
            self.c.weights = self._models_c[i].weights
            self.c.gradient = self._models_c[i].gradient
            ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
            for j in range(nr_class):
                scores[j] += sub_scores[j]
                sub_scores[j] = 0.0
        for j in range(nr_class):
            scores[j] /= self._nr_model
        free(sub_feats)
        free(sub_scores)

    def update(self, Example eg):
        if eg.cost == 0:
            return 0.0
        loss = 0.0
        full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
        memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
        cdef ParserNeuralNet model
        for i, model in enumerate(self._models):
            for j in range(eg.nr_feat):
                eg.c.features[j].value *= self._masks[i][j]
            loss += model.update(eg)
            memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
        free(full_feats)
        return loss

    def end_training(self):
        for model in self._models:
            model.end_training()
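
Each sub-model in the ensemble sees only a random subset of the features, fixed once at construction: with n models and threshold 1.5/n, each feature reaches roughly 1.5 models on average, a bagging-over-features scheme. A sketch of just the mask generation:

import random

def make_feature_masks(n_models, nr_feat):
    threshold = 1.5 / n_models  # expected coverage: ~1.5 models per feature
    return [[random.random() < threshold for _ in range(nr_feat)]
            for _ in range(n_models)]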

cdef class Parser:
    def __init__(self, StringStore strings, transition_system, ParserNeuralNet model,
                 int projectivize = 0):
    def __init__(self, StringStore strings, transition_system, model):
        self.moves = transition_system
        self.model = model
        self._projectivize = projectivize

    @classmethod
    def from_dir(cls, model_dir, strings, transition_system):

@@ -148,16 +318,24 @@ cdef class Parser:
            print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
        cfg = Config.read(model_dir, 'config')
        moves = transition_system(strings, cfg.labels)
        model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
                                depth=cfg.depth, word_width=cfg.word_width,
                                tag_width=cfg.tag_width, dep_width=cfg.dep_width,
                                update_step=cfg.update_step,
                                eta=cfg.eta, rho=cfg.rho)

        project = cfg.projectivize if hasattr(cfg, 'projectivize') else False
        if cfg.get('model') == 'neural':
            shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
            shape.extend(cfg.hidden_layers)
            shape.append(moves.n_moves)
            if cfg.get('ensemble_size') >= 2:
                model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
                                                eta=cfg.eta, rho=cfg.rho,
                                                n=cfg.ensemble_size)
            else:
                model = ParserNeuralNet(shape, update_step=cfg.update_step,
                                        eta=cfg.eta, rho=cfg.rho)
        else:
            model = ParserPerceptron(get_templates(cfg.feat_set))

        if path.exists(path.join(model_dir, 'model')):
            model.load(path.join(model_dir, 'model'))
        return cls(strings, moves, model, project)
        return cls(strings, moves, model)

    @classmethod
    def load(cls, pkg_or_str_or_file, vocab):

@@ -253,18 +431,18 @@ cdef class Parser:
            widths=self.model.widths,
            nr_atom=CONTEXT_SIZE,
            nr_feat=self.model.nr_feat)
        cdef weight_t loss = 0
        loss = 0
        cdef Transition action
        while not stcls.is_final():
            self.model.set_featuresC(eg.c, stcls.c)
            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)

            # Sets eg.c.scores, which Example uses to calculate eg.guess
            self.model.updateC(eg.c)

            action = self.moves.c[eg.guess]
            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
            assert guess >= 0
            action = self.moves.c[guess]
            action.do(stcls.c, action.label)
            loss += eg.loss

            loss += self.model.update(eg)
            eg.reset()
        return loss
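
The training loop is the standard greedy transition-based recipe: featurize the state, score, attach costs for the valid moves, update, then advance with the best-scoring valid move. Schematically, with hypothetical helper names:

def train_one(model, moves, state, gold):
    loss = 0.0
    while not state.is_final():
        feats = model.featurize(state)            # hypothetical API
        scores = model.score(feats)
        valid, costs = moves.get_costs(state, gold)
        loss += model.update(feats, scores, valid, costs)
        guess = max((c for c in range(len(scores)) if valid[c]),
                    key=scores.__getitem__)
        moves.apply(state, guess)                 # follow own prediction
    return loss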

@@ -7,7 +7,7 @@ from .vocab cimport Vocab


cdef class TaggerModel(AveragedPerceptron):
    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil


cdef class Tagger:

@@ -71,13 +71,13 @@ cpdef enum:


cdef class TaggerModel(AveragedPerceptron):
    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:

        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
        token = <const TokenC*>_token
        _fill_from_token(&eg.atoms[P2_orth], token - 2)
        _fill_from_token(&eg.atoms[P1_orth], token - 1)
        _fill_from_token(&eg.atoms[W_orth], token)
        _fill_from_token(&eg.atoms[N1_orth], token + 1)
        _fill_from_token(&eg.atoms[N2_orth], token + 2)

        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
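
The tagger's context is a five-token window around the word being tagged; the C buffer is padded so `token - 2` and `token + 2` stay in bounds. The same window in plain Python, padding explicitly:

def window(tokens, i, pad='<PAD>'):
    get = lambda j: tokens[j] if 0 <= j < len(tokens) else pad
    return [get(i - 2), get(i - 1), get(i), get(i + 1), get(i + 2)]

assert window(['a', 'b', 'c'], 0) == ['<PAD>', '<PAD>', 'a', 'b', 'c']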

@@ -153,7 +153,7 @@ cdef class Tagger:
    @classmethod
    def from_package(cls, pkg, vocab):
        # TODO: templates.json deprecated? not present in latest package
        # templates = cls.default_templates()
        #templates = cls.default_templates()
        templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())

        model = TaggerModel(templates)

@@ -202,12 +202,13 @@ cdef class Tagger:
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                self.model.set_featuresC(eg.c, tokens.c, i)
                self.model.set_featuresC(eg.c, &tokens.c[i])
                self.model.set_scoresC(eg.c.scores,
                    eg.c.features, eg.c.nr_feat, 1)
                guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
                self.vocab.morphology.assign_tag(&tokens.c[i], guess)
                eg.fill_scores(0, eg.c.nr_class)
                eg.reset()
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

@@ -231,18 +232,15 @@ cdef class Tagger:
            nr_class=self.vocab.morphology.n_tags,
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(eg.c, tokens.c, i)
            self.model.set_featuresC(eg.c, &tokens.c[i])
            eg.costs = [1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class)]
            self.model.set_scoresC(eg.c.scores,
                eg.c.features, eg.c.nr_feat, 1)
            self.model.updateC(eg.c)

            self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)

            self.model.update(eg)
            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
            eg.reset()
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct