Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)

Commit: de7c6c48d8
Parent: 7c2f1a673b
Message: Working NN, but very messy. Relies on BLIS.
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from __future__ import division
 from __future__ import unicode_literals
 
@@ -9,6 +10,8 @@ import io
 import random
 import time
 import gzip
+import re
+import numpy
 
 import plac
 import cProfile
@@ -20,23 +23,29 @@ from spacy.gold import GoldParse
 
 from spacy.syntax.util import Config
 from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.scorer import Scorer
 from spacy.tagger import Tagger
+from spacy.syntax.nonproj import PseudoProjectivity
+from spacy.syntax import _parse_features as pf
 
 # Last updated for spaCy v0.97
 
 
-def read_conll(file_):
+def read_conll(file_, n=0):
     """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
+    text = file_.read().strip()
+    sent_strs = re.split(r'\n\s*\n', text)
+    for sent_id, sent_str in enumerate(sent_strs):
+        if not sent_str.strip():
+            continue
         ids = []
         words = []
         heads = []
         labels = []
         tags = []
-        for i, line in enumerate(sent_str.split('\n')):
+        for i, line in enumerate(sent_str.strip().split('\n')):
             word, pos_string, head_idx, label = _parse_line(line)
             words.append(word)
             if head_idx < 0:
@@ -45,10 +54,10 @@ def read_conll(file_):
             heads.append(head_idx)
             labels.append(label)
             tags.append(pos_string)
-        text = ' '.join(words)
         annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
-        sents.append((None, [(annot, [])]))
-    return sents
+        yield (None, [(annot, None)])
+        if n and sent_id >= n:
+            break
 
 
 def _parse_line(line):
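Note: read_conll is now a generator rather than a list-builder, and the new n argument caps how many sentences it yields, so callers that need random access must wrap it in list() (as main() below now does). A minimal usage sketch, with the file path hypothetical:

    import io

    with io.open('train.conll', 'r', encoding='utf8') as file_:
        # Stream roughly the first 100 sentences without loading the corpus.
        for raw_text, sents in read_conll(file_, n=100):
            for (ids, words, tags, heads, labels, ner), _ in sents:
                print(len(words), words[:3])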
@@ -68,21 +77,33 @@ def _parse_line(line):
     pos = pieces[4]
     head_idx = int(pieces[6])-1
     label = pieces[7]
-    if head_idx == 0:
+    if head_idx < 0:
         label = 'ROOT'
     return word, pos, head_idx, label
 
 
+def print_words(strings, words, embeddings):
+    ids = {strings[word]: word for word in words}
+    vectors = {}
+    for key, values in embeddings[5]:
+        if key in ids:
+            vectors[strings[key]] = values
+    for word in words:
+        if word in vectors:
+            print(word, vectors[word])
+
+
 def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
+    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
     nlp.parser(tokens)
     gold = GoldParse(tokens, annot_tuples, make_projective=False)
     scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
 
 
-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
-          gold_preproc=False, force_gold=False):
+def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
+          learn_rate=0.001, update_step='sgd_cm',
+          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
@@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     os.mkdir(dep_model_dir)
     os.mkdir(pos_model_dir)
 
-    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                 labels=ArcEager.get_labels(gold_tuples))
+    if feat_set != 'neural':
+        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
+                     labels=ArcEager.get_labels(gold_tuples))
+    else:
+        feat_groups = [
+            (pf.core_words, 8),
+            (pf.core_tags, 4),
+            (pf.core_labels, 4),
+            (pf.core_shapes, 4),
+            ([f[0] for f in pf.valencies], 2)
+        ]
+        slots = []
+        vector_widths = []
+        feat_set = []
+        input_length = 0
+        for i, (feat_group, width) in enumerate(feat_groups):
+            feat_set.extend((f,) for f in feat_group)
+            slots += [i] * len(feat_group)
+            vector_widths.append(width)
+            input_length += width * len(feat_group)
+        hidden_layers = [128] * 5
+        rho = 1e-4
+        Config.write(dep_model_dir, 'config',
+                     model='neural',
+                     seed=seed,
+                     labels=ArcEager.get_labels(gold_tuples),
+                     feat_set=feat_set,
+                     vector_widths=vector_widths,
+                     slots=slots,
+                     hidden_layers=hidden_layers,
+                     update_step=update_step,
+                     batch_norm=batch_norm,
+                     eta=learn_rate,
+                     mu=0.9,
+                     ensemble_size=1,
+                     rho=rho)
 
     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
     nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
+    for word in nlp.vocab:
+        word.norm = word.orth
+    words = list(nlp.vocab)
+    top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
+    norms = numpy.ndarray(shape=(10000,), dtype='float32')
+    for i in range(10000):
+        if i >= 400 and words[i].has_vector:
+            top5k[i] = words[i].vector
+            norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
+        else:
+            # Make these way off values, to make big distance.
+            top5k[i] = 100.0
+            norms[i] = 100.0
+    print("Setting vectors")
+    for word in words[10000:]:
+        if word.has_vector:
+            cosines = numpy.dot(top5k, word.vector)
+            cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
+            most_similar = words[numpy.argmax(cosines)]
+            word.norm = most_similar.norm
+        else:
+            word.norm = word.shape
 
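Note: the block above is a nearest-neighbour back-off for the norm attribute: the most frequent entries keep their own norms, and every rarer word with a vector inherits the norm of its most cosine-similar frequent word. A self-contained numpy sketch of the same idea, with toy data and hypothetical names:

    import numpy

    def nearest_norm(frequent_vecs, frequent_norms, query):
        # Cosine similarity of `query` against each frequent vector.
        lengths = numpy.sqrt((frequent_vecs ** 2).sum(axis=1))
        sims = frequent_vecs.dot(query) / (lengths * numpy.sqrt((query ** 2).sum()))
        return frequent_norms[int(numpy.argmax(sims))]

    frequent = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype='float32')
    print(nearest_norm(frequent, ['good', 'bad'], numpy.asarray([0.9, 0.1])))  # 'good'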
-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
-    for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
+    print(nlp.parser.model.widths)
+    print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
+    last_score = 0.0
+    nr_trimmed = 0
+    eg_seen = 0
+    loss = 0
+    for itn in range(n_iter):
+        random.shuffle(gold_tuples)
         for _, sents in gold_tuples:
             for annot_tuples, _ in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-                score_model(scorer, nlp, None, annot_tuples, verbose=False)
                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
+                nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
+                gold = GoldParse(tokens, annot_tuples)
                 loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                             scorer.tags_acc, scorer.token_acc))
-    print('end training')
+                eg_seen += 1
+                if eg_seen % 10000 == 0:
+                    scorer = Scorer()
+                    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+                        for _, sents in read_conll(file_):
+                            for annot_tuples, _ in sents:
+                                score_model(scorer, nlp, None, annot_tuples)
+                    train_scorer = Scorer()
+                    for _, sents in gold_tuples[:1000]:
+                        for annot_tuples, _ in sents:
+                            score_model(train_scorer, nlp, None, annot_tuples)
+                    print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
+                                                             train_scorer.uas, scorer.uas,
+                                                             nlp.parser.model.mem.size))
+                    loss = 0
+                    if feat_set != 'basic':
+                        nlp.parser.model.eta *= 0.99
+                        threshold = 0.05 * (1.05 ** itn)
+                        nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
     nlp.end_training(model_dir)
-    print('done')
+    return nlp
 
 
 @plac.annotations(
     train_loc=("Location of CoNLL 09 formatted training file"),
     dev_loc=("Location of CoNLL 09 formatted development file"),
     model_dir=("Location of output model directory"),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
     n_iter=("Number of training iterations", "option", "i", int),
+    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
+    update_step=("Update step", "option", "u", str),
+    learn_rate=("Learn rate", "option", "e", float),
+    neural=("Use neural network?", "flag", "N")
 )
-def main(train_loc, dev_loc, model_dir, n_iter=15):
+def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
+         learn_rate=0.001, update_step='sgd_cm'):
     with io.open(train_loc, 'r', encoding='utf8') as file_:
-        train_sents = read_conll(file_)
-    if not eval_only:
-        train(English, train_sents, model_dir, n_iter=n_iter)
-    nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
+        train_sents = list(read_conll(file_))
+    # preprocess training data here before ArcEager.get_labels() is called
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+
+    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
+                feat_set='neural' if neural else 'basic',
+                batch_norm=batch_norm,
+                learn_rate=learn_rate,
+                update_step=update_step)
     scorer = Scorer()
-    for _, sents in dev_sents:
-        for annot_tuples, _ in sents:
-            score_model(scorer, nlp, None, annot_tuples)
-    print('TOK', 100-scorer.token_acc)
+    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+        for _, sents in read_conll(file_):
+            for annot_tuples, _ in sents:
+                score_model(scorer, nlp, None, annot_tuples)
+    print('TOK', scorer.token_acc)
     print('POS', scorer.tags_acc)
     print('UAS', scorer.uas)
     print('LAS', scorer.las)
 
 
 if __name__ == '__main__':
     plac.call(main)
@@ -23,7 +23,8 @@ from spacy.scorer import Scorer
 from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.ner import BiluoPushDown
 from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.syntax.nonproj import PseudoProjectivity
 
 
@@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
                  beam_width=beam_width,projectivize=pseudoprojective)
+    #feat_set, slots = get_templates('neural')
+    #vector_widths = [10, 10, 10]
+    #hidden_layers = [100, 100, 100]
+    #update_step = 'adam'
+    #eta = 0.001
+    #rho = 1e-4
+    #Config.write(dep_model_dir, 'config', model='neural',
+    #             seed=seed, labels=ArcEager.get_labels(gold_tuples),
+    #             feat_set=feat_set,
+    #             vector_widths=vector_widths,
+    #             slots=slots,
+    #             hidden_layers=hidden_layers,
+    #             update_step=update_step,
+    #             eta=eta,
+    #             rho=rho)
+
+
     Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                  labels=BiluoPushDown.get_labels(gold_tuples),
                  beam_width=0)
@@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
-    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
-    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
+    nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    print(nlp.parser.model.widths)
+    for raw_text, sents in gold_tuples:
+        for annot_tuples, ctnt in sents:
+            for word in annot_tuples[1]:
+                _ = nlp.vocab[word]
     print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
     for itn in range(n_iter):
         scorer = Scorer()
@@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
         train(lang, gold_train, model_dir,
-              feat_set='basic' if not debug else 'debug',
+              feat_set='neural' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose,pseudoprojective=pseudoprojective)
     if out_loc:
         write_parses(lang, dev_loc, model_dir, out_loc)
+    print(model_dir)
     scorer = evaluate(lang, list(read_json_file(dev_loc)),
                       model_dir, gold_preproc=gold_preproc, verbose=verbose)
     print('TOK', scorer.token_acc)
@@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.parser import get_templates
 from spacy.scorer import Scorer
 import spacy.attrs
+from spacy.syntax.nonproj import PseudoProjectivity
+
+from spacy.syntax._parse_features import *
 
 from spacy.language import Language
 
-from spacy.tagger import W_orth
-
-TAGGER_TEMPLATES = (
-    (W_orth,),
-)
-
 try:
     from codecs import open
 except ImportError:
     pass
 
 
+features = [
+    (S2W,),
+    (S1W, ),
+    (S1rW,),
+    (S0lW, ),
+    (S0l2W, ),
+    (S0W, ),
+    (S0r2W, ),
+    (S0rW, ),
+    (N0l2W, ),
+    (N0lW, ),
+    (N0W, ),
+    (N1W, ),
+    (N2W, )
+]
+
+slots = [0] * len(features)
+
+features += [
+    (S2p,),
+    (S1p, ),
+    (S1rp,),
+    (S0lp,),
+    (S0l2p,),
+    (S0p, ),
+    (S0r2p, ),
+    (S0rp, ),
+    (N0l2p, ),
+    (N0lp, ),
+    (N0p, ),
+    (N1p, ),
+    (N2p, )
+]
+
+slots += [1] * (len(features) - len(slots))
+
+features += [
+    (S2L,),
+    (S1L,),
+    (S1rL,),
+    (S0lL,),
+    (S0l2L,),
+    (S0L,),
+    (S0rL,),
+    (S0r2L,),
+    (N0l2L,),
+    (N0lL,),
+]
+slots += [2] * (len(features) - len(slots))
+#
+#features += [(S2p, S1p), (S1p, S0p)]
+#slots += [3, 3]
+#features += [(S0p, N0p)]
+#slots += [4]
+# (S0l2p, S0l2L, S0lp, S0l2L),
+# (N0l2p, N0l2L, N0lp, N0lL),
+# (S1p, S1rp, S1rL),
+# (S0p, S0rp, S0rL),
+#)
+
+
 class TreebankParser(object):
     @staticmethod
-    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
+    def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
+                        hidden_layers=(300, 300),
+                        feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
         dep_model_dir = path.join(model_dir, 'deps')
         pos_model_dir = path.join(model_dir, 'pos')
         if path.exists(dep_model_dir):
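Note: with templates laid out this way, the network's input width falls out of vector_widths and slots: each unigram feature contributes the embedding width of its slot. A quick arithmetic check for the setup_model_dir call in main() below (13 word, 13 tag, and 10 label features; the trailing widths 30, 30 are unused by these slots):

    vector_widths = (10, 10, 10, 30, 30)
    slots = [0] * 13 + [1] * 13 + [2] * 10
    input_length = sum(vector_widths[slot] for slot in slots)
    print(input_length)  # 13*10 + 13*10 + 10*10 = 360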
@@ -43,15 +105,16 @@ class TreebankParser(object):
         os.mkdir(dep_model_dir)
         os.mkdir(pos_model_dir)
 
-        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                     labels=labels)
+        Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
+                     seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
+                     hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)
 
     @classmethod
     def from_dir(cls, tag_map, model_dir):
-        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
+        vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
         vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
         tokenizer = Tokenizer(vocab, {}, None, None, None)
-        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
+        tagger = Tagger.blank(vocab, Tagger.default_templates())
 
         cfg = Config.read(path.join(model_dir, 'deps'), 'config')
         parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
@@ -64,22 +127,14 @@ class TreebankParser(object):
         self.parser = parser
 
     def train(self, words, tags, heads, deps):
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        self.tagger.train(tokens, tags)
-
         tokens = self.tokenizer.tokens_from_list(list(words))
         ids = range(len(words))
         ner = ['O'] * len(words)
-        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
-                         make_projective=False)
-        self.tagger(tokens)
-        if gold.is_projective:
-            try:
-                self.parser.train(tokens, gold)
-            except:
-                for id_, word, head, dep in zip(ids, words, heads, deps):
-                    print(id_, word, head, dep)
-                raise
+        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
+        self.tagger.tag_from_strings(tokens, tags)
+        loss = self.parser.train(tokens, gold)
+        PseudoProjectivity.deprojectivize(tokens)
+        return loss
 
     def __call__(self, words, tags=None):
         tokens = self.tokenizer.tokens_from_list(list(words))
@@ -88,6 +143,7 @@ class TreebankParser(object):
         else:
             self.tagger.tag_from_strings(tokens, tags)
         self.parser(tokens)
+        PseudoProjectivity.deprojectivize(tokens)
         return tokens
 
     def end_training(self, data_dir):
@@ -101,8 +157,6 @@ class TreebankParser(object):
         self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
 
 
-
-
 def read_conllx(loc):
     with open(loc, 'r', 'utf8') as file_:
         text = file_.read()
@@ -119,8 +173,8 @@ def read_conllx(loc):
             id_ = int(id_) - 1
             head = (int(head) - 1) if head != '0' else id_
             dep = 'ROOT' if dep == 'root' else dep
-            tokens.append((id_, word, tag, head, dep, 'O'))
-        tuples = zip(*tokens)
+            tokens.append([id_, word, tag, head, dep, 'O'])
+        tuples = [list(el) for el in zip(*tokens)]
         yield (None, [(tuples, [])])
 
 
@@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
     return scorer
 
 
-def main(train_loc, dev_loc, model_dir, tag_map_loc):
+@plac.annotations(
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
     with open(tag_map_loc) as file_:
         tag_map = json.loads(file_.read())
     train_sents = list(read_conllx(train_loc))
-    labels = ArcEager.get_labels(train_sents)
-    templates = get_templates('basic')
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+    dev_sents = list(read_conllx(dev_loc))
 
-    TreebankParser.setup_model_dir(model_dir, labels, templates)
+    labels = ArcEager.get_labels(train_sents)
 
+    TreebankParser.setup_model_dir(model_dir, labels,
+        feat_set=features, vector_widths=(10,10,10,30,30), slots=slots,
+        hidden_layers=(100,100,100), update_step='adam')
 
     nlp = TreebankParser.from_dir(tag_map, model_dir)
+    nlp.parser.model.rho = 1e-4
+    print(nlp.parser.model.widths)
 
-    for itn in range(15):
+    for itn in range(n_iter):
+        loss = 0.0
         for _, doc_sents in train_sents:
             for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-                nlp.train(words, tags, heads, deps)
+                loss += nlp.train(words, tags, heads, deps)
         random.shuffle(train_sents)
-        scorer = score_model(nlp, read_conllx(dev_loc))
-        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
+        scorer = score_model(nlp, dev_sents)
+        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
+        print(nlp.parser.model.mem.size)
     nlp.end_training(model_dir)
     scorer = score_model(nlp, read_conllx(dev_loc))
-    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
+    print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))
 
 
 if __name__ == '__main__':
setup.py
@@ -51,6 +51,7 @@ MOD_NAMES = [
     'spacy.syntax._state',
     'spacy.tokenizer',
     'spacy.syntax.parser',
+    'spacy.syntax.beam_parser',
     'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
@@ -73,7 +74,8 @@ MOD_NAMES = [
 compile_options = {
     'msvc': ['/Ox', '/EHsc'],
     'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
-    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
+    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
+               '-I/Users/matt/blis/include/blis']
 }
 
 
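Note: '-I/Users/matt/blis/include/blis' bakes a developer-machine path into setup.py, presumably part of why the commit message calls this messy. A sketch of a more portable variant, assuming a hypothetical BLIS_INCLUDE environment variable:

    import os

    # Fall back to a common install prefix when the variable is unset.
    blis_include = os.environ.get('BLIS_INCLUDE', '/usr/local/include/blis')
    compile_options['other'].append('-I' + blis_include)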
@@ -1,3 +1,4 @@
+# cython: profile=True
 import numpy
 import io
 import json
@@ -264,13 +265,3 @@ cdef class GoldParse:
 
 def is_punct_label(label):
     return label == 'P' or label.lower() == 'punct'
-
-
-
-
-
-
-
-
-
-
@@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[11] = 0
         context[12] = 0
     else:
-        context[0] = token.lex.orth
-        context[1] = token.lemma
+        context[0] = token.lex.norm
+        context[1] = token.lex.norm
         context[2] = token.tag
         context[3] = token.lex.cluster
     # We've read in the string little-endian, so now we can take & (2**n)-1
@@ -366,27 +366,26 @@ trigrams = (
 
 
 words = (
-    S2w,
-    S1w,
-    S1rw,
-    S0lw,
-    S0l2w,
-    S0w,
-    S0r2w,
-    S0rw,
-    N0lw,
-    N0l2w,
-    N0w,
-    N1w,
-    N2w,
-    P1w,
-    P2w
+    S2W,
+    S1W,
+    S1rW,
+    S0lW,
+    S0l2W,
+    S0W,
+    S0r2W,
+    S0rW,
+    N0lW,
+    N0l2W,
+    N0W,
+    N1W,
+    N2W,
+    P1W,
+    P2W
 )
 
 tags = (
     S2p,
     S1p,
-    S1rp,
     S0lp,
     S0l2p,
     S0p,
@@ -404,7 +403,6 @@ tags = (
 labels = (
     S2L,
     S1L,
-    S1rL,
     S0lL,
     S0l2L,
     S0L,
@@ -412,9 +410,88 @@ labels = (
     S0rL,
     N0lL,
     N0l2L,
-    N0L,
-    N1L,
-    N2L,
-    P1L,
-    P2L
+)
+
+core_words = (
+    S2w,
+    S1w,
+    S0lw,
+    S0l2w,
+    S0w,
+    S0rw,
+    S0r2w,
+    N0lw,
+    N0l2w,
+    N0w,
+    N1w,
+    N2w,
+)
+
+
+core_shapes = (
+    S2_shape,
+    S1_shape,
+    S0l_shape,
+    S0l2_shape,
+    S0_shape,
+    S0r_shape,
+    S0r2_shape,
+    N0l_shape,
+    N0l2_shape,
+    N0_shape,
+    N1_shape,
+    N2_shape,
+)
+
+
+core_clusters = (
+    S2c,
+    S1c,
+    S0lc,
+    S0l2c,
+    S0c,
+    S0rc,
+    S0r2c,
+    N0lc,
+    N0l2c,
+    N0c,
+    N1c,
+    N2c,
+)
+
+
+core_tags = (
+    S2p,
+    S1p,
+    S0lp,
+    S0l2p,
+    S0p,
+    S0r2p,
+    S0rp,
+    N0lp,
+    N0l2p,
+    N0p,
+    N1p,
+    N2p,
+)
+
+core_labels = (
+    S2L,
+    S1L,
+    S0lL,
+    S0l2L,
+    S0L,
+    S0r2L,
+    S0rL,
+    N0lL,
+    N0l2L,
+)
+
+valencies = (
+    (N0lv,),
+    (S0lv,),
+    (S0rv,),
+    (S1lv,),
+    (S1rv,),
 )
@@ -1,6 +1,9 @@
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, calloc, free
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
+
+from murmurhash.mrmr cimport hash64
 
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport TokenC, Entity
 from ..lexeme cimport Lexeme
@@ -201,6 +204,21 @@ cdef cppclass StateC:
         else:
             return this.length - this._b_i
 
+    uint64_t hash() nogil const:
+        cdef TokenC[11] sig
+        sig[0] = this.S_(2)[0]
+        sig[1] = this.S_(1)[0]
+        sig[2] = this.R_(this.S(1), 1)[0]
+        sig[3] = this.L_(this.S(0), 1)[0]
+        sig[4] = this.L_(this.S(0), 2)[0]
+        sig[5] = this.S_(0)[0]
+        sig[6] = this.R_(this.S(0), 2)[0]
+        sig[7] = this.R_(this.S(0), 1)[0]
+        sig[8] = this.B_(0)[0]
+        sig[9] = this.E_(0)[0]
+        sig[10] = this.E_(1)[0]
+        return hash64(sig, sizeof(sig), this._s_i)
+
     void push() nogil:
         if this.B(0) != -1:
             this._stack[this._s_i] = this.B(0)
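Note: hash() condenses the parse state into a fixed 11-token signature (stack tops, selected children, buffer front, recent entities) plus the stack depth, and MurmurHashes the lot; this gives beam search a cheap key for spotting equivalent states. A rough Python rendering of the idea, all accessor names hypothetical:

    def state_hash(state):
        # The signature covers the same context the feature extractor sees,
        # so states that hash alike will be scored alike.
        sig = (
            state.stack(2), state.stack(1), state.stack(0),
            state.right_child(state.stack(1), 1),
            state.left_child(state.stack(0), 1), state.left_child(state.stack(0), 2),
            state.right_child(state.stack(0), 2), state.right_child(state.stack(0), 1),
            state.buffer(0), state.entity(0), state.entity(1),
        )
        return hash((sig, state.stack_depth()))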
@@ -290,6 +308,8 @@ cdef cppclass StateC:
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
+        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
+        this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
@@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
         else:
             is_valid[i] = False
             costs[i] = 9000
-    assert n_gold >= 1
+    if n_gold < 1:
+        for annot in gold.orig_annot:
+            print(annot)
+        print([move_costs[i] for i in range(N_MOVES)])
+        print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
+        print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
+        print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
+        raise Exception("No gold moves")
@@ -10,7 +10,7 @@ def english_noun_chunks(doc):
     for i, word in enumerate(doc):
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             yield word.left_edge.i, word.i+1, np_label
-        elif word.pos == NOUN and word.dep == conj:
+        elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
             head = word.head
             while head.dep == conj and head.head.i < head.i:
                 head = head.head
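Note: the relaxed condition lets proper-noun and pronoun conjuncts continue a chunk, so a coordination like "Mary and he" yields chunks for both conjuncts instead of only the first. A hedged usage sketch against a current spaCy build (model name hypothetical):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Mary and he visited London and Berlin.')
    # With the fix, conjuncts headed by PROPN/PRON are chunked as well.
    print([chunk.text for chunk in doc.noun_chunks])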
@@ -1,25 +1,37 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.neural.nn cimport NeuralNet
+from thinc.linear.features cimport ConjunctionExtracter
 from thinc.base cimport Model
 from thinc.extra.eg cimport Example
+from thinc.typedefs cimport weight_t
+from thinc.structs cimport FeatureC
 
 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
-from thinc.structs cimport ExampleC
+from thinc.structs cimport NeuralNetC, ExampleC
 from ._state cimport StateC
 
 
 cdef class ParserNeuralNet(NeuralNet):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef ConjunctionExtracter extracter
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
 
 
 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
 
 
+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    cdef object _models
+    cdef NeuralNetC** _models_c
+    cdef int** _masks
+    cdef int _nr_model
+
+
 cdef class Parser:
-    cdef readonly ParserNeuralNet model
+    cdef readonly Model model
     cdef readonly TransitionSystem moves
     cdef int _projectivize
@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -18,13 +19,14 @@ import shutil
 import json
 import sys
 from .nonproj import PseudoProjectivity
+import random
 
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
+from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC, ExampleC
+from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from thinc.structs cimport FeatureC
@@ -61,8 +63,10 @@ def get_templates(name):
         return pf.ner
     elif name == 'debug':
         return pf.unigrams
-    elif name.startswith('embed'):
-        return (pf.words, pf.tags, pf.labels)
+    elif name.startswith('neural'):
+        features = pf.words + pf.tags + pf.labels
+        slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
+        return ([(f,) for f in features], slots)
     else:
         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                 pf.tree_shape + pf.trigrams)
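Note: the 'neural' branch now returns unigram templates paired with a slot id per atom group (0 = words, 1 = tags, 2 = labels), the (feat_set, slots) shape the neural config expects. A small illustration with stand-in atom ids:

    # Stand-ins for pf.words / pf.tags / pf.labels (real values are atom ids).
    words, tags, labels = (101, 102), (201, 202, 203), (301,)
    features = words + tags + labels
    slots = [0] * len(words) + [1] * len(tags) + [2] * len(labels)
    assert [(f,) for f in features] == [(101,), (102,), (201,), (202,), (203,), (301,)]
    assert slots == [0, 0, 1, 1, 1, 2]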
@@ -73,72 +77,238 @@ def ParserFactory(transition_system):
 
 
 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    @property
+    def widths(self):
+        return (self.extracter.nr_templ,)
+
+    def update(self, Example eg):
+        '''Does regression on negative cost. Sort of cute?'''
+        self.time += 1
+        cdef weight_t loss = 0.0
+        best = eg.best
+        for clas in range(eg.c.nr_class):
+            if not eg.c.is_valid[clas]:
+                continue
+            if eg.c.scores[clas] < eg.c.scores[best]:
+                continue
+            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
+            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
+            step = d_loss * 0.001
+            for feat in eg.c.features[:eg.c.nr_feat]:
+                self.update_weight(feat.key, clas, feat.value * step)
+        return int(loss)
+
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
 
 
 cdef class ParserNeuralNet(NeuralNet):
-    def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50,
-                 tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0):
-        #input_length = 3 * word_width + 5 * tag_width + 3 * dep_width
-        input_length = 12 * word_width + 7 * dep_width
-        widths = [input_length] + [hidden_width] * depth + [nr_class]
-        #vector_widths = [word_width, tag_width, dep_width]
-        #slots = [0] * 3 + [1] * 5 + [2] * 3
-        vector_widths = [word_width, dep_width]
-        slots = [0] * 12 + [1] * 7
-        NeuralNet.__init__(
-            self,
-            widths,
-            embed=(vector_widths, slots),
-            eta=eta,
-            rho=rho,
-            update_step=update_step)
+    def __init__(self, shape, **kwargs):
+        vector_widths = [4] * 57
+        slots =  [0, 1, 2, 3] # S0
+        slots += [4, 5, 6, 7] # S1
+        slots += [8, 9, 10, 11] # S2
+        slots += [12, 13, 14, 15] # S3+
+        slots += [16, 17, 18, 19] # B0
+        slots += [20, 21, 22, 23] # B1
+        slots += [24, 25, 26, 27] # B2
+        slots += [28, 29, 30, 31] # B3+
+        slots += [32, 33, 34, 35] * 2 # S0l, S0r
+        slots += [36, 37, 38, 39] * 2 # B0l, B0r
+        slots += [40, 41, 42, 43] * 2 # S1l, S1r
+        slots += [44, 45, 46, 47] * 2 # S2l, S2r
+        slots += [48, 49, 50, 51, 52]
+        slots += [53, 54, 55, 56]
+        input_length = sum(vector_widths[slot] for slot in slots)
+        widths = [input_length] + shape[3:]
+
+        NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)
 
     @property
     def nr_feat(self):
-        #return 3+5+3
-        return 12+7
+        return 2000
 
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        memset(eg.features, 0, 2000 * sizeof(FeatureC))
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
-        eg.nr_feat = 12 + 7
-        for j in range(eg.nr_feat):
-            eg.features[j].value = 1.0
-            eg.features[j].i = j
-        #eg.features[0].key = eg.atoms[S0w]
-        #eg.features[1].key = eg.atoms[S1w]
-        #eg.features[2].key = eg.atoms[N0w]
-        eg.features[0].key = eg.atoms[S2W]
-        eg.features[1].key = eg.atoms[S1W]
-        eg.features[2].key = eg.atoms[S0lW]
-        eg.features[3].key = eg.atoms[S0l2W]
-        eg.features[4].key = eg.atoms[S0W]
-        eg.features[5].key = eg.atoms[S0r2W]
-        eg.features[6].key = eg.atoms[S0rW]
-        eg.features[7].key = eg.atoms[N0lW]
-        eg.features[8].key = eg.atoms[N0l2W]
-        eg.features[9].key = eg.atoms[N0W]
-        eg.features[10].key = eg.atoms[N1W]
-        eg.features[11].key = eg.atoms[N2W]
-
-        eg.features[12].key = eg.atoms[S2L]
-        eg.features[13].key = eg.atoms[S1L]
-        eg.features[14].key = eg.atoms[S0l2L]
-        eg.features[15].key = eg.atoms[S0lL]
-        eg.features[16].key = eg.atoms[S0L]
-        eg.features[17].key = eg.atoms[S0r2L]
-        eg.features[18].key = eg.atoms[S0rL]
+        feats = eg.features
+
+        feats = _add_token(feats, 0, state.S_(0), 1.0)
+        feats = _add_token(feats, 4, state.S_(1), 1.0)
+        feats = _add_token(feats, 8, state.S_(2), 1.0)
+        # Rest of the stack, with exponential decay
+        for i in range(3, state.stack_depth()):
+            feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
+        feats = _add_token(feats, 16, state.B_(0), 1.0)
+        feats = _add_token(feats, 20, state.B_(1), 1.0)
+        feats = _add_token(feats, 24, state.B_(2), 1.0)
+        # Rest of the buffer, with exponential decay
+        for i in range(3, min(8, state.buffer_length())):
+            feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
+        feats = _add_subtree(feats, 32, state, state.S(0))
+        feats = _add_subtree(feats, 40, state, state.B(0))
+        feats = _add_subtree(feats, 48, state, state.S(1))
+        feats = _add_subtree(feats, 56, state, state.S(2))
+        feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
+        feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
+        feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
+        feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
+        feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
+        feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
+                                 state.R_(state.S(0), 2))
+        feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
+                                 state.L_(state.S(0), 2))
+        eg.nr_feat = feats - eg.features
+
+
+cdef inline FeatureC* _add_token(FeatureC* feats,
+        int slot, const TokenC* token, weight_t value) nogil:
+    # Word
+    feats.i = slot
+    feats.key = token.lex.norm
+    feats.value = value
+    feats += 1
+    # POS tag
+    feats.i = slot+1
+    feats.key = token.tag
+    feats.value = value
+    feats += 1
+    # Dependency label
+    feats.i = slot+2
+    feats.key = token.dep
+    feats.value = value
+    feats += 1
+    # Word, label, tag
+    feats.i = slot+3
+    cdef uint64_t key[3]
+    key[0] = token.lex.cluster
+    key[1] = token.tag
+    key[2] = token.dep
+    feats.key = hash64(key, sizeof(key), 0)
+    feats.value = value
+    feats += 1
+    return feats
+
+
+cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
+    value = 1.0
+    for i in range(state.n_R(t)):
+        feats = _add_token(feats, slot, state.R_(t, i+1), value)
+        value *= 0.5
+    slot += 4
+    value = 1.0
+    for i in range(state.n_L(t)):
+        feats = _add_token(feats, slot, state.L_(t, i+1), value)
+        value *= 0.5
+    return feats
+
+
+cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2) nogil:
+    cdef uint64_t[2] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
+    cdef uint64_t[3] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    key[2] = t3.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
+        ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
+        self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
+        self._masks = <int**>self.mem.alloc(sizeof(int*), n)
+        self._models = []
+        cdef ParserNeuralNet model
+        threshold = 1.5 / n
+        self._nr_model = n
+        for i in range(n):
+            self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
+            for j in range(self.nr_feat):
+                self._masks[i][j] = random.random() < threshold
+            # We have to pass our pool here, because the embedding table passes
+            # it around.
+            model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
+            self._models_c[i] = &model.c
+            self._models.append(model)
+
+    property eta:
+        def __get__(self):
+            return self._models[0].eta
+
+        def __set__(self, weight_t value):
+            for model in self._models:
+                model.eta = value
+
+    def sparsify_embeddings(self, penalty):
+        p = 0.0
+        for model in self._models:
+            p += model.sparsify_embeddings(penalty)
+        return p / len(self._models)
+
+    cdef void set_scoresC(self, weight_t* scores, const void* _feats,
+            int nr_feat, int is_sparse) nogil:
+        nr_class = self.c.widths[self.c.nr_layer-1]
+        sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
+        sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
+        feats = <const FeatureC*>_feats
+        for i in range(self._nr_model):
+            for j in range(nr_feat):
+                sub_feats[j] = feats[j]
+                sub_feats[j].value *= self._masks[i][j]
+            self.c = self._models_c[i][0]
+            self.c.weights = self._models_c[i].weights
+            self.c.gradient = self._models_c[i].gradient
+            ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
+            for j in range(nr_class):
+                scores[j] += sub_scores[j]
+                sub_scores[j] = 0.0
+        for j in range(nr_class):
+            scores[j] /= self._nr_model
+        free(sub_feats)
+        free(sub_scores)
+
+    def update(self, Example eg):
+        if eg.cost == 0:
+            return 0.0
+        loss = 0.0
+        full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
+        memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
+        cdef ParserNeuralNet model
+        for i, model in enumerate(self._models):
+            for j in range(eg.nr_feat):
+                eg.c.features[j].value *= self._masks[i][j]
+            loss += model.update(eg)
+            memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
+        free(full_feats)
+        return loss
+
+    def end_training(self):
+        for model in self._models:
+            model.end_training()
 
 
 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, ParserNeuralNet model,
-                 int projectivize = 0):
+    def __init__(self, StringStore strings, transition_system, model):
         self.moves = transition_system
         self.model = model
-        self._projectivize = projectivize
 
     @classmethod
     def from_dir(cls, model_dir, strings, transition_system):
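Note: ParserNeuralNetEnsemble above is bagging over feature subsets: each of the n sub-networks sees only the features its random mask keeps (each kept with probability 1.5/n), and prediction averages the sub-network scores. A numpy sketch of the scoring path, independent of thinc and with all names hypothetical:

    import numpy

    rng = numpy.random.RandomState(0)
    n_models, nr_feat, nr_class = 5, 8, 3
    masks = rng.random_sample((n_models, nr_feat)) < (1.5 / n_models)
    weights = [rng.random_sample((nr_feat, nr_class)) for _ in range(n_models)]

    def ensemble_scores(feats):
        # Each sub-model scores its masked view; the ensemble takes the mean.
        scores = numpy.zeros(nr_class)
        for W, mask in zip(weights, masks):
            scores += (feats * mask).dot(W)
        return scores / n_models

    print(ensemble_scores(rng.random_sample(nr_feat)))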
@@ -148,16 +318,24 @@ cdef class Parser:
             print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
         cfg = Config.read(model_dir, 'config')
         moves = transition_system(strings, cfg.labels)
-        model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
-                                depth=cfg.depth, word_width=cfg.word_width,
-                                tag_width=cfg.tag_width, dep_width=cfg.dep_width,
-                                update_step=cfg.update_step,
-                                eta=cfg.eta, rho=cfg.rho)
-
-        project = cfg.projectivize if hasattr(cfg,'projectivize') else False
+        if cfg.get('model') == 'neural':
+            shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
+            shape.extend(cfg.hidden_layers)
+            shape.append(moves.n_moves)
+            if cfg.get('ensemble_size') >= 2:
+                model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
+                                                eta=cfg.eta, rho=cfg.rho,
+                                                n=cfg.ensemble_size)
+            else:
+                model = ParserNeuralNet(shape, update_step=cfg.update_step,
+                                        eta=cfg.eta, rho=cfg.rho)
+        else:
+            model = ParserPerceptron(get_templates(cfg.feat_set))
+
         if path.exists(path.join(model_dir, 'model')):
             model.load(path.join(model_dir, 'model'))
-        return cls(strings, moves, model, project)
+        return cls(strings, moves, model)
 
     @classmethod
     def load(cls, pkg_or_str_or_file, vocab):
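Note: beyond the fixed slots, the _add_token and _add_subtree helpers above weight deeper stack/buffer items and farther-out children by powers of 0.5, so distant context still contributes but decays geometrically rather than being cut off. Concretely:

    # Stack items S3, S4, ... enter slot 12 with weight 0.5**(i-2);
    # a node's 1st, 2nd, 3rd child enter with 1.0, 0.5, 0.25, ...
    for i in range(3, 8):
        print('S%d weight: %.4f' % (i, 0.5 ** (i - 2)))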
@@ -253,18 +431,18 @@ cdef class Parser:
                      widths=self.model.widths,
                      nr_atom=CONTEXT_SIZE,
                      nr_feat=self.model.nr_feat)
-        cdef weight_t loss = 0
+        loss = 0
         cdef Transition action
         while not stcls.is_final():
             self.model.set_featuresC(eg.c, stcls.c)
+            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
-            # Sets eg.c.scores, which Example uses to calculate eg.guess
-            self.model.updateC(eg.c)
-
-            action = self.moves.c[eg.guess]
+            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+            assert guess >= 0
+            action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.loss
+            loss += self.model.update(eg)
             eg.reset()
         return loss
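Note: the rewritten loop makes the training cycle explicit: extract features, run the forward pass, ask the oracle for costs, follow the model's own best valid move, then update. In outline (method names hypothetical, paraphrasing the Cython above):

    def train_sentence(model, moves, eg, state, gold):
        loss = 0.0
        while not state.is_final():
            model.set_features(eg, state)      # fill eg.features from the state
            model.set_scores(eg)               # forward pass
            moves.set_costs(eg, state, gold)   # validity flags + oracle costs
            guess = max((c for c in range(eg.nr_class) if eg.is_valid[c]),
                        key=lambda c: eg.scores[c])
            moves.apply(state, guess)          # advance with the predicted move
            loss += model.update(eg)           # weight update for this example
            eg.reset()
        return loss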
@@ -7,7 +7,7 @@ from .vocab cimport Vocab
 
 
 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil
 
 
 cdef class Tagger:
@@ -71,13 +71,13 @@ cpdef enum:
 
 
 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
-        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
-        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
-        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
-        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
-        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
+        token = <const TokenC*>_token
+        _fill_from_token(&eg.atoms[P2_orth], token - 2)
+        _fill_from_token(&eg.atoms[P1_orth], token - 1)
+        _fill_from_token(&eg.atoms[W_orth], token)
+        _fill_from_token(&eg.atoms[N1_orth], token + 1)
+        _fill_from_token(&eg.atoms[N2_orth], token + 2)
 
         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
 
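Note: set_featuresC now receives a raw token pointer and reads token - 2 .. token + 2 directly; this is only safe if the token buffer is padded on both sides, which spaCy's Doc allocation appears to provide. A Python sketch of the invariant (PAD hypothetical):

    PAD = None  # stands in for the zeroed padding slots around the array
    tokens = ['I', 'like', 'cats']
    padded = [PAD, PAD] + tokens + [PAD, PAD]
    i = 0                         # first real token
    window = padded[i:i + 5]      # token-2 .. token+2 around tokens[i]
    print(window)                 # [None, None, 'I', 'like', 'cats']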
@@ -153,7 +153,7 @@ cdef class Tagger:
     @classmethod
     def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
-        # templates = cls.default_templates()
+        #templates = cls.default_templates()
         templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
 
         model = TaggerModel(templates)
@@ -202,12 +202,13 @@ cdef class Tagger:
                           nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                self.model.set_featuresC(eg.c, tokens.c, i)
+                self.model.set_featuresC(eg.c, &tokens.c[i])
                 self.model.set_scoresC(eg.c.scores,
                     eg.c.features, eg.c.nr_feat, 1)
                 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
                 self.vocab.morphology.assign_tag(&tokens.c[i], guess)
                 eg.fill_scores(0, eg.c.nr_class)
+                eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
@@ -231,18 +232,15 @@ cdef class Tagger:
                           nr_class=self.vocab.morphology.n_tags,
                           nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
-            self.model.set_featuresC(eg.c, tokens.c, i)
+            self.model.set_featuresC(eg.c, &tokens.c[i])
             eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
             self.model.set_scoresC(eg.c.scores,
                 eg.c.features, eg.c.nr_feat, 1)
-            self.model.updateC(eg.c)
-
             self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
+            self.model.update(eg)
             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
-            eg.fill_scores(0, eg.c.nr_class)
-            eg.fill_costs(0, eg.c.nr_class)
+            eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
         return correct