mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Remove unfinished examples
This commit is contained in:
parent
c031c677cc
commit
f028f8ad28
|
@ -1,14 +0,0 @@
|
||||||
from paddle.trainer_config_helpers import *
|
|
||||||
|
|
||||||
# Data sources: the train/test file lists are read through the ``process``
# object of the ``dataprovider`` module.
define_py_data_sources2(
    train_list='train.list',
    test_list='test.list',
    module="dataprovider",
    obj="process",
)

# Optimisation settings: Adam with L2 regularisation and gradient clipping.
settings(
    batch_size=128,
    learning_rate=2e-3,
    learning_method=AdamOptimizer(),
    regularization=L2Regularization(8e-4),
    gradient_clipping_threshold=25,
)
|
|
|
@ -1,46 +0,0 @@
|
||||||
from paddle.trainer.PyDataProvider2 import *
|
|
||||||
from itertools import izip
|
|
||||||
import spacy
|
|
||||||
|
|
||||||
|
|
||||||
def get_features(doc):
    """Convert the usable tokens of ``doc`` into an array of integer ids.

    A token is kept when it has a vector and is neither punctuation nor
    whitespace. Each kept token contributes ``rank + 1``.

    Args:
        doc: Iterable of tokens exposing ``rank``, ``has_vector``,
            ``is_punct`` and ``is_space``.

    Returns:
        A one-dimensional numpy array of dtype ``int32`` with one entry per
        kept token, in document order.
    """
    # This module never imports numpy at file level, so the original raised
    # NameError on the first call; import it where it is needed.
    import numpy

    # NOTE(review): the +1 presumably keeps id 0 free (padding/unknown) --
    # confirm against the embedding table that consumes these ids.
    ids = [
        token.rank + 1
        for token in doc
        if token.has_vector and not token.is_punct and not token.is_space
    ]
    return numpy.asarray(ids, dtype='int32')
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(data_dir):
    """Yield ``(text, label)`` for every file under ``pos`` and ``neg``.

    Files in ``data_dir / 'pos'`` are yielded first with label 1, then the
    files in ``data_dir / 'neg'`` with label 0. ``data_dir`` must support the
    ``/`` operator and ``iterdir()`` (e.g. a ``pathlib.Path``).
    """
    labelled_folders = (('pos', 1), ('neg', 0))
    for folder_name, label in labelled_folders:
        folder = data_dir / folder_name
        for path in folder.iterdir():
            with path.open() as handle:
                contents = handle.read()
            yield contents, label
|
|
||||||
|
|
||||||
|
|
||||||
def on_init(settings, **kwargs):
    """Provider init hook: load spaCy and declare the provider's input types.

    Stores the loaded pipeline and the vector table on ``settings`` as
    ``settings.nlp`` and ``settings.vectors``.

    Args:
        settings: The provider settings object handed in by Paddle.
        **kwargs: Extra hook arguments; ignored.
    """
    print("Loading spaCy")
    nlp = spacy.load('en', entity=False)
    # NOTE(review): get_vectors is neither defined nor imported in this file;
    # it must be supplied before this hook can run.
    vectors = get_vectors(nlp)
    settings.input_types = [
        # Text input: a sequence of integer word ids (one id per word) for
        # the sentence whose sentiment is being predicted. The sequence kind
        # must be an enum member, not the SequenceType class itself.
        integer_value(vectors.shape[0], seq_type=SequenceType.SEQUENCE),
        # Label: positive / negative.
        integer_value(2),
    ]
    settings.nlp = nlp
    settings.vectors = vectors
    # NOTE(review): everything else is set via attributes; confirm that the
    # settings object actually supports item assignment.
    settings['batch_size'] = 32
|
|
||||||
|
|
||||||
|
|
||||||
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield ``(word_ids, label)`` for every sentence of every document.

    Documents are read with ``read_data``, parsed in batches with the spaCy
    pipeline stored on ``settings.nlp`` by ``on_init``, and split into
    sentences; each sentence inherits the label of its document.

    Args:
        settings: Provider settings; ``settings.nlp`` is the spaCy pipeline.
        data_dir: Directory handed to ``read_data``.
    """
    from itertools import tee

    # read_data yields (text, label) pairs, so it cannot be unpacked into two
    # names directly. Split the single stream into a text stream for the
    # parser and a label stream that is zipped back in afterwards.
    text_side, label_side = tee(read_data(data_dir))
    texts = (text for text, _ in text_side)
    labels = (label for _, label in label_side)

    # The pipeline lives on settings; there is no module-level ``nlp``.
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            # Hand one training example to Paddle.
            yield get_features(sent), label
|
|
|
@ -1,19 +0,0 @@
|
||||||
from paddle.trainer_config_helpers import *
|
|
||||||
|
|
||||||
|
|
||||||
def bidirectional_lstm_net(input_dim,
                           class_dim=2,
                           emb_dim=128,
                           lstm_dim=128,
                           is_predict=False):
    """Declare the network: embedding -> bi-LSTM -> dropout -> softmax.

    Args:
        input_dim: Size passed to the "word" data layer.
        class_dim: Size of the softmax output layer.
        emb_dim: Size of the embedding layer.
        lstm_dim: Size of the bidirectional LSTM.
        is_predict: When true, the softmax layer itself is registered as the
            output; otherwise the classification cost against the "label"
            data layer is registered.
    """
    words = data_layer("word", input_dim)
    embedded = embedding_layer(input=words, size=emb_dim)
    encoded = bidirectional_lstm(input=embedded, size=lstm_dim)
    regularised = dropout_layer(input=encoded, dropout_rate=0.5)
    prediction = fc_layer(input=regularised, size=class_dim,
                          act=SoftmaxActivation())

    if is_predict:
        outputs(prediction)
        return

    gold = data_layer("label", 1)
    outputs(classification_cost(input=prediction, label=gold))
|
|
|
@ -1,14 +0,0 @@
|
||||||
# Train the Paddle model described by $config and store it under $output.
config=config.py
output=./model_output

# Expansions are quoted so that paths containing spaces survive word
# splitting. stdout and stderr are both shown and copied to train.log
# (the original wrote to "train.log_" because of a stray underscore).
paddle train --config="$config" \
             --save_dir="$output" \
             --job=train \
             --use_gpu=false \
             --trainer_count=4 \
             --num_passes=10 \
             --log_period=20 \
             --dot_period=20 \
             --show_parameter_stats_period=100 \
             --test_all_data_in_one_period=1 \
             --config_args=batch_size=100 \
             2>&1 | tee 'train.log'
|
|
|
@ -1,86 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
import random
|
|
||||||
|
|
||||||
import spacy.en
|
|
||||||
import model
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cPickle as pickle
|
|
||||||
except ImportError:
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(nlp, data_dir):
    """Yield ``(doc, label)`` for every file under ``pos`` and ``neg``.

    Files in ``data_dir / 'pos'`` come first with label 1, followed by the
    files in ``data_dir / 'neg'`` with label 0.

    Args:
        nlp: Callable applied to each file's text to produce the document.
        data_dir: Path-like object supporting ``/`` and ``iterdir()``.
    """
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            # Close each file deterministically instead of leaking one
            # handle per document.
            with filename.open() as file_:
                text = file_.read()
            yield nlp(text), label
|
|
||||||
|
|
||||||
|
|
||||||
def partition(examples, split_size):
    """Shuffle ``examples`` and cut them into two lists.

    Args:
        examples: Any iterable; it is copied into a list before shuffling.
        split_size: Fraction of the examples that goes into the first list
            (the cut index is ``int(len(examples) * split_size)``).

    Returns:
        A ``(first, second)`` tuple of lists.
    """
    shuffled = list(examples)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * split_size)
    first = shuffled[:cut]
    second = shuffled[cut:]
    return first, second
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset(object):
    """Labelled documents split 80/20 into train and dev, served in batches."""

    def __init__(self, nlp, data_dir, batch_size=24):
        """Read, parse and split the corpus, then print a short summary.

        Args:
            nlp: Callable that turns text into a document.
            data_dir: Directory containing the ``pos`` and ``neg`` folders.
            batch_size: Number of examples per batch yielded by ``batches``.
        """
        self.batch_size = batch_size
        self.train, self.dev = partition(read_data(nlp, Path(data_dir)), 0.8)
        print("Read %d train docs" % len(self.train))
        print("Pos. Train: ", sum(eg[1] == 1 for eg in self.train))
        print("Read %d dev docs" % len(self.dev))
        # This counts positive (label 1) examples, so label it as such; the
        # original printed the same count under "Neg. Dev".
        print("Pos. Dev: ", sum(eg[1] == 1 for eg in self.dev))

    def batches(self, data):
        """Yield consecutive slices of ``data`` of at most ``batch_size`` items."""
        for i in range(0, len(data), self.batch_size):
            yield data[i : i + self.batch_size]
|
|
||||||
|
|
||||||
|
|
||||||
def model_writer(out_dir, name):
    """Build a callback that pickles model parameters into ``out_dir``.

    Args:
        out_dir: Path-like directory supporting the ``/`` operator.
        name: File-name template containing an ``{epoch}`` placeholder.

    Returns:
        ``save_model(epoch, params)``, which writes ``params`` to
        ``out_dir / name.format(epoch=epoch)``.
    """
    def save_model(epoch, params):
        out_path = out_dir / name.format(epoch=epoch)
        # Use a context manager so the file is flushed and closed as soon as
        # the dump finishes, rather than whenever the handle is collected.
        with out_path.open('wb') as file_:
            pickle.dump(params, file_)
    return save_model
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    data_dir=("Data directory", "positional", None, Path),
    vocab_size=("Number of words to fine-tune", "option", "w", int),
    n_iter=("Number of iterations (epochs)", "option", "i", int),
    vector_len=("Size of embedding vectors", "option", "e", int),
    hidden_len=("Size of hidden layers", "option", "H", int),
    depth=("Depth", "option", "d", int),
    drop_rate=("Drop-out rate", "option", "r", float),
    rho=("Regularization penalty", "option", "p", float),
    batch_size=("Batch size", "option", "b", int),
    out_dir=("Model directory", "positional", None, Path)
)
def main(data_dir, out_dir, n_iter=10, vector_len=300, vocab_size=20000,
         hidden_len=300, depth=3, drop_rate=0.3, rho=1e-4, batch_size=24):
    """Train on ``data_dir/train``, then score the model on ``data_dir/test``.

    Models are written to ``out_dir`` as ``model_{epoch}.pickle`` through the
    callback handed to ``model.train``. The final accuracy is printed.
    """
    print("Loading")
    nlp = spacy.en.English(parser=False)
    dataset = Dataset(nlp, data_dir / 'train', batch_size)
    print("Training")
    network = model.train(dataset, vector_len, hidden_len, 2, vocab_size, depth,
                          drop_rate, rho, n_iter,
                          model_writer(out_dir, 'model_{epoch}.pickle'))
    score = model.Scorer()
    print("Evaluating")
    for doc, label in read_data(nlp, data_dir / 'test'):
        # Skip empty documents, as model.train does: with no tokens the
        # forward pass would average over zero embeddings.
        if len(doc) == 0:
            continue
        word_ids, embeddings = model.get_words(doc, 0.0, vocab_size)
        guess = network.forward(word_ids, embeddings)
        score += guess == label
    print(score)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,188 +0,0 @@
|
||||||
from __future__ import division
|
|
||||||
from numpy import average, zeros, outer, random, exp, sqrt, concatenate, argmax
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
from .util import Scorer
|
|
||||||
|
|
||||||
|
|
||||||
class Adagrad(object):
    """Per-parameter learning-rate scaling from accumulated squared gradients."""

    def __init__(self, dim, lr):
        """Create the optimiser state.

        Args:
            dim: Shape of the parameter array being optimised.
            lr: Initial learning rate.
        """
        self.dim = dim
        # Added to the denominator so the rate stays finite when h is zero.
        self.eps = 1e-3
        self.learning_rate = lr
        # Running sum of squared gradients.
        self.h = zeros(self.dim)
        self._curr_rate = zeros(self.h.shape)

    def rescale(self, gradient):
        """Accumulate ``gradient`` and return it scaled by the current rate.

        The rate is ``learning_rate / (sqrt(h) + eps)``, where ``h`` already
        includes this gradient's square.
        """
        # The original zero-filled _curr_rate here, but the array is rebound
        # on the next assignment, so that work was dead.
        self.h += gradient ** 2
        self._curr_rate = self.learning_rate / (sqrt(self.h) + self.eps)
        return self._curr_rate * gradient

    def reset_weights(self):
        """Forget the accumulated squared gradients."""
        self.h = zeros(self.dim)
|
|
||||||
|
|
||||||
|
|
||||||
class Params(object):
    """All network weights stored in one flat array, exposed as views.

    ``data`` is the flat array. ``W[i]`` / ``b[i]`` are reshaped slices of it
    for layer ``i`` (input layer, ``depth - 1`` hidden layers, output layer)
    and ``E`` is the ``(n_vocab, n_embed)`` embedding-adjustment table.
    Because slices of a numpy array are views, writing through ``W``, ``b``
    or ``E`` updates ``data`` and vice versa.
    """

    @classmethod
    def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
        """Build a parameter set initialised to zeros."""
        return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: zeros((x,)))

    @classmethod
    def random(cls, depth, nE, nH, nL, nV):
        """Build a parameter set drawn uniformly from [-0.08, 0.08)."""
        return cls(depth, nE, nH, nL, nV, lambda x: (random.rand(x) * 2 - 1) * 0.08)

    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
        """Allocate the flat array and carve the layer views out of it.

        Args:
            depth: Number of ReLU layers applied by the network.
            n_embed: Width of the input (embedding) vector.
            n_hidden: Width of every hidden layer.
            n_labels: Number of output classes.
            n_vocab: Number of rows in the embedding-adjustment table.
            initializer: Callable mapping a length to a 1-D array.
        """
        nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
        # NOTE: this budgets `depth` hidden blocks although only depth - 1
        # are carved out below, so one hidden-sized block at the tail of
        # `data` stays unused. Kept as-is so the size of `data` is unchanged.
        n_weights = sum([
            (nE * nH) + nH,
            (nH * nH + nH) * depth,
            (nH * nL) + nL,
            (nV * nE)
        ])
        self.data = initializer(n_weights)
        self.W = []
        self.b = []
        # Layers compute W.dot(x) + b, so W is (outputs, inputs). The input
        # layer maps n_embed -> n_hidden; the original passed the sizes the
        # other way round, which only worked when n_embed == n_hidden.
        i = self._add_layer(0, nH, nE)
        for _ in range(1, depth):
            i = self._add_layer(i, nH, nH)
        i = self._add_layer(i, nL, nH)
        self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
        # Embedding adjustments always start at zero, whatever the initializer.
        self.E.fill(0)

    def _add_layer(self, start, x, y):
        """Append an ``(x, y)`` weight view and an ``(x,)`` bias view.

        Returns the offset in ``data`` just past the new layer.
        """
        end = start + (x * y)
        self.W.append(self.data[start : end].reshape((x, y)))
        self.b.append(self.data[end : end + x].reshape((x, )))
        return end + x
|
|
||||||
|
|
||||||
|
|
||||||
def softmax(actvn, W, b):
    """Return the softmax of ``W.dot(actvn) + b`` as a flat array."""
    scores = W.dot(actvn) + b
    # Subtract the largest score before exponentiating.
    shifted = scores - max(scores)
    exps = exp(shifted)
    total = sum(exps)
    return (exps / total).ravel()
|
|
||||||
|
|
||||||
|
|
||||||
def relu(actvn, W, b):
    """Apply the affine map ``W.dot(actvn) + b`` and zero out non-positive entries."""
    pre_activation = W.dot(actvn) + b
    is_positive = pre_activation > 0
    return pre_activation * is_positive
|
|
||||||
|
|
||||||
|
|
||||||
def d_relu(x):
    """Return the result of ``x > 0`` (true where the input is positive)."""
    is_positive = x > 0
    return is_positive
|
|
||||||
|
|
||||||
|
|
||||||
class Network(object):
    """Feed-forward classifier over the average of a document's word vectors.

    The input is the mean of the supplied embeddings plus learned per-word
    adjustments (``params.E``); it passes through ``depth`` ReLU layers and a
    softmax output layer. Gradients are accumulated in ``gradient`` by
    ``backward`` and applied by ``update`` with Adagrad scaling.
    """

    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, rho=1e-4, lr=0.005):
        """Allocate parameters, gradient buffers and working arrays.

        Args:
            depth: Number of ReLU layers.
            n_embed: Width of the input vectors.
            n_hidden: Width of the hidden layers.
            n_labels: Number of output classes.
            n_vocab: Word ids below this value get a trainable adjustment.
            rho: Accepted but not stored; ``update`` takes its own ``rho``.
            lr: Learning rate given to Adagrad.
        """
        self.depth = depth
        self.n_embed = n_embed
        self.n_hidden = n_hidden
        self.n_labels = n_labels
        self.n_vocab = n_vocab

        self.params = Params.random(depth, n_embed, n_hidden, n_labels, n_vocab)
        self.gradient = Params.zero(depth, n_embed, n_hidden, n_labels, n_vocab)
        self.adagrad = Adagrad(self.params.data.shape, lr)
        # word id -> number of times seen since the last update.
        self.seen_words = {}

        self.pred = zeros(self.n_labels)
        self.actvn = zeros((self.depth, self.n_hidden))
        self.input_vector = zeros((self.n_embed, ))

    def forward(self, word_ids, embeddings):
        """Run the network and return the index of the most probable label.

        The averaged input, the layer activations and the output
        distribution are kept on the instance for ``backward``.
        """
        self.input_vector.fill(0)
        self.input_vector += sum(embeddings)
        # Apply the fine-tuning we've learned
        for id_ in word_ids:
            if id_ < self.n_vocab:
                self.input_vector += self.params.E[id_]
        # Average
        self.input_vector /= len(embeddings)
        prev = self.input_vector
        for i in range(self.depth):
            # The original had a stray `return x * (x > 0)` right here
            # (copied from relu); `x` is undefined in this scope and the
            # statement aborted the pass on the first layer.
            self.actvn[i] = relu(prev, self.params.W[i], self.params.b[i])
            prev = self.actvn[i]
        self.pred = softmax(self.actvn[-1], self.params.W[-1], self.params.b[-1])
        return argmax(self.pred)

    def backward(self, word_ids, label):
        """Accumulate gradients for the example last passed to ``forward``.

        Args:
            word_ids: Ids of the words that made up the input.
            label: Index of the correct class.
        """
        target = zeros(self.n_labels)
        target[label] = 1.0
        D = self.pred - target

        # Walk from the output layer back to the first hidden layer.
        for i in range(self.depth, 0, -1):
            self.gradient.b[i] += D
            self.gradient.W[i] += outer(D, self.actvn[i-1])
            D = d_relu(self.actvn[i-1]) * self.params.W[i].T.dot(D)

        self.gradient.b[0] += D
        self.gradient.W[0] += outer(D, self.input_vector)

        # The input was an average, so each word gets an equal share.
        grad = self.params.W[0].T.dot(D).reshape((self.n_embed,)) / len(word_ids)
        for word_id in word_ids:
            if word_id < self.n_vocab:
                self.gradient.E[word_id] += grad
                self.seen_words[word_id] = self.seen_words.get(word_id, 0) + 1

    def update(self, rho, n):
        """Apply the accumulated gradients and reset them.

        Args:
            rho: L2 regularisation strength.
            n: Value the accumulated gradient is divided by (the caller
                passes the batch size).
        """
        # L2 Regularization
        for i in range(self.depth):
            self.gradient.W[i] += self.params.W[i] * rho
            self.gradient.b[i] += self.params.b[i] * rho
        # Do word embedding tuning
        for word_id, freq in self.seen_words.items():
            self.gradient.E[word_id] += (self.params.E[word_id] * freq) * rho

        update = self.gradient.data / n
        update = self.adagrad.rescale(update)
        self.params.data -= update
        self.gradient.data.fill(0)
        self.seen_words = {}
|
|
||||||
|
|
||||||
|
|
||||||
def get_words(doc, dropout_rate, n_vocab):
    """Collect word ids and vectors for ``doc``, randomly dropping tokens.

    One random number is drawn per token; a token is kept when its number
    exceeds ``dropout_rate`` and the token is not punctuation. If that
    leaves nothing, every token of ``doc`` is returned instead.

    Args:
        doc: Sequence of tokens exposing ``i``, ``is_punct``, ``orth`` and
            ``vector``.
        dropout_rate: Threshold compared against the per-token random draw.
        n_vocab: Unused.

    Returns:
        A ``(word_ids, embeddings)`` pair of lists.
    """
    keep = random.rand(len(doc)) > dropout_rate
    survivors = [tok for tok in doc if keep[tok.i] and not tok.is_punct]
    # all examples must have at least one word
    if not survivors:
        survivors = [tok for tok in doc]
    ids = [tok.orth for tok in survivors]
    vectors = [tok.vector for tok in survivors]
    return ids, vectors
|
|
||||||
|
|
||||||
|
|
||||||
def train(dataset, n_embed, n_hidden, n_labels, n_vocab, depth, dropout_rate, rho,
          n_iter, save_model):
    """Train a ``Network`` for ``n_iter`` epochs and return it.

    After every epoch the model is scored on ``dataset.dev``; whenever the
    number of correct dev predictions matches or beats the best so far,
    ``save_model(epoch, params)`` is called. One line per epoch is printed
    with the epoch number, training score and dev score.

    Args:
        dataset: Object providing ``train``, ``dev`` and ``batches(data)``.
        n_embed, n_hidden, n_labels, n_vocab, depth: Network dimensions.
        dropout_rate: Word dropout used while training (0.0 on dev).
        rho: L2 regularisation strength passed to ``Network.update``.
        n_iter: Number of epochs.
        save_model: Callback receiving ``(epoch, flat_parameter_array)``.
    """
    model = Network(depth, n_embed, n_hidden, n_labels, n_vocab)
    best_acc = 0
    for epoch in range(n_iter):
        train_score = Scorer()
        # create mini-batches
        for batch in dataset.batches(dataset.train):
            for doc, label in batch:
                if len(doc) == 0:
                    continue
                word_ids, embeddings = get_words(doc, dropout_rate, n_vocab)
                guess = model.forward(word_ids, embeddings)
                model.backward(word_ids, label)
                train_score += guess == label
            model.update(rho, len(batch))
        test_score = Scorer()
        for doc, label in dataset.dev:
            # Same guard as the training loop: an empty document has no
            # embeddings to average.
            if len(doc) == 0:
                continue
            word_ids, embeddings = get_words(doc, 0.0, n_vocab)
            guess = model.forward(word_ids, embeddings)
            test_score += guess == label
        if test_score.true >= best_acc:
            best_acc = test_score.true
            save_model(epoch, model.params.data)
        # Parenthesised single-argument form works as both the Python 2
        # statement and the Python 3 function.
        print("%d\t%s\t%s" % (epoch, train_score, test_score))
    return model
|
|
|
@ -1,14 +0,0 @@
|
||||||
class Scorer(object):
    """Running accuracy counter, updated with ``scorer += is_correct``."""

    def __init__(self):
        self.true = 0   # number of correct predictions
        self.total = 0  # number of predictions scored

    def __iadd__(self, is_correct):
        """Record one prediction; ``is_correct`` is added to the correct count."""
        self.true += is_correct
        self.total += 1
        return self

    def __str__(self):
        """Return the accuracy with three decimals ('0.000' if nothing was scored)."""
        if not self.total:
            # Avoid ZeroDivisionError when printing an unused scorer.
            return '%.3f' % 0.0
        # This module has no `from __future__ import division`, so force
        # true division; under Python 2 two ints would floor to 0 or 1.
        return '%.3f' % (float(self.true) / self.total)
|
|
||||||
|
|
||||||
|
|
|
@ -1,246 +0,0 @@
|
||||||
from __future__ import print_function
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import division
|
|
||||||
|
|
||||||
import pathlib
|
|
||||||
import plac
|
|
||||||
import random
|
|
||||||
from collections import Counter
|
|
||||||
import numpy as np
|
|
||||||
import os
|
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
from itertools import count
|
|
||||||
|
|
||||||
if os.environ.get('DYNET_GPU') == '1':
|
|
||||||
import _gdynet as dynet
|
|
||||||
from _gdynet import cg
|
|
||||||
else:
|
|
||||||
import dynet
|
|
||||||
from dynet import cg
|
|
||||||
|
|
||||||
|
|
||||||
class Vocab(object):
    """Two-way mapping between items and consecutive integer ids."""

    def __init__(self, w2i=None):
        """Store a copy of ``w2i`` and build the inverse mapping.

        Args:
            w2i: Mapping from item to id; an empty vocabulary when omitted.
        """
        # dict.items() exists on both Python 2 and 3, unlike iteritems().
        self.w2i = {} if w2i is None else dict(w2i)
        self.i2w = {i: w for w, i in self.w2i.items()}

    @classmethod
    def from_corpus(cls, corpus):
        """Build a vocabulary from an iterable of sentences.

        Ids are handed out from 0 in order of first appearance.
        """
        w2i = {}
        for sent in corpus:
            for word in sent:
                # Give unseen words the next free id; this replaces the
                # Python-2-only `defaultdict(count(0).next)` trick.
                if word not in w2i:
                    w2i[word] = len(w2i)
        return cls(w2i)

    def size(self):
        """Return the number of distinct items."""
        return len(self.w2i)
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(path):
    """Yield sentences from a whitespace-separated, blank-line-delimited file.

    Each non-blank line contributes ``(fields[1], fields[3])`` -- the word
    and its tag -- to the current sentence. A blank line ends the sentence.

    Args:
        path: Object with an ``open()`` method returning a text file.

    Yields:
        Non-empty lists of ``(word, tag)`` tuples.
    """
    with path.open() as file_:
        sent = []
        for line in file_:
            pieces = line.strip().split()
            if pieces:
                sent.append((pieces[1], pieces[3]))
            elif sent:
                yield sent
                sent = []
        # A file that does not end with a blank line still has a pending
        # sentence; the original silently dropped it.
        if sent:
            yield sent
|
|
||||||
|
|
||||||
|
|
||||||
def get_vocab(train, test):
    """Collect words and tags and build their vocabularies.

    The word list holds every training word, then "_UNK_", then every test
    word; the tag list holds every training tag followed by "_START_".
    Word counts cover training words only.

    Args:
        train: Iterable of sentences, each an iterable of ``(word, tag)``.
        test: Same structure; only its words are used.

    Returns:
        ``(words, tags, wc, vw, vt)``: the two lists, the training word
        ``Counter`` and the word / tag ``Vocab`` objects.
    """
    train_pairs = [pair for sentence in train for pair in sentence]
    words = [word for word, _ in train_pairs]
    tags = [tag for _, tag in train_pairs]
    wc = Counter(words)

    words.append("_UNK_")
    tags.append("_START_")

    words.extend(word for sentence in test for word, _ in sentence)

    vw = Vocab.from_corpus([words])
    vt = Vocab.from_corpus([tags])
    return words, tags, wc, vw, vt
|
|
||||||
|
|
||||||
|
|
||||||
class BiTagger(object):
    """Bidirectional-LSTM tagger built on dynet, trained in mini-batches."""

    def __init__(self, vw, vt, nwords, ntags):
        """Create the model parameters.

        Args:
            vw: Word vocabulary (must contain "_UNK_").
            vt: Tag vocabulary.
            nwords: Number of rows in the word embedding table.
            ntags: Number of output tags.
        """
        self.vw = vw
        self.vt = vt
        self.nwords = nwords
        self.ntags = ntags

        self.UNK = self.vw.w2i["_UNK_"]

        self._model = dynet.Model()
        self._sgd = dynet.SimpleSGDTrainer(self._model)

        self._E = self._model.add_lookup_parameters((self.nwords, 128))
        self._p_t1 = self._model.add_lookup_parameters((self.ntags, 30))

        self._pH = self._model.add_parameters((32, 50*2))
        self._pO = self._model.add_parameters((self.ntags, 32))

        self._fwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)
        self._bwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)
        # Examples buffered by update() until a full mini-batch is available.
        self._words_batch = []
        self._tags_batch = []
        self._minibatch_size = 32

    def __call__(self, words):
        """Tag a single sentence and return one tag string per word."""
        dynet.renew_cg()
        word_ids = [self.vw.w2i.get(w, self.UNK) for w in words]
        wembs = [self._E[w] for w in word_ids]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        tags = []
        # bw was produced right-to-left, so reverse it to align with fw.
        for f, b in zip(fw, reversed(bw)):
            r_t = O * (dynet.tanh(H * dynet.concatenate([f, b])))
            out = dynet.softmax(r_t)
            tags.append(self.vt.i2w[np.argmax(out.npvalue())])
        return tags

    def predict_batch(self, words_batch):
        """Tag several sentences at once.

        Shorter sentences are padded with word id 0 up to the longest
        sentence, and every returned tag list has that padded length.
        """
        dynet.renew_cg()
        length = max(len(words) for words in words_batch)
        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, words in enumerate(words_batch):
            for i, word in enumerate(words):
                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
        wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        tags_batch = [[] for _ in range(len(words_batch))]
        for f, b in zip(fw, reversed(bw)):
            r_t = O * (dynet.tanh(H * dynet.concatenate([f, b])))
            out = dynet.softmax(r_t).npvalue()
            for j in range(len(words_batch)):
                tags_batch[j].append(self.vt.i2w[np.argmax(out.T[j])])
        return tags_batch

    def pipe(self, sentences):
        """Yield one tag list per input sentence, predicting in mini-batches."""
        batch = []
        for words in sentences:
            batch.append(words)
            if len(batch) == self._minibatch_size:
                for tags in self.predict_batch(batch):
                    yield tags
                batch = []
        # Flush the final, partially filled batch; the original dropped it,
        # so the trailing sentences were never tagged.
        if batch:
            for tags in self.predict_batch(batch):
                yield tags

    def update(self, words, tags):
        """Buffer one training example; train once a mini-batch is full.

        Returns the batch loss when an update was performed, otherwise 0.
        """
        self._words_batch.append(words)
        self._tags_batch.append(tags)
        if len(self._words_batch) == self._minibatch_size:
            loss = self.update_batch(self._words_batch, self._tags_batch)
            self._words_batch = []
            self._tags_batch = []
        else:
            loss = 0
        return loss

    def update_batch(self, words_batch, tags_batch):
        """Run one SGD step on a batch and return the summed loss."""
        dynet.renew_cg()
        length = max(len(words) for words in words_batch)
        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, words in enumerate(words_batch):
            for i, word in enumerate(words):
                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
        tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, tags in enumerate(tags_batch):
            for i, tag in enumerate(tags):
                # NOTE(review): the fallback is the *word* vocabulary's UNK
                # id, used here as a tag id -- confirm this is intended.
                tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)
        wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]
        # Perturb the embeddings with noise during training.
        wembs = [dynet.noise(we, 0.1) for we in wembs]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        errs = []
        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
            f_b = dynet.concatenate([f,b])
            r_t = O * (dynet.tanh(H * f_b))
            err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
            errs.append(dynet.sum_batches(err))
        sum_errs = dynet.esum(errs)
        losses = sum_errs.scalar_value()
        sum_errs.backward()
        self._sgd.update()
        return losses
|
|
||||||
|
|
||||||
|
|
||||||
def main(train_loc, dev_loc, model_dir):
    """Train the tagger for one pass over the data, reporting progress.

    Every 5000 sentences the trainer status and the mean loss per token
    since the last report are printed; every 10000 sentences the tagging
    accuracy on the dev data is printed.

    Args:
        train_loc: Path of the training file.
        dev_loc: Path of the dev file.
        model_dir: Accepted for the command line but currently unused.
    """
    train_loc = pathlib.Path(train_loc)
    dev_loc = pathlib.Path(dev_loc)

    train = list(read_data((train_loc)))
    test = list(read_data(dev_loc))

    words, tags, wc, vw, vt = get_vocab(train, test)

    nwords = vw.size()
    ntags = vt.size()

    tagger = BiTagger(vw, vt, nwords, ntags)

    # The dev sentences never change, so split them into words and gold
    # tags once instead of on every evaluation.
    word_sents = [[w for w, t in sent] for sent in test]
    gold_sents = [[t for w, t in sent] for sent in test]

    tagged = loss = 0

    # range() works on both Python 2 and 3; xrange() is Python-2-only.
    for ITER in range(1):
        random.shuffle(train)
        for i, s in enumerate(train,1):
            if i % 5000 == 0:
                tagger._sgd.status()
                print(loss / tagged)
                loss = 0
                tagged = 0
            if i % 10000 == 0:
                good = bad = 0.0
                # Tag the dev *sentences*. The original iterated over
                # `words`, the flat vocabulary list, so it tagged single
                # words character by character and clobbered `words`.
                predictions = tagger.pipe(word_sents)
                for guesses, golds in zip(predictions, gold_sents):
                    for go, gu in zip(golds, guesses):
                        if go == gu:
                            good += 1
                        else:
                            bad += 1
                total = good + bad
                # Guard against an empty dev set / no predictions.
                print(good / total if total else 0.0)
            loss += tagger.update([w for w, t in s], [t for w, t in s])
            tagged += len(s)


if __name__ == '__main__':
    plac.call(main)
|
|
Loading…
Reference in New Issue
Block a user