diff --git a/examples/chainer_sentiment.py b/examples/chainer_sentiment.py deleted file mode 100644 index 747ef508a..000000000 --- a/examples/chainer_sentiment.py +++ /dev/null @@ -1,322 +0,0 @@ -'''WIP --- Doesn't work well yet''' -import plac -import random -import six - -import cProfile -import pstats - -import pathlib -import cPickle as pickle -from itertools import izip - -import spacy - -import cytoolz -import cupy as xp -import cupy.cuda -import chainer.cuda - -import chainer.links as L -import chainer.functions as F -from chainer import Chain, Variable, report -import chainer.training -import chainer.optimizers -from chainer.training import extensions -from chainer.iterators import SerialIterator -from chainer.datasets import TupleDataset - - -class SentimentAnalyser(object): - @classmethod - def load(cls, path, nlp, max_length=100): - raise NotImplementedError - #with (path / 'config.json').open() as file_: - # model = model_from_json(file_.read()) - #with (path / 'model').open('rb') as file_: - # lstm_weights = pickle.load(file_) - #embeddings = get_embeddings(nlp.vocab) - #model.set_weights([embeddings] + lstm_weights) - #return cls(model, max_length=max_length) - - def __init__(self, model, max_length=100): - self._model = model - self.max_length = max_length - - def __call__(self, doc): - X = get_features([doc], self.max_length) - y = self._model.predict(X) - self.set_sentiment(doc, y) - - def pipe(self, docs, batch_size=1000, n_threads=2): - for minibatch in cytoolz.partition_all(batch_size, docs): - minibatch = list(minibatch) - sentences = [] - for doc in minibatch: - sentences.extend(doc.sents) - Xs = get_features(sentences, self.max_length) - ys = self._model.predict(Xs) - for sent, label in zip(sentences, ys): - sent.doc.sentiment += label - 0.5 - for doc in minibatch: - yield doc - - def set_sentiment(self, doc, y): - doc.sentiment = float(y[0]) - # Sentiment has a native slot for a single float. - # For arbitrary data storage, there's: - # doc.user_data['my_data'] = y - - -class Classifier(Chain): - def __init__(self, predictor): - super(Classifier, self).__init__(predictor=predictor) - - def __call__(self, x, t): - y = self.predictor(x) - loss = F.softmax_cross_entropy(y, t) - accuracy = F.accuracy(y, t) - report({'loss': loss, 'accuracy': accuracy}, self) - return loss - - -class SentimentModel(Chain): - def __init__(self, nlp, shape, **settings): - Chain.__init__(self, - embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], - set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), - encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), - attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), - predict=_Predict(shape['nr_hidden'], shape['nr_class'])) - self.to_gpu(0) - - def __call__(self, sentence): - return self.predict( - self.attend( - self.encode( - self.embed(sentence)))) - - -class _Embed(Chain): - def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): - Chain.__init__(self, - embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), - project=L.Linear(None, nr_out, nobias=True)) - self.embed.W.volatile = False - - def __call__(self, sentence): - return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] - - -class _Encode(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - fwd=L.LSTM(nr_in, nr_out), - bwd=L.LSTM(nr_in, nr_out), - mix=L.Bilinear(nr_out, nr_out, nr_out)) - - def __call__(self, sentence): - self.fwd.reset_state() - fwds = map(self.fwd, sentence) - self.bwd.reset_state() - bwds = reversed(map(self.bwd, reversed(sentence))) - return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] - - -class _Attend(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self) - - def __call__(self, sentence): - sent = sum(sentence) - return sent - - -class _Predict(Chain): - def __init__(self, nr_in, nr_out): - Chain.__init__(self, - l1=L.Linear(nr_in, nr_in), - l2=L.Linear(nr_in, nr_out)) - - def __call__(self, vector): - vector = self.l1(vector) - vector = F.elu(vector) - vector = self.l2(vector) - return vector - - -class SentenceDataset(TupleDataset): - def __init__(self, nlp, texts, labels, max_length): - self.max_length = max_length - sents, labels = self._get_labelled_sentences( - nlp.pipe(texts, batch_size=5000, n_threads=3), - labels) - TupleDataset.__init__(self, - get_features(sents, max_length), - labels) - - def __getitem__(self, index): - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - returns = [tuple([batch[i] for batch in batches]) - for i in six.moves.range(length)] - return returns - else: - return tuple(batches) - - def _get_labelled_sentences(self, docs, doc_labels): - labels = [] - sentences = [] - for doc, y in izip(docs, doc_labels): - for sent in doc.sents: - sentences.append(sent) - labels.append(y) - return sentences, xp.asarray(labels, dtype='i') - - -class DocDataset(TupleDataset): - def __init__(self, nlp, texts, labels): - self.max_length = max_length - DatasetMixin.__init__(self, - get_features( - nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), - labels) - -def read_data(data_dir, limit=0): - examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): - for filename in (data_dir / subdir).iterdir(): - with filename.open() as file_: - text = file_.read() - examples.append((text, label)) - random.shuffle(examples) - if limit >= 1: - examples = examples[:limit] - return zip(*examples) # Unzips into two lists - - -def get_features(docs, max_length): - docs = list(docs) - Xs = xp.zeros((len(docs), max_length), dtype='i') - for i, doc in enumerate(docs): - j = 0 - for token in doc: - if token.has_vector and not token.is_punct and not token.is_space: - Xs[i, j] = token.norm - j += 1 - if j >= max_length: - break - return Xs - - -def set_vectors(vectors, vocab): - for lex in vocab: - if lex.has_vector and (lex.rank+1) < vectors.shape[0]: - lex.norm = lex.rank+1 - vectors[lex.rank + 1] = lex.vector - else: - lex.norm = 0 - return vectors - - -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, - by_sentence=True): - nlp = spacy.load('en', entity=False) - if 'nr_vector' not in lstm_shape: - lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) - if 'nr_dim' not in lstm_shape: - lstm_shape['nr_dim'] = nlp.vocab.vectors_length - print("Make model") - model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) - print("Parsing texts...") - if by_sentence: - train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) - dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) - else: - train_data = DocDataset(nlp, train_texts, train_labels) - dev_data = DocDataset(nlp, dev_texts, dev_labels) - train_iter = SerialIterator(train_data, batch_size=batch_size, - shuffle=True, repeat=True) - dev_iter = SerialIterator(dev_data, batch_size=batch_size, - shuffle=False, repeat=False) - optimizer = chainer.optimizers.Adam() - optimizer.setup(model) - updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) - trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') - - trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) - trainer.extend(extensions.LogReport()) - trainer.extend(extensions.PrintReport([ - 'epoch', 'main/accuracy', 'validation/main/accuracy'])) - trainer.extend(extensions.ProgressBar()) - - trainer.run() - - -def evaluate(model_dir, texts, labels, max_length=100): - def create_pipeline(nlp): - ''' - This could be a lambda, but named functions are easier to read in Python. - ''' - return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, - max_length=max_length)] - - nlp = spacy.load('en') - nlp.pipeline = create_pipeline(nlp) - - correct = 0 - i = 0 - for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): - correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) - i += 1 - return float(correct) / i - - -@plac.annotations( - train_dir=("Location of training file or directory"), - dev_dir=("Location of development file or directory"), - model_dir=("Location of output model directory",), - is_runtime=("Demonstrate run-time usage", "flag", "r", bool), - nr_hidden=("Number of hidden units", "option", "H", int), - max_length=("Maximum sentence length", "option", "L", int), - dropout=("Dropout", "option", "d", float), - learn_rate=("Learn rate", "option", "e", float), - nb_epoch=("Number of training epochs", "option", "i", int), - batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) -) -def main(model_dir, train_dir, dev_dir, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=32, nr_examples=-1): # Training params - model_dir = pathlib.Path(model_dir) - train_dir = pathlib.Path(train_dir) - dev_dir = pathlib.Path(dev_dir) - if is_runtime: - dev_texts, dev_labels = read_data(dev_dir) - acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) - print(acc) - else: - print("Read data") - train_texts, train_labels = read_data(train_dir, limit=nr_examples) - dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) - print("Using GPU 0") - #chainer.cuda.get_device(0).use() - train_labels = xp.asarray(train_labels, dtype='i') - dev_labels = xp.asarray(dev_labels, dtype='i') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, - 'nr_vector': 5000}, - {'dropout': 0.5, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) - - -if __name__ == '__main__': - #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - #s = pstats.Stats("Profile.prof") - #s.strip_dirs().sort_stats("time").print_stats() - plac.call(main) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py new file mode 100644 index 000000000..9aa9fda56 --- /dev/null +++ b/examples/vectors_fast_text.py @@ -0,0 +1,30 @@ +'''Load vectors for a language trained using FastText + +https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md +''' +from __future__ import unicode_literals +import plac +import numpy + +import spacy.language + + +def main(vectors_loc): + nlp = spacy.language.Language() + + with open(vectors_loc, 'rb') as file_: + header = file_.readline() + nr_row, nr_dim = header.split() + nlp.vocab.clear_vectors(int(nr_dim)) + for line in file_: + line = line.decode('utf8') + pieces = line.split() + word = pieces[0] + vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') + nlp.vocab.set_vector(word, vector) + doc = nlp(u'class colspan') + print(doc[0].similarity(doc[1])) + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index da0489bbf..8ce150531 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -16,7 +16,16 @@ from .compat import basestring_ cdef class Vectors: - '''Store, save and load word vectors.''' + '''Store, save and load word vectors. + + Vectors data is kept in the vectors.data attribute, which should be an + instance of numpy.ndarray (for CPU vectors) + or cupy.ndarray (for GPU vectors). + + vectors.key2row is a dictionary mapping word hashes to rows + in the vectors.data table. The array `vectors.keys` keeps + the keys in order, such that keys[vectors.key2row[key]] == key. + ''' cdef public object data cdef readonly StringStore strings cdef public object key2row @@ -44,6 +53,11 @@ cdef class Vectors: return (Vectors, (self.strings, self.data)) def __getitem__(self, key): + '''Get a vector by key. If key is a string, it is hashed + to an integer ID using the vectors.strings table. + + If the integer key is not found in the table, a KeyError is raised. + ''' if isinstance(key, basestring): key = self.strings[key] i = self.key2row[key] @@ -53,23 +67,30 @@ cdef class Vectors: return self.data[i] def __setitem__(self, key, vector): + '''Set a vector for the given key. If key is a string, it is hashed + to an integer ID using the vectors.strings table. + ''' if isinstance(key, basestring): key = self.strings.add(key) i = self.key2row[key] self.data[i] = vector def __iter__(self): + '''Yield vectors from the table.''' yield from self.data def __len__(self): + '''Return the number of vectors that have been assigned.''' return self.i def __contains__(self, key): + '''Check whether a key has a vector entry in the table.''' if isinstance(key, basestring_): key = self.strings[key] return key in self.key2row def add(self, key, vector=None): + '''Add a key to the table, optionally setting a vector value as well.''' if isinstance(key, basestring_): key = self.strings.add(key) if key not in self.key2row: @@ -87,6 +108,7 @@ cdef class Vectors: return i def items(self): + '''Iterate over (string key, vector) pairs, in order.''' for i, key in enumerate(self.keys): string = self.strings[key] yield string, self.data[i] diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 21c2a1165..a5f8bf6ad 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -262,7 +262,7 @@ cdef class Vocab: Words can be looked up by string or int ID. RETURNS: - A word vector. Size and shape determed by the + A word vector. Size and shape determined by the vocab.vectors instance. Usually, a numpy ndarray of shape (300,) and dtype float32.