spaCy/examples/training/train_tensorizer.py
2018-11-03 10:54:20 +00:00

53 lines
1.4 KiB
Python

'''Not sure if this is useful -- try training the Tensorizer component.'''
import plac
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu
import tqdm
def load_imdb():
    """Load the IMDB sentiment dataset and return (train_texts, dev_texts).

    The sentiment labels are discarded -- only raw review texts are
    returned, since the tensorizer is trained without supervision here.

    Returns:
        Tuple of (train_texts, dev_texts), each a list of str.
    """
    # Fixed: the original built a blank `nlp` with a sentencizer pipe
    # here, but never used or returned it -- dead work, removed.
    train, dev = thinc.extra.datasets.imdb()
    train_texts, _ = zip(*train)
    dev_texts, _ = zip(*dev)
    return list(train_texts), list(dev_texts)
def get_sentences(nlp, texts):
    """Yield the text of every sentence in every processed document.

    Args:
        nlp: a spaCy-like pipeline object exposing ``pipe(texts)``.
        texts: iterable of raw strings to process.
    """
    for processed in nlp.pipe(texts):
        yield from (sentence.text for sentence in processed.sents)
def prefer_gpu():
    """Try to allocate the GPU; return True if one was activated.

    ``spacy.util.use_gpu(0)`` returns None when no GPU is available,
    so a non-None result means the GPU was claimed.
    """
    # Fixed idiom: collapse the four-line if/else over `is None`
    # into a single boolean expression.
    return spacy.util.use_gpu(0) is not None
def main(vectors_model):
    """Train a Tensorizer (alongside a tagger pipe) on IMDB review texts.

    Args:
        vectors_model: name or path of a spaCy model providing word
            vectors, passed to ``spacy.load``.
    """
    # Renamed from `use_gpu`: the original shadowed the `use_gpu`
    # helper imported from spacy.util at the top of the file.
    using_gpu = prefer_gpu()
    print("Using GPU?", using_gpu)
    print("Load data")
    # NOTE(review): dev_texts is loaded but never evaluated against --
    # presumably a future extension of this experiment.
    train_texts, dev_texts = load_imdb()
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    nlp.add_pipe(nlp.create_pipe('tagger'))
    tensorizer = nlp.create_pipe('tensorizer')
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    # Fixed: the original used `i` for the epoch counter AND re-bound it
    # as the (unused) enumerate index of the inner batch loop.
    for epoch in range(10):
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_texts)):
            docs = [nlp.make_doc(text) for text in batch]
            # Unsupervised update: golds=None, the tensorizer predicts
            # the pretrained vectors from the doc's token features.
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=0.5)
        print(losses)
if __name__ == '__main__':
    # plac derives the command-line interface from main()'s signature,
    # so `vectors_model` becomes a required positional argument.
    plac.call(main)