Update train_ner_standalone example

Matthew Honnibal 2017-10-03 18:38:55 +02:00
parent e514d6aa0a
commit cbb1fbef80


@@ -20,9 +20,10 @@ import plac
 from pathlib import Path
 import random
 import json
+import tqdm
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
-import tqdm
 from spacy.vocab import Vocab
 from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
@@ -35,6 +36,7 @@ from spacy.gold import minibatch
 from spacy.scorer import Scorer
 import spacy.util

 try:
     unicode
 except NameError:
@@ -55,20 +57,17 @@ def init_vocab():
 class Pipeline(object):
-    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
             vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if tensorizer is None:
-            tensorizer = TokenVectorEncoder(vocab)
         if entity is None:
             entity = NeuralEntityRecognizer(vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
-        self.tensorizer = tensorizer
         self.entity = entity
-        self.pipeline = [tensorizer, self.entity]
+        self.pipeline = [self.entity]

     def begin_training(self):
         for model in self.pipeline:
@@ -102,10 +101,8 @@ class Pipeline(object):
         golds = [self.make_gold(input_, annot) for input_, annot in
                  zip(inputs, annots)]
-        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
-        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
-                                       sgd=sgd, losses=losses)
-        bp_tensors(d_tensors, sgd=sgd)
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
         return losses

     def evaluate(self, examples):
@@ -123,7 +120,6 @@ class Pipeline(object):
         elif not path.is_dir():
             raise IOError("Can't save pipeline to %s\nNot a directory" % path)
         self.vocab.to_disk(path / 'vocab')
-        self.tensorizer.to_disk(path / 'tensorizer')
         self.entity.to_disk(path / 'ner')

     def from_disk(self, path):
@@ -133,7 +129,6 @@ class Pipeline(object):
         if not path.is_dir():
             raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
         self.vocab = self.vocab.from_disk(path / 'vocab')
-        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
         self.entity = self.entity.from_disk(path / 'ner')
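
With the tensorizer removed, the example's Pipeline holds only the tokenizer and the entity recognizer, and update() passes docs and golds straight to the NER model. Below is a minimal sketch of a training loop against the simplified class; TRAIN_DATA, the optimizer settings, and the exact arguments of Pipeline.update() are illustrative assumptions inferred from the visible hunks, not part of this commit.

# Sketch only: drives the simplified Pipeline on toy data.
# TRAIN_DATA, the annotation format, learning rate, and drop/batch
# settings are illustrative; the real script loads its data from JSON.
import random
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from spacy.gold import minibatch

TRAIN_DATA = [
    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
    ('I like London.', [(7, 13, 'LOC')]),
]

nlp = Pipeline()            # tokenizer + entity recognizer, no tensorizer
nlp.begin_training()
optimizer = Adam(NumpyOps(), 0.001)

for epoch in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=8):
        inputs, annots = zip(*batch)
        # update() now calls self.entity.update(docs, golds, ...) directly;
        # there is no tensorizer forward/backward pass any more.
        nlp.update(list(inputs), list(annots), sgd=optimizer,
                   losses=losses, drop=0.2)
    print(epoch, losses)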