mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Add util.env_opt support: Can set hyper params through environment variables.
This commit is contained in:
parent
d2626fdb45
commit
fc8d3a112c
|
@ -17,7 +17,7 @@ from .. import displacy
|
||||||
|
|
||||||
|
|
||||||
def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
|
def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
|
||||||
tagger, parser, ner, parser_L1):
|
use_gpu, tagger, parser, ner, parser_L1):
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
train_path = util.ensure_path(train_data)
|
train_path = util.ensure_path(train_data)
|
||||||
dev_path = util.ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_data)
|
||||||
|
@ -46,7 +46,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
|
||||||
gold_train = list(read_gold_json(train_path, limit=n_sents))
|
gold_train = list(read_gold_json(train_path, limit=n_sents))
|
||||||
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
|
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
|
||||||
|
|
||||||
train_model(lang, gold_train, gold_dev, output_path, n_iter)
|
train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
|
||||||
if gold_dev:
|
if gold_dev:
|
||||||
scorer = evaluate(lang, gold_dev, output_path)
|
scorer = evaluate(lang, gold_dev, output_path)
|
||||||
print_results(scorer)
|
print_results(scorer)
|
||||||
|
@ -65,27 +65,27 @@ def train_config(config):
|
||||||
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
|
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
|
||||||
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
|
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
|
||||||
|
|
||||||
nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
|
nlp = Language(pipeline=['token_vectors', 'tags']) #, 'dependencies'])
|
||||||
|
dropout = util.env_opt('dropout', 0.0)
|
||||||
# TODO: Get spaCy using Thinc's trainer and optimizer
|
# TODO: Get spaCy using Thinc's trainer and optimizer
|
||||||
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
|
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
|
||||||
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
|
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
|
||||||
losses = defaultdict(float)
|
losses = defaultdict(float)
|
||||||
to_render = []
|
to_render = []
|
||||||
for i, (docs, golds) in enumerate(epoch):
|
for i, (docs, golds) in enumerate(epoch):
|
||||||
state = nlp.update(docs, golds, drop=0., sgd=optimizer)
|
state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
|
||||||
losses['dep_loss'] += state.get('parser_loss', 0.0)
|
losses['dep_loss'] += state.get('parser_loss', 0.0)
|
||||||
|
losses['tag_loss'] += state.get('tagger_loss', 0.0)
|
||||||
to_render.insert(0, nlp(docs[-1].text))
|
to_render.insert(0, nlp(docs[-1].text))
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
to_render[0].user_data['title'] = "Batch %d" % i
|
||||||
with Path('/tmp/entities.html').open('w') as file_:
|
with Path('/tmp/entities.html').open('w') as file_:
|
||||||
html = displacy.render(to_render[:5], style='ent', page=True,
|
html = displacy.render(to_render[:5], style='ent', page=True)
|
||||||
options={'compact': True})
|
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
with Path('/tmp/parses.html').open('w') as file_:
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True,
|
html = displacy.render(to_render[:5], style='dep', page=True)
|
||||||
options={'compact': True})
|
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
if dev_data:
|
if dev_data:
|
||||||
|
with nlp.use_params(optimizer.averages):
|
||||||
dev_scores = trainer.evaluate(dev_data).scores
|
dev_scores = trainer.evaluate(dev_data).scores
|
||||||
else:
|
else:
|
||||||
dev_scores = defaultdict(float)
|
dev_scores = defaultdict(float)
|
||||||
|
|
|
@ -8,6 +8,7 @@ import ujson
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .util import ensure_path
|
from .util import ensure_path
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
|
@ -138,7 +139,8 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
return prev_costs[n_gold], previous_row[-1]
|
return prev_costs[n_gold], previous_row[-1]
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, make_supertags=False, limit=None):
|
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
||||||
|
make_supertags = util.env_opt('make_supertags', make_supertags)
|
||||||
loc = ensure_path(loc)
|
loc = ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
for filename in loc.iterdir():
|
for filename in loc.iterdir():
|
||||||
|
|
|
@ -220,10 +220,13 @@ cdef class Parser:
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
|
||||||
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
|
maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
|
||||||
lower = PrecomputableMaxouts(hidden_width,
|
lower = PrecomputableMaxouts(hidden_width,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
nI=token_vector_width,
|
nI=token_vector_width,
|
||||||
pieces=cfg.get('maxout_pieces', 1))
|
pieces=maxout_pieces)
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
upper = chain(
|
upper = chain(
|
||||||
|
@ -346,7 +349,8 @@ cdef class Parser:
|
||||||
|
|
||||||
backprops = []
|
backprops = []
|
||||||
cdef float loss = 0.
|
cdef float loss = 0.
|
||||||
while todo:
|
cutoff = max(1, len(todo) // 10)
|
||||||
|
while len(todo) >= cutoff:
|
||||||
states, golds = zip(*todo)
|
states, golds = zip(*todo)
|
||||||
|
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
|
@ -398,7 +402,7 @@ cdef class Parser:
|
||||||
def get_token_ids(self, states):
|
def get_token_ids(self, states):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
cdef int n_tokens = self.nr_feature
|
cdef int n_tokens = self.nr_feature
|
||||||
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
|
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
|
||||||
for i, state in enumerate(states):
|
for i, state in enumerate(states):
|
||||||
state.set_context_tokens(ids[i])
|
state.set_context_tokens(ids[i])
|
||||||
return ids
|
return ids
|
||||||
|
|
|
@ -7,25 +7,32 @@ from cytoolz import partition_all
|
||||||
|
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
|
from thinc.neural.train import Trainer as ThincTrainer
|
||||||
|
|
||||||
from .syntax.nonproj import PseudoProjectivity
|
from .syntax.nonproj import PseudoProjectivity
|
||||||
from .gold import GoldParse, merge_sents
|
from .gold import GoldParse, merge_sents
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .tokens.doc import Doc
|
from .tokens.doc import Doc
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
class Trainer(object):
|
class Trainer(object):
|
||||||
"""
|
"""
|
||||||
Manage training of an NLP pipeline.
|
Manage training of an NLP pipeline.
|
||||||
"""
|
"""
|
||||||
def __init__(self, nlp, gold_tuples):
|
def __init__(self, nlp, gold_tuples, **cfg):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.nr_epoch = 0
|
self.nr_epoch = 0
|
||||||
self.optimizer = Adam(NumpyOps(), 0.001)
|
self.optimizer = Adam(NumpyOps(), 0.001)
|
||||||
self.gold_tuples = gold_tuples
|
self.gold_tuples = gold_tuples
|
||||||
|
self.cfg = cfg
|
||||||
|
self.batch_size = float(util.env_opt('min_batch_size', 4))
|
||||||
|
self.max_batch_size = util.env_opt('max_batch_size', 64)
|
||||||
|
self.accel_batch_size = util.env_opt('batch_accel', 1.001)
|
||||||
|
|
||||||
def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
|
def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
|
||||||
cached_golds = {}
|
cached_golds = {}
|
||||||
|
cached_docs = {}
|
||||||
def _epoch(indices):
|
def _epoch(indices):
|
||||||
all_docs = []
|
all_docs = []
|
||||||
all_golds = []
|
all_golds = []
|
||||||
|
@ -36,20 +43,26 @@ class Trainer(object):
|
||||||
else:
|
else:
|
||||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||||
if augment_data is None:
|
if augment_data is None:
|
||||||
docs = self.make_docs(raw_text, paragraph_tuples)
|
if i not in cached_docs:
|
||||||
if i in cached_golds:
|
cached_docs[i] = self.make_docs(raw_text, paragraph_tuples)
|
||||||
|
docs = cached_docs[i]
|
||||||
|
if i not in cached_golds:
|
||||||
|
cached_golds[i] = self.make_golds(docs, paragraph_tuples)
|
||||||
golds = cached_golds[i]
|
golds = cached_golds[i]
|
||||||
else:
|
|
||||||
golds = self.make_golds(docs, paragraph_tuples)
|
|
||||||
else:
|
else:
|
||||||
raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
|
raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
|
||||||
docs = self.make_docs(raw_text, paragraph_tuples)
|
docs = self.make_docs(raw_text, paragraph_tuples)
|
||||||
golds = self.make_golds(docs, paragraph_tuples)
|
golds = self.make_golds(docs, paragraph_tuples)
|
||||||
all_docs.extend(docs)
|
all_docs.extend(docs)
|
||||||
all_golds.extend(golds)
|
all_golds.extend(golds)
|
||||||
for batch in partition_all(12, zip(tqdm.tqdm(all_docs), all_golds)):
|
|
||||||
X, y = zip(*batch)
|
thinc_trainer = ThincTrainer(self.nlp.pipeline[0].model)
|
||||||
|
thinc_trainer.batch_size = int(self.batch_size)
|
||||||
|
thinc_trainer.nb_epoch = 1
|
||||||
|
for X, y in thinc_trainer.iterate(all_docs, all_golds):
|
||||||
yield X, y
|
yield X, y
|
||||||
|
thinc_trainer.batch_size = min(int(self.batch_size), self.max_batch_size)
|
||||||
|
self.batch_size *= self.accel_batch_size
|
||||||
|
|
||||||
indices = list(range(len(self.gold_tuples)))
|
indices = list(range(len(self.gold_tuples)))
|
||||||
for itn in range(nr_epoch):
|
for itn in range(nr_epoch):
|
||||||
|
@ -78,7 +91,8 @@ class Trainer(object):
|
||||||
if raw_text is not None:
|
if raw_text is not None:
|
||||||
return [self.nlp.make_doc(raw_text)]
|
return [self.nlp.make_doc(raw_text)]
|
||||||
else:
|
else:
|
||||||
return [Doc(self.nlp.vocab, words=sent_tuples[0][1])
|
return [
|
||||||
|
Doc(self.nlp.vocab, words=sent_tuples[0][1])
|
||||||
for sent_tuples in paragraph_tuples]
|
for sent_tuples in paragraph_tuples]
|
||||||
|
|
||||||
def make_golds(self, docs, paragraph_tuples):
|
def make_golds(self, docs, paragraph_tuples):
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
import ujson
|
import ujson
|
||||||
import pip
|
import pip
|
||||||
import importlib
|
import importlib
|
||||||
|
@ -160,7 +161,23 @@ def get_async(stream, numpy_array):
|
||||||
if cupy is None:
|
if cupy is None:
|
||||||
return numpy_array
|
return numpy_array
|
||||||
else:
|
else:
|
||||||
return cupy.array(numpy_array, stream=stream)
|
array = cupy.ndarray(numpy_array.shape, order='C',
|
||||||
|
dtype=numpy_array.dtype)
|
||||||
|
array.set(numpy_array, stream=stream)
|
||||||
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def env_opt(name, default=None):
    """Read an option from the environment, falling back to a default.

    Two environment variables are consulted, in this order: ``name`` exactly
    as given, then ``'SPACY_' + name.upper()``. The first one that is set
    wins. The raw string is converted to the type of ``default`` so callers
    get back the same kind of value they passed in.

    name (str): Name of the option, e.g. ``'dropout'``.
    default: Value to return when neither variable is set. Its type also
        selects how the environment string is converted.
    RETURNS: The converted environment value, or ``default`` if unset.
    RAISES: Whatever ``type(default)`` raises for an unparseable string,
        e.g. ValueError for ``int('abc')``.
    """
    # Spellings that switch a boolean option off. Any other non-empty string
    # stays truthy, which is what plain ``bool(str)`` did before.
    false_strings = ('0', 'false', 'f', 'no', 'n', 'off')
    for key in (name, 'SPACY_' + name.upper()):
        if key not in os.environ:
            continue
        raw_value = os.environ[key]
        print("Get from env", name, raw_value)
        if default is None:
            # There is no type to convert to (NoneType is not callable with
            # an argument), so hand back the raw string.
            return raw_value
        if isinstance(default, bool):
            # bool('0') and bool('false') are both True; parse the common
            # "off" spellings explicitly so flags can actually be disabled.
            if raw_value.strip().lower() in false_strings:
                return False
            return bool(raw_value)
        return type(default)(raw_value)
    print("Default", name, default)
    return default
|
||||||
|
|
||||||
|
|
||||||
def read_regex(path):
|
def read_regex(path):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user