Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-12 01:20:35 +03:00)

Commit fdc9242bc1 ("small-changes")
Parent: a5a3ed722c
@@ -5,11 +5,13 @@ from wasabi import msg
 from .. import util

 class OptimizerWorker:
-    def __init__(self, config_path):
+    def __init__(self, config_path, world_size, sync=True):
         self.optimizer = _create_optimizer(config_path)
         self.weights_dict = {}
+        self.world_size = world_size
+        self.sync = sync

-    def call(self, key, weights, gradient, *, lr_scale=1.0):
+    def call(self, rank, key, weights, gradient, *, lr_scale=1.0):
         if key not in self.weights_dict:
             self.weights_dict[key] = weights.copy()
         new_weights, new_grads = self.optimizer(
@@ -26,18 +28,19 @@ class OptimizerWorker:
 class RayOptimizer:
     local_optimizer = None

-    def __init__(self, config_path, use_gpu):
+    def __init__(self, config_path, use_gpu, rank):
         RemoteOptimizer = ray.remote(OptimizerWorker)
         if use_gpu >= 0:
             RemoteOptimizer = RemoteOptimizer.options(num_gpus=0.1)
         self.optimizer = RemoteOptimizer.remote(config_path)
+        self.rank = rank
         self.sync()

     def sync(self):
         self.local_optimizer = ray.get(self.optimizer.fetch.remote())

     def __call__(self, *args, **kwargs):
-        weights, grads = ray.get(self.optimizer.call.remote(*args, **kwargs))
+        weights, grads = ray.get(self.optimizer.call.remote(self.rank, *args, **kwargs))
         return weights.copy(), grads.copy()

     def __getattr__(self, name):
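For context: OptimizerWorker is the Ray actor that holds the authoritative optimizer state, and RayOptimizer is the per-worker proxy; after this change every remote call also carries the caller's rank. Below is a minimal, self-contained sketch of that round trip. It is illustrative only, not spaCy code: the ParamServer name, the plain-SGD step with a 0.001 learning rate, and the "tok2vec_W" key are stand-ins.

# Illustrative sketch only (not spaCy code): the parameter-server round trip
# that OptimizerWorker/RayOptimizer implement.
import numpy
import ray


@ray.remote
class ParamServer:
    def __init__(self, world_size):
        self.world_size = world_size
        self.weights_dict = {}  # one authoritative copy per parameter key

    def call(self, rank, key, weights, gradient, *, lr_scale=1.0):
        if key not in self.weights_dict:
            # First caller for a key seeds the server's copy.
            self.weights_dict[key] = weights.copy()
        # Stand-in update; the real worker delegates to its optimizer object.
        self.weights_dict[key] -= 0.001 * lr_scale * gradient
        # Return fresh weights and a zeroed gradient, as the real worker does.
        return self.weights_dict[key], gradient * 0.0


if __name__ == "__main__":
    ray.init()
    server = ParamServer.remote(world_size=2)
    weights = numpy.ones(4, dtype="f")
    gradient = numpy.full(4, 0.5, dtype="f")
    # Rank 0 pushes its gradient and gets the updated weights back.
    new_weights, new_gradient = ray.get(server.call.remote(0, "tok2vec_W", weights, gradient))
    print(new_weights, new_gradient)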
@@ -69,7 +69,7 @@ class AllreduceOptimizer:
         weights = self.allreduce(weights) / self.communicator.size()


-        gradient = self.allreduce(gradient)
+        gradient = self.allreduce(gradient) / self.communicator.size()
         flat_weights, gradient = self.optimizer(key, weights, gradient, lr_scale=lr_scale)
         return flat_weights, gradient

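The amended line makes the gradient treatment match the weights: an allreduce sums every worker's copy, and dividing by communicator.size() turns that sum into an average. A tiny sketch of the arithmetic; the allreduce function below is a stand-in for the real collective, not spaCy's implementation.

# Sketch of sum-then-average allreduce semantics.
import numpy


def allreduce(copies):
    # Elementwise sum across all workers' copies.
    return sum(copies)


# One gradient copy per worker (world size 2).
copies = [numpy.array([1.0, 2.0]), numpy.array([3.0, 4.0])]
summed = allreduce(copies)
averaged = summed / len(copies)
print(summed)    # [4. 6.]
print(averaged)  # [2. 3.]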
@@ -198,7 +198,7 @@ def train_cli(

     if num_workers and num_workers > 1:
         import ray
-        ray.init()
+        ray.init(address="auto")
         if strategy == "ps":
             from spacy.cli.ray_param_server import RayOptimizer
         remote_train = ray.remote(setup_and_train)
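For reference, ray.init() with no arguments starts a throwaway single-node Ray instance inside the current process, while ray.init(address="auto") attaches to a cluster that is already running (for example one started with `ray start --head`). A minimal check, assuming such a cluster exists:

# Assumes a Ray cluster was started beforehand, e.g. with `ray start --head`.
import ray

ray.init(address="auto")        # attach to the running cluster
print(ray.cluster_resources())  # e.g. {'CPU': 8.0, ...}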
@@ -401,8 +401,12 @@ def train(
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
     print_row = setup_printer(training, nlp)

+    tqdm_args = dict(total=training["eval_frequency"], leave=False)
+    global world_rank
+    if world_rank is not None:
+        tqdm_args["disable"] = bool(world_rank != 0)
     try:
-        progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+        progress = tqdm.tqdm(**tqdm_args)
         for batch, info, is_best_checkpoint in training_step_iterator:
             progress.update(1)
             if is_best_checkpoint is not None:
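The new tqdm_args dict lets the progress bar be silenced on every worker except rank 0, so a multi-worker run prints a single bar. A standalone sketch of the same pattern; world_rank is hard-coded here for illustration, whereas in the diff it is a module-level global.

# Standalone sketch: only rank 0 draws a progress bar; other ranks pass disable=True.
import tqdm

world_rank = 1
tqdm_args = dict(total=100, leave=False)
if world_rank is not None:
    tqdm_args["disable"] = bool(world_rank != 0)

progress = tqdm.tqdm(**tqdm_args)
for _ in range(100):
    progress.update(1)
progress.close()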
@@ -411,7 +415,7 @@ def train(
                 if is_best_checkpoint and output_path is not None:
                     update_meta(training, nlp, info)
                     nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+                progress = tqdm.tqdm(**tqdm_args)
             # Clean up the objects to faciliate garbage collection.
             for eg in batch:
                 eg.doc = None
@@ -437,6 +441,10 @@ def train(

 def create_train_batches(nlp, corpus, cfg):
     epochs_todo = cfg.get("max_epochs", 0)
+    if world_rank is not None:
+        for i in range(world_rank):
+            # Increment random seed
+            random.random()
     while True:
         train_examples = list(
             corpus.train_dataset(
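The added loop draws (and discards) world_rank values from Python's global RNG before batching starts, leaving each worker's RNG in a different state, so the later random.shuffle orders the corpus differently per worker. A small demonstration of that effect; the explicit seed below is added only to make the example reproducible and is not part of the diff.

# Demonstration of rank-dependent RNG advancement.
import random


def shuffled_for_rank(items, world_rank, seed=0):
    random.seed(seed)             # every worker starts from the same state
    for _ in range(world_rank):   # burn `world_rank` draws, as in the diff
        random.random()
    items = list(items)
    random.shuffle(items)
    return items


print(shuffled_for_rank(range(8), world_rank=0))
print(shuffled_for_rank(range(8), world_rank=1))  # different order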
@@ -452,16 +460,18 @@ def create_train_batches(nlp, corpus, cfg):
             raise ValueError(Errors.E988)
         random.shuffle(train_examples)

-        if world_size is not None:
-            # Taken from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/distributed.py
-            num_samples = int(math.ceil(len(train_examples) * 1.0 / world_size))
-            total_size = num_samples * world_size # expected to overflow
-            train_examples += train_examples[:(total_size - len(train_examples))]
-            assert len(train_examples) == total_size
+        # # TODO: with large batches, this can be bad.
+        # if world_size is not None:
+        #     # Taken from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/distributed.py
+        #     num_samples = int(math.ceil(len(train_examples) * 1.0 / world_size))
+        #     total_size = num_samples * world_size # expected to overflow
+        #     train_examples += train_examples[:(total_size - len(train_examples))]
+        #     assert len(train_examples) == total_size

-            # subsample
-            train_examples = train_examples[world_rank:total_size:world_size]
-            assert len(train_examples) == num_samples
+        #     # subsample
+        #     train_examples = train_examples[world_rank:total_size:world_size]
+        #     assert len(train_examples) == num_samples
+        # print(f"Reset epoch: Only using {num_samples} out of {total_size} samples")

         batches = util.minibatch_by_words(
             train_examples,
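The block being commented out mirrored torch's DistributedSampler: pad the shuffled examples by wrapping around so the count divides evenly, then give each rank every world_size-th item; the new TODO notes this interacts badly with large batches. For reference, a standalone version of that pad-and-stride scheme (not the spaCy code itself):

# Standalone version of the pad-and-stride partitioning the commented-out
# block performed (after torch.utils.data.distributed.DistributedSampler).
import math


def partition(examples, world_rank, world_size):
    num_samples = int(math.ceil(len(examples) / world_size))
    total_size = num_samples * world_size
    # Pad by wrapping around so every rank gets exactly num_samples items.
    examples = examples + examples[: total_size - len(examples)]
    assert len(examples) == total_size
    # Each rank takes every world_size-th item, starting at its own offset.
    shard = examples[world_rank:total_size:world_size]
    assert len(shard) == num_samples
    return shard


print(partition(list("abcdefghij"), world_rank=0, world_size=3))  # ['a', 'd', 'g', 'j']
print(partition(list("abcdefghij"), world_rank=1, world_size=3))  # ['b', 'e', 'h', 'a']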
@@ -474,7 +484,7 @@ def create_train_batches(nlp, corpus, cfg):
             yield first
         except StopIteration:
             raise ValueError(Errors.E986)
-        for batch in batches:
+        for i, batch in enumerate(batches):
             yield batch
         epochs_todo -= 1
         # We intentionally compare exactly to 0 here, so that max_epochs < 1