some-allreduce

parent: ef2af90f54
commit: 8cc49c5a03
@@ -2,16 +2,23 @@ import ray
 from wasabi import msg
 from .. import util
 
+cp = None
+nccl = None
+
+from typing import Dict, Optional, Union, Tuple, List, cast
+from thinc.types import FloatsXd
+
+def _create_optimizer(config_path):
+    msg.info(f"Loading config from: {config_path}")
+    config = util.load_config(config_path, create_objects=False)
+    util.fix_random_seed(config["training"]["seed"]) # Fix this.
+    config = util.load_config(config_path, create_objects=True)
+    training = config["training"]
+    return training["optimizer"]
 
 class OptimizerWorker:
     def __init__(self, config_path):
-        msg.info(f"Loading config from: {config_path}")
-        config = util.load_config(config_path, create_objects=False)
-        util.fix_random_seed(config["training"]["seed"])
-        config = util.load_config(config_path, create_objects=True)
-        training = config["training"]
-        optimizer = training["optimizer"]
-        self.optimizer = optimizer
+        self.optimizer = _create_optimizer(config_path)
         self.weights_dict = {}
 
     def call(self, key, weights, gradient, *, lr_scale=1.0):
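For orientation (not part of the diff): the new `_create_optimizer` helper loads the config twice on purpose. The first `util.load_config(..., create_objects=False)` call returns the raw values so the random seed can be fixed before anything is instantiated; the second call with `create_objects=True` resolves the registry and yields the Thinc optimizer that both the parameter-server path and the new allreduce path share. A minimal standalone sketch of the same pattern, using the same spacy.util calls as the diff and a placeholder "config.cfg" path:

# Sketch only: mirrors the _create_optimizer helper above; "config.cfg" is a placeholder.
from spacy import util

config_path = "config.cfg"
raw = util.load_config(config_path, create_objects=False)     # plain values, nothing instantiated
util.fix_random_seed(raw["training"]["seed"])                 # fix the seed before objects are built
config = util.load_config(config_path, create_objects=True)   # registry-resolved objects
optimizer = config["training"]["optimizer"]                   # the shared Thinc optimizer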
@@ -51,3 +58,57 @@ class RayOptimizer:
     def step_schedules(self):
         self.optimizer.step_schedules.remote()
         self.sync()
+
+class RayWorker:
+    def __init__(self, rank, world_size):
+        global nccl
+        from cupy.cuda import nccl
+        self.rank = rank
+        self.world_size = world_size
+        self.unique_id = nccl.get_unique_id()
+
+    def initialize(self, head_id):
+        self.communicator = nccl.NcclCommunicator(self.world_size, head_id, self.rank)
+
+    def get_unique_id(self):
+        return self.unique_id
+
+    def execute(self, fn):
+        return fn(self)
+
+class AllreduceOptimizer:
+    def __init__(self, config_path, communicator):
+        global cp
+        import cupy as cp
+        global nccl
+        from cupy.cuda import nccl
+        self.optimizer = _create_optimizer(config_path)
+        self.communicator = communicator
+
+    def allreduce(self, tensor):
+        self.communicator.allReduce(
+            tensor.data.ptr,
+            tensor.data.ptr,
+            tensor.size,
+            nccl.NCCL_FLOAT32,
+            nccl.NCCL_SUM, # TODO: is this a sum?
+            cp.cuda.Stream.null.ptr
+        )
+        return tensor
+
+    def __call__(
+        self,
+        key: Tuple[int, str],
+        weights: FloatsXd,
+        gradient: FloatsXd,
+        *,
+        lr_scale: float = 1.0,
+    ):
+        # weights = self.allreduce(weights)
+        gradient = self.allreduce(gradient)
+        flat_weights, gradient = self.optimizer(key, weights, gradient, lr_scale=lr_scale)
+        return flat_weights, gradient
+
+
+    def __getattr__(self, name):
+        return getattr(self.optimizer, name)
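A note on the "# TODO: is this a sum?" left in `allreduce`: `nccl.NCCL_SUM` does perform an elementwise sum, and because the send and receive arguments point at the same buffer the reduction happens in place, so every rank ends up holding the summed gradient before handing it to its local optimizer. If an average over workers were wanted instead, the summed tensor would have to be divided by the world size. A hedged sketch of that variant, reusing the same cupy.cuda.nccl calls as the class above and assuming an already-initialized communicator and a float32 CuPy array:

# Sketch, not part of the commit: same allReduce call as above, followed by a local
# divide so each worker applies the mean gradient rather than the sum.
import cupy as cp
from cupy.cuda import nccl

def allreduce_mean(communicator, tensor, world_size):
    communicator.allReduce(
        tensor.data.ptr,            # send buffer
        tensor.data.ptr,            # receive buffer (in place)
        tensor.size,
        nccl.NCCL_FLOAT32,
        nccl.NCCL_SUM,              # elementwise sum across ranks
        cp.cuda.Stream.null.ptr,
    )
    tensor /= world_size            # turn the sum into an average
    return tensor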
@@ -128,6 +128,7 @@ class ConfigSchema(BaseModel):
     verbose=("Display more information for debugging purposes", "flag", "VV", bool),
     use_gpu=("Use GPU", "option", "g", int),
     num_workers=("Parallel Workers", "option", "j", int),
+    strategy=("Distributed training strategy", "option", "strat", str),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
     # fmt: on
@@ -142,6 +143,7 @@ def train_cli(
     verbose=False,
     use_gpu=-1,
     num_workers=1,
+    strategy="ps",
     tag_map_path=None,
     omit_extra_lookups=False,
 ):
@@ -198,6 +200,7 @@ def train_cli(
         from spacy.cli.ray_utils import RayOptimizer
         import ray
         ray.init()
+        if strategy == "ps":
             remote_train = ray.remote(setup_and_train)
             if use_gpu >= 0:
                 msg.info("Enabling GPU with Ray")
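Taken together with the next hunk, this turns the single Ray code path into a dispatch on `strategy`. A condensed sketch of the resulting control flow inside `train_cli` (bodies elided with "..."; indentation is illustrative rather than copied from the file):

# Condensed view of the dispatch the two CLI hunks produce.
if strategy == "ps":
    ...  # existing path: ray.remote(setup_and_train) plus the RayOptimizer parameter server
elif strategy == "allreduce" and use_gpu >= 0:
    ...  # new path: one RayWorker actor per GPU, gradients synced by AllreduceOptimizer
else:
    raise NotImplementedError  # any other strategy value is rejected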
@@ -209,6 +212,23 @@ def train_cli(
                 train_args,
                 rank=rank,
                 total_workers=num_workers) for rank in range(num_workers)])
+        elif strategy == "allreduce" and use_gpu >= 0:
+            from spacy.cli.ray_utils import RayWorker, AllreduceOptimizer
+            msg.info("Enabling GPU with Ray")
+            RemoteRayWorker = ray.remote(RayWorker).options(num_gpus=1)
+
+            workers = [RemoteRayWorker.remote(rank, num_workers) for rank in range(num_workers)]
+            head_id = ray.get(workers[0].get_unique_id.remote())
+            ray.get([w.initialize.remote(head_id) for w in workers])
+            def train_fn(worker):
+                optimizer = AllreduceOptimizer(config_path, worker.communicator)
+                train_args["remote_optimizer"] = optimizer
+                return setup_and_train(True, train_args, worker.rank, worker.world_size)
+            ray.get([w.execute.remote(train_fn) for w in workers])
+        else:
+            raise NotImplementedError
+
+
     else:
         setup_and_train(use_gpu, train_args)
 
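One detail of the allreduce branch worth spelling out: the driver never reads `worker.communicator` or `worker.rank` directly, because the attributes of a Ray actor are not accessible from outside the actor process. Instead `train_fn` is shipped to each actor via `RayWorker.execute`, which simply calls the function with the actor instance, so the NCCL communicator created in `initialize` stays inside the process that owns the GPU. A minimal sketch of the same pattern with a toy actor (the `Toy` class is hypothetical, not part of the diff):

# Sketch of the execute(fn) pattern used by RayWorker; requires only ray.
import ray

@ray.remote
class Toy:
    def __init__(self, rank):
        self.rank = rank           # actor-local state; not readable from the driver

    def execute(self, fn):
        return fn(self)            # run the shipped function inside the actor

ray.init()
workers = [Toy.remote(rank) for rank in range(2)]
# The function is serialized, sent to each actor, and executed with access to its state.
print(ray.get([w.execute.remote(lambda actor: actor.rank * 10) for w in workers]))  # [0, 10]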