mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:16:06 +08:00
[Java] Remove non-raylet code in Java. (#2828)
This commit is contained in:
@@ -18,9 +18,8 @@ import org.ray.spi.RemoteFunctionManager;
|
||||
import org.ray.spi.StateStoreProxy;
|
||||
import org.ray.spi.impl.DefaultLocalSchedulerClient;
|
||||
import org.ray.spi.impl.NativeRemoteFunctionManager;
|
||||
import org.ray.spi.impl.NonRayletStateStoreProxyImpl;
|
||||
import org.ray.spi.impl.RayletStateStoreProxyImpl;
|
||||
import org.ray.spi.impl.RedisClient;
|
||||
import org.ray.spi.impl.StateStoreProxyImpl;
|
||||
import org.ray.spi.model.AddressInfo;
|
||||
import org.ray.util.logger.RayLog;
|
||||
|
||||
@@ -53,19 +52,14 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
throw new Error("Redis address must be configured under Worker mode.");
|
||||
}
|
||||
startOnebox(params, pathConfig);
|
||||
initStateStore(params.redis_address, params.use_raylet);
|
||||
initStateStore(params.redis_address);
|
||||
} else {
|
||||
initStateStore(params.redis_address, params.use_raylet);
|
||||
initStateStore(params.redis_address);
|
||||
if (!isWorker) {
|
||||
List<AddressInfo> nodes = stateStoreProxy.getAddressInfo(
|
||||
params.node_ip_address, params.redis_address, 5);
|
||||
params.object_store_name = nodes.get(0).storeName;
|
||||
if (!params.use_raylet) {
|
||||
params.object_store_manager_name = nodes.get(0).managerName;
|
||||
params.local_scheduler_name = nodes.get(0).schedulerName;
|
||||
} else {
|
||||
params.raylet_socket_name = nodes.get(0).rayletSocketName;
|
||||
}
|
||||
params.raylet_socket_name = nodes.get(0).rayletSocketName;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,54 +85,29 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
}
|
||||
|
||||
if (params.worker_mode != WorkerMode.NONE) {
|
||||
String overwrites = "";
|
||||
// initialize the links
|
||||
int releaseDelay = AbstractRayRuntime.configReader
|
||||
.getIntegerValue("ray", "plasma_default_release_delay", 0,
|
||||
"how many release requests should be delayed in plasma client");
|
||||
|
||||
if (!params.use_raylet) {
|
||||
ObjectStoreLink plink = new PlasmaClient(params.object_store_name,
|
||||
params.object_store_manager_name, releaseDelay);
|
||||
ObjectStoreLink plink = new PlasmaClient(params.object_store_name, "", releaseDelay);
|
||||
LocalSchedulerLink slink = new DefaultLocalSchedulerClient(
|
||||
params.raylet_socket_name,
|
||||
WorkerContext.currentWorkerId(),
|
||||
isWorker,
|
||||
WorkerContext.currentTask().taskId
|
||||
);
|
||||
|
||||
LocalSchedulerLink slink = new DefaultLocalSchedulerClient(
|
||||
params.local_scheduler_name,
|
||||
WorkerContext.currentWorkerId(),
|
||||
isWorker,
|
||||
WorkerContext.currentTask().taskId,
|
||||
false
|
||||
);
|
||||
init(slink, plink, funcMgr, pathConfig);
|
||||
|
||||
init(slink, plink, funcMgr, pathConfig);
|
||||
// register
|
||||
registerWorker(isWorker, params.node_ip_address, params.object_store_name,
|
||||
params.raylet_socket_name);
|
||||
|
||||
// register
|
||||
registerWorker(isWorker, params.node_ip_address, params.object_store_name,
|
||||
params.object_store_manager_name, params.local_scheduler_name);
|
||||
} else {
|
||||
|
||||
ObjectStoreLink plink = new PlasmaClient(params.object_store_name, "", releaseDelay);
|
||||
|
||||
LocalSchedulerLink slink = new DefaultLocalSchedulerClient(
|
||||
params.raylet_socket_name,
|
||||
WorkerContext.currentWorkerId(),
|
||||
isWorker,
|
||||
WorkerContext.currentTask().taskId,
|
||||
true
|
||||
);
|
||||
|
||||
init(slink, plink, funcMgr, pathConfig);
|
||||
|
||||
// register
|
||||
registerWorker(isWorker, params.node_ip_address, params.object_store_name,
|
||||
params.raylet_socket_name);
|
||||
}
|
||||
}
|
||||
|
||||
RayLog.core.info("RayNativeRuntime start with "
|
||||
+ "store " + params.object_store_name
|
||||
+ ", manager " + params.object_store_manager_name
|
||||
+ ", scheduler " + params.local_scheduler_name
|
||||
);
|
||||
RayLog.core.info("RayNativeRuntime started with store {}, raylet {}",
|
||||
params.object_store_name, params.raylet_socket_name);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -155,19 +124,14 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
|
||||
params.redis_address = manager.info().redisAddress;
|
||||
params.object_store_name = manager.info().localStores.get(0).storeName;
|
||||
params.object_store_manager_name = manager.info().localStores.get(0).managerName;
|
||||
params.local_scheduler_name = manager.info().localStores.get(0).schedulerName;
|
||||
params.raylet_socket_name = manager.info().localStores.get(0).rayletSocketName;
|
||||
//params.node_ip_address = NetworkUtil.getIpAddress();
|
||||
}
|
||||
|
||||
private void initStateStore(String redisAddress, boolean useRaylet) throws Exception {
|
||||
private void initStateStore(String redisAddress) throws Exception {
|
||||
kvStore = new RedisClient();
|
||||
kvStore.setAddr(redisAddress);
|
||||
stateStoreProxy = useRaylet
|
||||
? new RayletStateStoreProxyImpl(kvStore)
|
||||
: new NonRayletStateStoreProxyImpl(kvStore);
|
||||
//stateStoreProxy.setStore(kvStore);
|
||||
stateStoreProxy = new StateStoreProxyImpl(kvStore);
|
||||
stateStoreProxy.initializeGlobalState();
|
||||
}
|
||||
|
||||
@@ -193,27 +157,4 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
}
|
||||
}
|
||||
|
||||
private void registerWorker(boolean isWorker, String nodeIpAddress, String storeName,
|
||||
String managerName, String schedulerName) {
|
||||
Map<String, String> workerInfo = new HashMap<>();
|
||||
String workerId = new String(WorkerContext.currentWorkerId().getBytes());
|
||||
if (!isWorker) {
|
||||
workerInfo.put("node_ip_address", nodeIpAddress);
|
||||
workerInfo.put("driver_id", workerId);
|
||||
workerInfo.put("start_time", String.valueOf(System.currentTimeMillis()));
|
||||
workerInfo.put("plasma_store_socket", storeName);
|
||||
workerInfo.put("plasma_manager_socket", managerName);
|
||||
workerInfo.put("local_scheduler_socket", schedulerName);
|
||||
workerInfo.put("name", System.getProperty("user.dir"));
|
||||
//TODO: worker.redis_client.hmset(b"Drivers:" + worker.workerId, driver_info)
|
||||
kvStore.hmset("Drivers:" + workerId, workerInfo);
|
||||
} else {
|
||||
workerInfo.put("node_ip_address", nodeIpAddress);
|
||||
workerInfo.put("plasma_store_socket", storeName);
|
||||
workerInfo.put("plasma_manager_socket", managerName);
|
||||
workerInfo.put("local_scheduler_socket", schedulerName);
|
||||
//TODO: b"Workers:" + worker.workerId,
|
||||
kvStore.hmset("Workers:" + workerId, workerInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,8 +34,12 @@ public class RunInfo {
|
||||
}
|
||||
|
||||
public enum ProcessType {
|
||||
PT_WORKER, PT_LOCAL_SCHEDULER, PT_PLASMA_MANAGER, PT_PLASMA_STORE,
|
||||
PT_GLOBAL_SCHEDULER, PT_REDIS_SERVER, PT_WEB_UI, PT_RAYLET,
|
||||
PT_WORKER,
|
||||
PT_PLASMA_STORE,
|
||||
PT_REDIS_SERVER,
|
||||
PT_WEB_UI,
|
||||
PT_RAYLET,
|
||||
PT_DRIVER
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -29,8 +29,6 @@ import redis.clients.jedis.Jedis;
|
||||
*/
|
||||
public class RunManager {
|
||||
|
||||
public static final int INT16_MAX = 32767;
|
||||
|
||||
private static final DateTimeFormatter DATE_TIME_FORMATTER =
|
||||
DateTimeFormatter.ofPattern("Y-m-d_H-M-S");
|
||||
|
||||
@@ -71,13 +69,7 @@ public class RunManager {
|
||||
if (params.num_redis_shards <= 0) {
|
||||
params.num_redis_shards = 1;
|
||||
}
|
||||
if (params.num_local_schedulers <= 0) {
|
||||
params.num_local_schedulers = 1;
|
||||
}
|
||||
|
||||
params.start_workers_from_local_scheduler = params.run_mode != RunMode.SINGLE_BOX;
|
||||
|
||||
params.include_global_scheduler = true;
|
||||
params.start_redis_shards = true;
|
||||
|
||||
startRayProcesses();
|
||||
@@ -90,12 +82,7 @@ public class RunManager {
|
||||
if (params.num_redis_shards != 0) {
|
||||
throw new Exception("Number of redis shards should be zero in non-head node.");
|
||||
}
|
||||
if (params.num_local_schedulers <= 0) {
|
||||
params.num_local_schedulers = 1;
|
||||
}
|
||||
|
||||
//params.start_workers_from_local_scheduler = true;
|
||||
params.include_global_scheduler = false;
|
||||
params.start_redis_shards = false;
|
||||
|
||||
startRayProcesses();
|
||||
@@ -281,120 +268,34 @@ public class RunManager {
|
||||
}
|
||||
redisClient.close();
|
||||
|
||||
// start global scheduler
|
||||
if (params.include_global_scheduler && !params.use_raylet) {
|
||||
startGlobalScheduler(
|
||||
params.redis_address, params.node_ip_address, params.redirect, params.cleanup);
|
||||
}
|
||||
|
||||
// prepare parameters for node processes
|
||||
if (params.num_cpus.length == 0) {
|
||||
params.num_cpus = new int[params.num_local_schedulers];
|
||||
for (int i = 0; i < params.num_local_schedulers; i++) {
|
||||
params.num_cpus[i] = 1;
|
||||
}
|
||||
} else {
|
||||
assert (params.num_cpus.length == params.num_local_schedulers);
|
||||
}
|
||||
|
||||
if (params.num_gpus.length == 0) {
|
||||
params.num_gpus = new int[params.num_local_schedulers];
|
||||
for (int i = 0; i < params.num_local_schedulers; i++) {
|
||||
params.num_gpus[i] = 0;
|
||||
}
|
||||
} else {
|
||||
assert (params.num_gpus.length == params.num_local_schedulers);
|
||||
}
|
||||
|
||||
int[] localNumWorkers = new int[params.num_local_schedulers];
|
||||
if (params.num_workers == 0) {
|
||||
System.arraycopy(params.num_cpus, 0, localNumWorkers, 0, params.num_local_schedulers);
|
||||
} else {
|
||||
for (int i = 0; i < params.num_local_schedulers; i++) {
|
||||
localNumWorkers[i] = params.num_workers;
|
||||
}
|
||||
}
|
||||
|
||||
AddressInfo info = new AddressInfo();
|
||||
|
||||
if (params.use_raylet) {
|
||||
// Start object store
|
||||
int rpcPort = params.object_store_rpc_port;
|
||||
String storeName = "/tmp/plasma_store" + rpcPort;
|
||||
// Start object store
|
||||
int rpcPort = params.object_store_rpc_port;
|
||||
String storeName = "/tmp/plasma_store" + rpcPort;
|
||||
|
||||
startObjectStore(0, info,
|
||||
params.redis_address, params.node_ip_address, params.redirect, params.cleanup);
|
||||
|
||||
Map<String, Double> staticResources =
|
||||
ResourceUtil.getResourcesMapFromString(params.static_resources);
|
||||
|
||||
//Start raylet
|
||||
startRaylet(storeName, info, params.num_workers,
|
||||
params.redis_address,
|
||||
params.node_ip_address, params.redirect, staticResources, params.cleanup);
|
||||
|
||||
runInfo.localStores.add(info);
|
||||
} else {
|
||||
for (int i = 0; i < params.num_local_schedulers; i++) {
|
||||
// Start object stores
|
||||
startObjectStore(i, info,
|
||||
startObjectStore(0, info,
|
||||
params.redis_address, params.node_ip_address, params.redirect, params.cleanup);
|
||||
|
||||
startObjectManager(i, info,
|
||||
Map<String, Double> staticResources =
|
||||
ResourceUtil.getResourcesMapFromString(params.static_resources);
|
||||
|
||||
//Start raylet
|
||||
startRaylet(storeName, info, params.num_workers,
|
||||
params.redis_address,
|
||||
params.node_ip_address, params.redirect, params.cleanup);
|
||||
params.node_ip_address, params.redirect, staticResources, params.cleanup);
|
||||
|
||||
// Start local scheduler
|
||||
int workerCount = 0;
|
||||
runInfo.localStores.add(info);
|
||||
|
||||
if (params.start_workers_from_local_scheduler) {
|
||||
workerCount = localNumWorkers[i];
|
||||
localNumWorkers[i] = 0;
|
||||
}
|
||||
|
||||
startLocalScheduler(i, info,
|
||||
params.num_cpus[i], params.num_gpus[i], workerCount,
|
||||
params.redis_address,
|
||||
params.node_ip_address, params.redirect, params.cleanup);
|
||||
|
||||
runInfo.localStores.add(info);
|
||||
}
|
||||
}
|
||||
|
||||
// start local workers
|
||||
if (!params.use_raylet) {
|
||||
for (int i = 0; i < params.num_local_schedulers; i++) {
|
||||
AddressInfo localStores = runInfo.localStores.get(i);
|
||||
localStores.workerCount = localNumWorkers[i];
|
||||
for (int j = 0; j < localNumWorkers[i]; j++) {
|
||||
startWorker(localStores.storeName, localStores.managerName, localStores.schedulerName,
|
||||
"/worker" + i + "." + j, params.redis_address,
|
||||
params.node_ip_address, UniqueId.NIL, "", params.redirect, params.cleanup);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HashSet<RunInfo.ProcessType> excludeTypes = new HashSet<>();
|
||||
if (!params.use_raylet) {
|
||||
excludeTypes.add(RunInfo.ProcessType.PT_RAYLET);
|
||||
} else {
|
||||
excludeTypes.add(RunInfo.ProcessType.PT_LOCAL_SCHEDULER);
|
||||
excludeTypes.add(RunInfo.ProcessType.PT_GLOBAL_SCHEDULER);
|
||||
excludeTypes.add(RunInfo.ProcessType.PT_PLASMA_MANAGER);
|
||||
}
|
||||
if (!checkAlive(excludeTypes)) {
|
||||
if (!checkAlive()) {
|
||||
cleanup(true);
|
||||
throw new RuntimeException("Start Ray processes failed");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean checkAlive(HashSet<RunInfo.ProcessType> excludeTypes) {
|
||||
private boolean checkAlive() {
|
||||
RunInfo.ProcessType[] types = RunInfo.ProcessType.values();
|
||||
for (int i = 0; i < types.length; i++) {
|
||||
if (excludeTypes.contains(types[i])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ProcessInfo p;
|
||||
for (int j = 0; j < runInfo.allProcesses.get(i).size(); ) {
|
||||
p = runInfo.allProcesses.get(i).get(j);
|
||||
@@ -513,96 +414,6 @@ public class RunManager {
|
||||
return ip + ":" + port;
|
||||
}
|
||||
|
||||
private void startGlobalScheduler(String redisAddress, String ip,
|
||||
boolean redirect, boolean cleanup) {
|
||||
String filePath = paths.global_scheduler;
|
||||
String cmd = filePath + " -r " + redisAddress + " -h " + ip;
|
||||
|
||||
Map<String, String> env = null;
|
||||
startProcess(cmd.split(" "), env, RunInfo.ProcessType.PT_GLOBAL_SCHEDULER, "global_scheduler",
|
||||
redisAddress,
|
||||
ip, redirect, cleanup);
|
||||
}
|
||||
|
||||
/*
|
||||
* @param storeName The name of the plasma store socket to connect to
|
||||
*
|
||||
* @param storeManagerName The name of the plasma manager socket to connect
|
||||
* to
|
||||
*
|
||||
* @param storeManagerAddress the address of the plasma manager to connect
|
||||
* to
|
||||
*
|
||||
* @param workerPath The path of the script to use when the local scheduler
|
||||
* starts up new workers
|
||||
*
|
||||
* @param numCpus The number of CPUs the local scheduler should be
|
||||
* configured with
|
||||
*
|
||||
* @param numGpus The number of GPUs the local scheduler should be
|
||||
* configured with
|
||||
*
|
||||
* @param numWorkers The number of workers that the local scheduler should
|
||||
* start
|
||||
*/
|
||||
private void startLocalScheduler(int index, AddressInfo info, int numCpus,
|
||||
int numGpus, int numWorkers,
|
||||
String redisAddress, String ip, boolean redirect,
|
||||
boolean cleanup) {
|
||||
//if (numCpus <= 0)
|
||||
// numCpus = Runtime.getRuntime().availableProcessors();
|
||||
if (numGpus <= 0) {
|
||||
numGpus = 0;
|
||||
}
|
||||
|
||||
String filePath = paths.local_scheduler;
|
||||
int rpcPort = params.local_scheduler_rpc_port + index;
|
||||
String name = "/tmp/scheduler" + rpcPort;
|
||||
String rpcAddr = "";
|
||||
String cmd = filePath + " -s " + name + " -p " + info.storeName + " -h " + ip + " -n "
|
||||
+ numWorkers + " -c " + "CPU," + INT16_MAX + ",GPU,0";
|
||||
|
||||
assert (info.managerName.length() > 0);
|
||||
assert (info.storeName.length() > 0);
|
||||
assert (redisAddress.length() > 0);
|
||||
|
||||
cmd += " -m " + info.managerName;
|
||||
|
||||
String workerCmd = null;
|
||||
workerCmd = buildWorkerCommand(true, info.storeName, info.managerName, name,
|
||||
UniqueId.NIL, "", ip, redisAddress);
|
||||
cmd += " -w \"" + workerCmd + "\"";
|
||||
|
||||
if (redisAddress.length() > 0) {
|
||||
cmd += " -r " + redisAddress;
|
||||
}
|
||||
if (info.managerPort > 0) {
|
||||
cmd += " -a " + params.node_ip_address + ":" + info.managerPort;
|
||||
}
|
||||
|
||||
Map<String, String> env = null;
|
||||
String[] cmds = StringUtil.split(cmd, " ", "\"", "\"").toArray(new String[0]);
|
||||
Process p = startProcess(cmds, env, RunInfo.ProcessType.PT_LOCAL_SCHEDULER,
|
||||
"local_scheduler", redisAddress, ip, redirect, cleanup);
|
||||
|
||||
if (p != null && p.isAlive()) {
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
if (p == null || !p.isAlive()) {
|
||||
info.schedulerName = "";
|
||||
info.schedulerRpcAddr = "";
|
||||
throw new RuntimeException("Start local scheduler failed ...");
|
||||
} else {
|
||||
info.schedulerName = name;
|
||||
info.schedulerRpcAddr = rpcAddr;
|
||||
}
|
||||
}
|
||||
|
||||
private void startRaylet(String storeName, AddressInfo info, int numWorkers,
|
||||
String redisAddress, String ip, boolean redirect,
|
||||
Map<String, Double> staticResources, boolean cleanup) {
|
||||
@@ -658,39 +469,7 @@ public class RunManager {
|
||||
String ip, String redisAddress) {
|
||||
String workerConfigs = "ray.java.start.object_store_name=" + storeName
|
||||
+ ";ray.java.start.raylet_socket_name=" + rayletSocketName
|
||||
+ ";ray.java.start.worker_mode=WORKER;ray.java.start.use_raylet=true";
|
||||
workerConfigs += ";ray.java.start.deploy=" + params.deploy;
|
||||
if (!actorId.equals(UniqueId.NIL)) {
|
||||
workerConfigs += ";ray.java.start.actor_id=" + actorId;
|
||||
}
|
||||
if (!actorClass.equals("")) {
|
||||
workerConfigs += ";ray.java.start.driver_class=" + actorClass;
|
||||
}
|
||||
|
||||
String jvmArgs = "";
|
||||
jvmArgs += " -Dlogging.path=" + params.log_dir;
|
||||
jvmArgs += " -Dlogging.file.name=core-*pid_suffix*";
|
||||
|
||||
return buildJavaProcessCommand(
|
||||
RunInfo.ProcessType.PT_WORKER,
|
||||
"org.ray.runner.worker.DefaultWorker",
|
||||
"",
|
||||
workerConfigs,
|
||||
jvmArgs,
|
||||
ip,
|
||||
redisAddress,
|
||||
null
|
||||
);
|
||||
}
|
||||
|
||||
private String buildWorkerCommand(boolean isFromLocalScheduler, String storeName,
|
||||
String storeManagerName, String localSchedulerName,
|
||||
UniqueId actorId, String actorClass, String
|
||||
ip, String redisAddress) {
|
||||
String workerConfigs = "ray.java.start.object_store_name=" + storeName
|
||||
+ ";ray.java.start.object_store_manager_name=" + storeManagerName
|
||||
+ ";ray.java.start.worker_mode=WORKER"
|
||||
+ ";ray.java.start.local_scheduler_name=" + localSchedulerName;
|
||||
+ ";ray.java.start.worker_mode=WORKER";
|
||||
workerConfigs += ";ray.java.start.deploy=" + params.deploy;
|
||||
if (!actorId.equals(UniqueId.NIL)) {
|
||||
workerConfigs += ";ray.java.start.actor_id=" + actorId;
|
||||
@@ -747,47 +526,4 @@ public class RunManager {
|
||||
}
|
||||
}
|
||||
|
||||
private AddressInfo startObjectManager(int index, AddressInfo info,
|
||||
String redisAddress, String ip, boolean redirect,
|
||||
boolean cleanup) {
|
||||
String filePath = paths.store_manager;
|
||||
int rpcPort = params.object_store_manager_rpc_port + index;
|
||||
String name = "/tmp/plasma_manager" + rpcPort;
|
||||
String rpcAddr = "";
|
||||
|
||||
String cmd = filePath + " -s " + info.storeName + " -m " + name + " -h " + ip + " -p "
|
||||
+ (params.object_store_manager_ray_listen_port + index)
|
||||
+ " -r " + redisAddress;
|
||||
|
||||
Map<String, String> env = null;
|
||||
Process p = startProcess(cmd.split(" "), env, RunInfo.ProcessType.PT_PLASMA_MANAGER,
|
||||
"object_manager", redisAddress, ip, redirect, cleanup);
|
||||
|
||||
if (p != null && p.isAlive()) {
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
if (p == null || !p.isAlive()) {
|
||||
throw new RuntimeException("Start object manager failed ...");
|
||||
} else {
|
||||
info.managerName = name;
|
||||
info.managerPort = params.object_store_manager_ray_listen_port + index;
|
||||
info.managerRpcAddr = rpcAddr;
|
||||
return info;
|
||||
}
|
||||
}
|
||||
|
||||
public void startWorker(String storeName, String storeManagerName,
|
||||
String localSchedulerName, String workerName, String redisAddress,
|
||||
String ip, UniqueId actorId, String actorClass,
|
||||
boolean redirect, boolean cleanup) {
|
||||
String cmd = buildWorkerCommand(false, storeName, storeManagerName, localSchedulerName, actorId,
|
||||
actorClass, ip, redisAddress);
|
||||
startProcess(cmd.split(" "), null, RunInfo.ProcessType.PT_WORKER, workerName, redisAddress, ip,
|
||||
redirect, cleanup);
|
||||
}
|
||||
}
|
||||
|
||||
+23
-43
@@ -7,6 +7,8 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.ray.api.RayObject;
|
||||
import org.ray.api.WaitResult;
|
||||
import org.ray.api.id.UniqueId;
|
||||
import org.ray.core.AbstractRayRuntime;
|
||||
import org.ray.core.UniqueIdHelper;
|
||||
@@ -28,48 +30,38 @@ public class DefaultLocalSchedulerClient implements LocalSchedulerLink {
|
||||
return bb;
|
||||
});
|
||||
private long client = 0;
|
||||
boolean useRaylet = false;
|
||||
|
||||
public DefaultLocalSchedulerClient(String schedulerSockName, UniqueId clientId,
|
||||
boolean isWorker, UniqueId driverId, boolean useRaylet) {
|
||||
boolean isWorker, UniqueId driverId) {
|
||||
client = nativeInit(schedulerSockName, clientId.getBytes(),
|
||||
isWorker, driverId.getBytes(), useRaylet);
|
||||
this.useRaylet = useRaylet;
|
||||
isWorker, driverId.getBytes());
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<byte[]> wait(byte[][] objectIds, int timeoutMs, int numReturns) {
|
||||
assert (useRaylet == true);
|
||||
public <T> WaitResult<T> wait(List<RayObject<T>> waitFor, int numReturns, int timeoutMs) {
|
||||
List<UniqueId> ids = new ArrayList<>();
|
||||
for (RayObject<T> element : waitFor) {
|
||||
ids.add(element.getId());
|
||||
}
|
||||
|
||||
boolean[] readys = nativeWaitObject(client, objectIds, numReturns, timeoutMs, false);
|
||||
assert (readys.length == objectIds.length);
|
||||
boolean[] ready = nativeWaitObject(client, getIdBytes(ids), numReturns, timeoutMs, false);
|
||||
List<RayObject<T>> readyList = new ArrayList<>();
|
||||
List<RayObject<T>> unreadyList = new ArrayList<>();
|
||||
|
||||
List<byte[]> ret = new ArrayList<>();
|
||||
for (int i = 0; i < readys.length; i++) {
|
||||
if (readys[i]) {
|
||||
ret.add(objectIds[i]);
|
||||
for (int i = 0; i < ready.length; i++) {
|
||||
if (ready[i]) {
|
||||
readyList.add(waitFor.get(i));
|
||||
} else {
|
||||
unreadyList.add(waitFor.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
return new WaitResult<>(readyList, unreadyList);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void submitTask(TaskSpec task) {
|
||||
RayLog.core.debug("Submitting task: {}", task);
|
||||
// We don't support resources management in non raylet mode.
|
||||
if (!useRaylet) {
|
||||
task.resources.clear();
|
||||
task.resources.put(ResourceUtil.CPU_LITERAL, 0.0);
|
||||
} else {
|
||||
if (!task.resources.containsKey(ResourceUtil.CPU_LITERAL)) {
|
||||
task.resources.put(ResourceUtil.CPU_LITERAL, 0.0);
|
||||
}
|
||||
|
||||
if (!task.resources.containsKey(ResourceUtil.GPU_LITERAL)) {
|
||||
task.resources.put(ResourceUtil.GPU_LITERAL, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
ByteBuffer info = taskSpec2Info(task);
|
||||
byte[] cursorId = null;
|
||||
@@ -77,29 +69,17 @@ public class DefaultLocalSchedulerClient implements LocalSchedulerLink {
|
||||
cursorId = task.cursorId.getBytes();
|
||||
}
|
||||
|
||||
nativeSubmitTask(client, cursorId, info, info.position(), info.remaining(), useRaylet);
|
||||
nativeSubmitTask(client, cursorId, info, info.position(), info.remaining());
|
||||
}
|
||||
|
||||
@Override
|
||||
public TaskSpec getTask() {
|
||||
byte[] bytes = nativeGetTask(client, useRaylet);
|
||||
byte[] bytes = nativeGetTask(client);
|
||||
assert (null != bytes);
|
||||
ByteBuffer bb = ByteBuffer.wrap(bytes);
|
||||
return taskInfo2Spec(bb);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void markTaskPutDependency(UniqueId taskId, UniqueId objectId) {
|
||||
nativePutObject(client, taskId.getBytes(), objectId.getBytes());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reconstructObject(UniqueId objectId, boolean fetchOnly) {
|
||||
List<UniqueId> objects = new ArrayList<>();
|
||||
objects.add(objectId);
|
||||
nativeReconstructObjects(client, getIdBytes(objects), fetchOnly);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reconstructObjects(List<UniqueId> objectIds, boolean fetchOnly) {
|
||||
if (RayLog.core.isInfoEnabled()) {
|
||||
@@ -289,13 +269,13 @@ public class DefaultLocalSchedulerClient implements LocalSchedulerLink {
|
||||
/// 6) popd
|
||||
|
||||
private static native long nativeInit(String localSchedulerSocket, byte[] workerId,
|
||||
boolean isWorker, byte[] driverTaskId, boolean useRaylet);
|
||||
boolean isWorker, byte[] driverTaskId);
|
||||
|
||||
private static native void nativeSubmitTask(long client, byte[] cursorId, ByteBuffer taskBuff,
|
||||
int pos, int taskSize, boolean useRaylet);
|
||||
int pos, int taskSize);
|
||||
|
||||
// return TaskInfo (in FlatBuffer)
|
||||
private static native byte[] nativeGetTask(long client, boolean useRaylet);
|
||||
private static native byte[] nativeGetTask(long client);
|
||||
|
||||
private static native void nativeDestroy(long client);
|
||||
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
package org.ray.spi.impl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.ray.spi.KeyValueStoreLink;
|
||||
import org.ray.spi.model.AddressInfo;
|
||||
|
||||
/**
|
||||
* A class used to interface with the Ray control state for non-raylet.
|
||||
*/
|
||||
public class NonRayletStateStoreProxyImpl extends BaseStateStoreProxyImpl {
|
||||
public NonRayletStateStoreProxyImpl(KeyValueStoreLink rayKvStore) {
|
||||
super(rayKvStore);
|
||||
}
|
||||
|
||||
/*
|
||||
* get address info of one node from primary redis
|
||||
* @param: node ip address, usually local ip address
|
||||
* @return: a list of SchedulerInfo which contains storeName, managerName, managerPort and
|
||||
* schedulerName
|
||||
* @note:Redis data key is "CL:*", redis data value is a hash.
|
||||
* The hash contains the following:
|
||||
* "deleted" : 0/1
|
||||
* "ray_client_id"
|
||||
* "node_ip_address"
|
||||
* "client_type" : plasma_manager/local_scheduler
|
||||
* "store_socket_name"(op)
|
||||
* "manager_socket_name"(op)
|
||||
* "local_scheduler_socket_name"(op)
|
||||
*/
|
||||
@Override
|
||||
public List<AddressInfo> doGetAddressInfo(final String nodeIpAddress,
|
||||
final String redisAddress) throws Exception {
|
||||
if (this.rayKvStore == null) {
|
||||
throw new Exception("no redis client when use doGetAddressInfo");
|
||||
}
|
||||
List<AddressInfo> schedulerInfo = new ArrayList<>();
|
||||
|
||||
Set<byte[]> cks = rayKvStore.keys("CL:*".getBytes());
|
||||
byte[] key;
|
||||
List<Map<byte[], byte[]>> plasmaManager = new ArrayList<>();
|
||||
List<Map<byte[], byte[]>> localScheduler = new ArrayList<>();
|
||||
for (byte[] ck : cks) {
|
||||
key = ck;
|
||||
Map<byte[], byte[]> info = rayKvStore.hgetAll(key);
|
||||
|
||||
String deleted = charsetDecode(info.get("deleted".getBytes()), "US-ASCII");
|
||||
if (deleted != null) {
|
||||
if (Boolean.getBoolean(deleted)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!info.containsKey("ray_client_id".getBytes())) {
|
||||
throw new Exception("no ray_client_id in any client");
|
||||
} else if (!info.containsKey("node_ip_address".getBytes())) {
|
||||
throw new Exception("no node_ip_address in any client");
|
||||
} else if (!info.containsKey("client_type".getBytes())) {
|
||||
throw new Exception("no client_type in any client");
|
||||
}
|
||||
|
||||
if (charsetDecode(info.get("node_ip_address".getBytes()), "US-ASCII")
|
||||
.equals(nodeIpAddress)) {
|
||||
String clientType = charsetDecode(info.get("client_type".getBytes()), "US-ASCII");
|
||||
if ("plasma_manager".equals(clientType)) {
|
||||
plasmaManager.add(info);
|
||||
} else if ("local_scheduler".equals(clientType)) {
|
||||
localScheduler.add(info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (plasmaManager.size() < 1 || localScheduler.size() < 1) {
|
||||
throw new Exception("no plasma_manager or local_scheduler");
|
||||
} else if (plasmaManager.size() != localScheduler.size()) {
|
||||
throw new Exception("plasma_manager number not Equal local_scheduler number");
|
||||
}
|
||||
|
||||
for (int i = 0; i < plasmaManager.size(); i++) {
|
||||
AddressInfo si = new AddressInfo();
|
||||
si.storeName = charsetDecode(plasmaManager.get(i).get("store_socket_name".getBytes()),
|
||||
"US-ASCII");
|
||||
si.managerName = charsetDecode(plasmaManager.get(i).get("manager_socket_name".getBytes()),
|
||||
"US-ASCII");
|
||||
|
||||
byte[] rpc = plasmaManager.get(i).get("manager_rpc_name".getBytes());
|
||||
if (rpc != null) {
|
||||
si.managerRpcAddr = charsetDecode(rpc, "US-ASCII");
|
||||
}
|
||||
|
||||
rpc = plasmaManager.get(i).get("store_rpc_name".getBytes());
|
||||
if (rpc != null) {
|
||||
si.storeRpcAddr = charsetDecode(rpc, "US-ASCII");
|
||||
}
|
||||
|
||||
String managerAddr = charsetDecode(plasmaManager.get(i).get("manager_address".getBytes()),
|
||||
"US-ASCII");
|
||||
si.managerPort = Integer.parseInt(managerAddr.split(":")[1]);
|
||||
si.schedulerName = charsetDecode(
|
||||
localScheduler.get(i).get("local_scheduler_socket_name".getBytes()), "US-ASCII");
|
||||
|
||||
rpc = localScheduler.get(i).get("local_scheduler_rpc_name".getBytes());
|
||||
if (rpc != null) {
|
||||
si.schedulerRpcAddr = charsetDecode(rpc, "US-ASCII");
|
||||
}
|
||||
|
||||
schedulerInfo.add(si);
|
||||
}
|
||||
|
||||
return schedulerInfo;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,62 +0,0 @@
|
||||
package org.ray.spi.impl;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import org.ray.api.id.UniqueId;
|
||||
import org.ray.format.gcs.ClientTableData;
|
||||
import org.ray.spi.KeyValueStoreLink;
|
||||
import org.ray.spi.model.AddressInfo;
|
||||
import org.ray.util.NetworkUtil;
|
||||
|
||||
/**
|
||||
* A class used to interface with the Ray control state for raylet.
|
||||
*/
|
||||
public class RayletStateStoreProxyImpl extends BaseStateStoreProxyImpl {
|
||||
|
||||
public RayletStateStoreProxyImpl(KeyValueStoreLink rayKvStore) {
|
||||
super(rayKvStore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<AddressInfo> doGetAddressInfo(final String nodeIpAddress,
|
||||
final String redisAddress) throws Exception {
|
||||
if (this.rayKvStore == null) {
|
||||
throw new Exception("no redis client when use doGetAddressInfo");
|
||||
}
|
||||
List<AddressInfo> schedulerInfo = new ArrayList<>();
|
||||
|
||||
byte[] prefix = "CLIENT".getBytes();
|
||||
byte[] postfix = UniqueId.genNil().getBytes();
|
||||
byte[] clientKey = new byte[prefix.length + postfix.length];
|
||||
System.arraycopy(prefix, 0, clientKey, 0, prefix.length);
|
||||
System.arraycopy(postfix, 0, clientKey, prefix.length, postfix.length);
|
||||
|
||||
Set<byte[]> clients = rayKvStore.zrange(clientKey, 0, -1);
|
||||
|
||||
for (byte[] clientMessage : clients) {
|
||||
ByteBuffer bb = ByteBuffer.wrap(clientMessage);
|
||||
ClientTableData client = ClientTableData.getRootAsClientTableData(bb);
|
||||
String clientNodeIpAddress = client.nodeManagerAddress();
|
||||
|
||||
String localIpAddress = NetworkUtil.getIpAddress(null);
|
||||
String redisIpAddress = redisAddress.substring(0, redisAddress.indexOf(':'));
|
||||
|
||||
boolean headNodeAddress = "127.0.0.1".equals(clientNodeIpAddress)
|
||||
&& Objects.equals(redisIpAddress, localIpAddress);
|
||||
boolean notHeadNodeAddress = Objects.equals(clientNodeIpAddress, nodeIpAddress);
|
||||
|
||||
if (headNodeAddress || notHeadNodeAddress) {
|
||||
AddressInfo si = new AddressInfo();
|
||||
si.storeName = client.objectStoreSocketName();
|
||||
si.rayletSocketName = client.rayletSocketName();
|
||||
si.managerRpcAddr = client.nodeManagerAddress();
|
||||
si.managerPort = client.nodeManagerPort();
|
||||
schedulerInfo.add(si);
|
||||
}
|
||||
}
|
||||
return schedulerInfo;
|
||||
}
|
||||
}
|
||||
+51
-11
@@ -1,26 +1,30 @@
|
||||
package org.ray.spi.impl;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.ray.api.id.UniqueId;
|
||||
import org.ray.format.gcs.ClientTableData;
|
||||
import org.ray.spi.KeyValueStoreLink;
|
||||
import org.ray.spi.StateStoreProxy;
|
||||
import org.ray.spi.model.AddressInfo;
|
||||
import org.ray.util.NetworkUtil;
|
||||
import org.ray.util.logger.RayLog;
|
||||
|
||||
/**
|
||||
* Base class used to interface with the Ray control state.
|
||||
* A class used to interface with the Ray control state.
|
||||
*/
|
||||
public abstract class BaseStateStoreProxyImpl implements StateStoreProxy {
|
||||
public class StateStoreProxyImpl implements StateStoreProxy {
|
||||
|
||||
public KeyValueStoreLink rayKvStore;
|
||||
public ArrayList<KeyValueStoreLink> shardStoreList = new ArrayList<>();
|
||||
|
||||
public BaseStateStoreProxyImpl(KeyValueStoreLink rayKvStore) {
|
||||
public StateStoreProxyImpl(KeyValueStoreLink rayKvStore) {
|
||||
this.rayKvStore = rayKvStore;
|
||||
}
|
||||
|
||||
@@ -49,7 +53,7 @@ public abstract class BaseStateStoreProxyImpl implements StateStoreProxy {
|
||||
Set<String> distinctIpAddress = new HashSet<String>(ipAddressPorts);
|
||||
if (distinctIpAddress.size() != numRedisShards) {
|
||||
es = String.format("Expected %d Redis shard addresses, found2 %d.", numRedisShards,
|
||||
distinctIpAddress.size());
|
||||
distinctIpAddress.size());
|
||||
throw new Exception(es);
|
||||
}
|
||||
|
||||
@@ -78,7 +82,7 @@ public abstract class BaseStateStoreProxyImpl implements StateStoreProxy {
|
||||
|
||||
@Override
|
||||
public List<AddressInfo> getAddressInfo(final String nodeIpAddress,
|
||||
final String redisAddress,
|
||||
final String redisAddress,
|
||||
int numRetries) {
|
||||
int count = 0;
|
||||
while (count < numRetries) {
|
||||
@@ -86,11 +90,11 @@ public abstract class BaseStateStoreProxyImpl implements StateStoreProxy {
|
||||
return doGetAddressInfo(nodeIpAddress, redisAddress);
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
RayLog.core.warn("Error occurred in BaseStateStoreProxyImpl getAddressInfo, "
|
||||
+ (numRetries - count) + " retries remaining", e);
|
||||
RayLog.core.warn("Error occurred in StateStoreProxyImpl getAddressInfo, "
|
||||
+ (numRetries - count) + " retries remaining", e);
|
||||
TimeUnit.MILLISECONDS.sleep(1000);
|
||||
} catch (InterruptedException ie) {
|
||||
RayLog.core.error("error at BaseStateStoreProxyImpl getAddressInfo", e);
|
||||
RayLog.core.error("error at StateStoreProxyImpl getAddressInfo", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
@@ -108,8 +112,44 @@ public abstract class BaseStateStoreProxyImpl implements StateStoreProxy {
|
||||
* @return A list of SchedulerInfo which contains node manager or local scheduler address info.
|
||||
* @throws Exception No redis client exception.
|
||||
*/
|
||||
protected abstract List<AddressInfo> doGetAddressInfo(final String nodeIpAddress,
|
||||
final String redisAddress) throws Exception;
|
||||
public List<AddressInfo> doGetAddressInfo(final String nodeIpAddress,
|
||||
final String redisAddress) throws Exception {
|
||||
if (this.rayKvStore == null) {
|
||||
throw new Exception("no redis client when use doGetAddressInfo");
|
||||
}
|
||||
List<AddressInfo> schedulerInfo = new ArrayList<>();
|
||||
|
||||
byte[] prefix = "CLIENT".getBytes();
|
||||
byte[] postfix = UniqueId.genNil().getBytes();
|
||||
byte[] clientKey = new byte[prefix.length + postfix.length];
|
||||
System.arraycopy(prefix, 0, clientKey, 0, prefix.length);
|
||||
System.arraycopy(postfix, 0, clientKey, prefix.length, postfix.length);
|
||||
|
||||
Set<byte[]> clients = rayKvStore.zrange(clientKey, 0, -1);
|
||||
|
||||
for (byte[] clientMessage : clients) {
|
||||
ByteBuffer bb = ByteBuffer.wrap(clientMessage);
|
||||
ClientTableData client = ClientTableData.getRootAsClientTableData(bb);
|
||||
String clientNodeIpAddress = client.nodeManagerAddress();
|
||||
|
||||
String localIpAddress = NetworkUtil.getIpAddress(null);
|
||||
String redisIpAddress = redisAddress.substring(0, redisAddress.indexOf(':'));
|
||||
|
||||
boolean headNodeAddress = "127.0.0.1".equals(clientNodeIpAddress)
|
||||
&& Objects.equals(redisIpAddress, localIpAddress);
|
||||
boolean notHeadNodeAddress = Objects.equals(clientNodeIpAddress, nodeIpAddress);
|
||||
|
||||
if (headNodeAddress || notHeadNodeAddress) {
|
||||
AddressInfo si = new AddressInfo();
|
||||
si.storeName = client.objectStoreSocketName();
|
||||
si.rayletSocketName = client.rayletSocketName();
|
||||
si.managerRpcAddr = client.nodeManagerAddress();
|
||||
si.managerPort = client.nodeManagerPort();
|
||||
schedulerInfo.add(si);
|
||||
}
|
||||
}
|
||||
return schedulerInfo;
|
||||
}
|
||||
|
||||
protected String charsetDecode(byte[] bs, String charset) throws UnsupportedEncodingException {
|
||||
return new String(bs, charset);
|
||||
Reference in New Issue
Block a user