[Core] Multi-tenancy: Job isolation & implement per job config (except for env variables) (#9500)

This commit is contained in:
Kai Yang
2020-08-04 15:51:29 +08:00
committed by GitHub
parent 28b1f7710c
commit 27cd323ce1
35 changed files with 969 additions and 184 deletions
@@ -11,6 +11,7 @@ import io.ray.runtime.gcs.GcsClient;
import io.ray.runtime.gcs.GcsClientOptions;
import io.ray.runtime.gcs.RedisClient;
import io.ray.runtime.generated.Common.WorkerType;
import io.ray.runtime.generated.Gcs.JobConfig;
import io.ray.runtime.object.NativeObjectStore;
import io.ray.runtime.runner.RunManager;
import io.ray.runtime.task.NativeTaskExecutor;
@@ -106,6 +107,17 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
}
int numWorkersPerProcess =
rayConfig.workerMode == WorkerType.DRIVER ? 1 : rayConfig.numWorkersPerProcess;
byte[] serializedJobConfig = null;
if (rayConfig.workerMode == WorkerType.DRIVER) {
JobConfig.Builder jobConfigBuilder =
JobConfig.newBuilder()
.setNumJavaWorkersPerProcess(rayConfig.numWorkersPerProcess)
.addAllJvmOptions(rayConfig.jvmOptionsForJavaWorker)
.putAllWorkerEnv(rayConfig.workerEnv);
serializedJobConfig = jobConfigBuilder.build().toByteArray();
}
// TODO(qwang): Get object_store_socket_name and raylet_socket_name from Redis.
nativeInitialize(rayConfig.workerMode.getNumber(),
rayConfig.nodeIp, rayConfig.getNodeManagerPort(),
@@ -113,7 +125,7 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
rayConfig.objectStoreSocketName, rayConfig.rayletSocketName,
(rayConfig.workerMode == WorkerType.DRIVER ? rayConfig.getJobId() : JobId.NIL).getBytes(),
new GcsClientOptions(rayConfig), numWorkersPerProcess,
rayConfig.logDir, rayConfig.rayletConfigParameters);
rayConfig.logDir, rayConfig.rayletConfigParameters, serializedJobConfig);
taskExecutor = new NativeTaskExecutor(this);
workerContext = new NativeWorkerContext();
@@ -201,7 +213,7 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
int workerMode, String ndoeIpAddress,
int nodeManagerPort, String driverName, String storeSocket, String rayletSocket,
byte[] jobId, GcsClientOptions gcsClientOptions, int numWorkersPerProcess,
String logDir, Map<String, String> rayletConfigParameters);
String logDir, Map<String, String> rayletConfigParameters, byte[] serializedJobConfig);
private static native void nativeRunTaskExecutor(TaskExecutor taskExecutor);
@@ -3,6 +3,7 @@ package io.ray.runtime.config;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigException;
import com.typesafe.config.ConfigFactory;
@@ -29,6 +30,8 @@ public class RayConfig {
public static final String DEFAULT_CONFIG_FILE = "ray.default.conf";
public static final String CUSTOM_CONFIG_FILE = "ray.conf";
private static int DEFAULT_NUM_JAVA_WORKER_PER_PROCESS = 10;
private static final Random RANDOM = new Random();
private static final DateTimeFormatter DATE_TIME_FORMATTER =
@@ -90,6 +93,9 @@ public class RayConfig {
public final int numWorkersPerProcess;
public final List<String> jvmOptionsForJavaWorker;
public final Map<String, String> workerEnv;
private void validate() {
if (workerMode == WorkerType.WORKER) {
Preconditions.checkArgument(redisAddress != null,
@@ -141,6 +147,17 @@ public class RayConfig {
this.jobId = JobId.NIL;
}
// JVM options for Java workers of this job.
jvmOptionsForJavaWorker = config.getStringList("ray.job.jvm-options");
ImmutableMap.Builder<String, String> workerEnvBuilder = ImmutableMap.builder();
Config workerEnvConfig = config.getConfig("ray.job.worker-env");
if (workerEnvConfig != null) {
for (Map.Entry<String, ConfigValue> entry : workerEnvConfig.entrySet()) {
workerEnvBuilder.put(entry.getKey(), workerEnvConfig.getString(entry.getKey()));
}
}
workerEnv = workerEnvBuilder.build();
updateSessionDir();
// Object store configurations.
objectStoreSize = config.getBytes("ray.object-store.size");
@@ -206,7 +223,22 @@ public class RayConfig {
jobResourcePath = null;
}
numWorkersPerProcess = config.getInt("ray.raylet.config.num_workers_per_process_java");
boolean enableMultiTenancy = false;
if (config.hasPath("ray.raylet.config.enable_multi_tenancy")) {
enableMultiTenancy =
Boolean.valueOf(config.getString("ray.raylet.config.enable_multi_tenancy"));
}
if (!enableMultiTenancy) {
numWorkersPerProcess = config.getInt("ray.raylet.config.num_workers_per_process_java");
} else {
final int localNumWorkersPerProcess = config.getInt("ray.job.num-java-workers-per-process");
if (localNumWorkersPerProcess <= 0) {
numWorkersPerProcess = DEFAULT_NUM_JAVA_WORKER_PER_PROCESS;
} else {
numWorkersPerProcess = localNumWorkersPerProcess;
}
}
// Validate config.
validate();
@@ -29,6 +29,15 @@ ray {
// executing tasks from different jobs. E.g. if it's set to '/tmp/job_resources',
// the path for job 123 will be '/tmp/job_resources/123'.
resource-path: ""
/// The number of Java workers per worker process.
num-java-workers-per-process: 10
/// The JVM options for Java workers of the job.
jvm-options: []
// Environment variables to be set on worker processes.
worker-env {
// key1 : "value1"
// key2 : "value2"
}
}
// Configurations about logging.