mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
[Core] Multi-tenancy: Job isolation & implement per job config (except for env variables) (#9500)
This commit is contained in:
@@ -11,6 +11,7 @@ import io.ray.runtime.gcs.GcsClient;
|
||||
import io.ray.runtime.gcs.GcsClientOptions;
|
||||
import io.ray.runtime.gcs.RedisClient;
|
||||
import io.ray.runtime.generated.Common.WorkerType;
|
||||
import io.ray.runtime.generated.Gcs.JobConfig;
|
||||
import io.ray.runtime.object.NativeObjectStore;
|
||||
import io.ray.runtime.runner.RunManager;
|
||||
import io.ray.runtime.task.NativeTaskExecutor;
|
||||
@@ -106,6 +107,17 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
}
|
||||
int numWorkersPerProcess =
|
||||
rayConfig.workerMode == WorkerType.DRIVER ? 1 : rayConfig.numWorkersPerProcess;
|
||||
|
||||
byte[] serializedJobConfig = null;
|
||||
if (rayConfig.workerMode == WorkerType.DRIVER) {
|
||||
JobConfig.Builder jobConfigBuilder =
|
||||
JobConfig.newBuilder()
|
||||
.setNumJavaWorkersPerProcess(rayConfig.numWorkersPerProcess)
|
||||
.addAllJvmOptions(rayConfig.jvmOptionsForJavaWorker)
|
||||
.putAllWorkerEnv(rayConfig.workerEnv);
|
||||
serializedJobConfig = jobConfigBuilder.build().toByteArray();
|
||||
}
|
||||
|
||||
// TODO(qwang): Get object_store_socket_name and raylet_socket_name from Redis.
|
||||
nativeInitialize(rayConfig.workerMode.getNumber(),
|
||||
rayConfig.nodeIp, rayConfig.getNodeManagerPort(),
|
||||
@@ -113,7 +125,7 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
rayConfig.objectStoreSocketName, rayConfig.rayletSocketName,
|
||||
(rayConfig.workerMode == WorkerType.DRIVER ? rayConfig.getJobId() : JobId.NIL).getBytes(),
|
||||
new GcsClientOptions(rayConfig), numWorkersPerProcess,
|
||||
rayConfig.logDir, rayConfig.rayletConfigParameters);
|
||||
rayConfig.logDir, rayConfig.rayletConfigParameters, serializedJobConfig);
|
||||
|
||||
taskExecutor = new NativeTaskExecutor(this);
|
||||
workerContext = new NativeWorkerContext();
|
||||
@@ -201,7 +213,7 @@ public final class RayNativeRuntime extends AbstractRayRuntime {
|
||||
int workerMode, String ndoeIpAddress,
|
||||
int nodeManagerPort, String driverName, String storeSocket, String rayletSocket,
|
||||
byte[] jobId, GcsClientOptions gcsClientOptions, int numWorkersPerProcess,
|
||||
String logDir, Map<String, String> rayletConfigParameters);
|
||||
String logDir, Map<String, String> rayletConfigParameters, byte[] serializedJobConfig);
|
||||
|
||||
private static native void nativeRunTaskExecutor(TaskExecutor taskExecutor);
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ package io.ray.runtime.config;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.typesafe.config.Config;
|
||||
import com.typesafe.config.ConfigException;
|
||||
import com.typesafe.config.ConfigFactory;
|
||||
@@ -29,6 +30,8 @@ public class RayConfig {
|
||||
public static final String DEFAULT_CONFIG_FILE = "ray.default.conf";
|
||||
public static final String CUSTOM_CONFIG_FILE = "ray.conf";
|
||||
|
||||
/** Fallback used when `ray.job.num-java-workers-per-process` is configured as zero or less. */
private static final int DEFAULT_NUM_JAVA_WORKER_PER_PROCESS = 10;
|
||||
|
||||
private static final Random RANDOM = new Random();
|
||||
|
||||
private static final DateTimeFormatter DATE_TIME_FORMATTER =
|
||||
@@ -90,6 +93,9 @@ public class RayConfig {
|
||||
|
||||
public final int numWorkersPerProcess;
|
||||
|
||||
public final List<String> jvmOptionsForJavaWorker;
|
||||
public final Map<String, String> workerEnv;
|
||||
|
||||
private void validate() {
|
||||
if (workerMode == WorkerType.WORKER) {
|
||||
Preconditions.checkArgument(redisAddress != null,
|
||||
@@ -141,6 +147,17 @@ public class RayConfig {
|
||||
this.jobId = JobId.NIL;
|
||||
}
|
||||
|
||||
// jvm options for java workers of this job.
|
||||
jvmOptionsForJavaWorker = config.getStringList("ray.job.jvm-options");
|
||||
|
||||
ImmutableMap.Builder<String, String> workerEnvBuilder = ImmutableMap.builder();
|
||||
Config workerEnvConfig = config.getConfig("ray.job.worker-env");
|
||||
if (workerEnvConfig != null) {
|
||||
for (Map.Entry<String, ConfigValue> entry : workerEnvConfig.entrySet()) {
|
||||
workerEnvBuilder.put(entry.getKey(), workerEnvConfig.getString(entry.getKey()));
|
||||
}
|
||||
}
|
||||
workerEnv = workerEnvBuilder.build();
|
||||
updateSessionDir();
|
||||
// Object store configurations.
|
||||
objectStoreSize = config.getBytes("ray.object-store.size");
|
||||
@@ -206,7 +223,22 @@ public class RayConfig {
|
||||
jobResourcePath = null;
|
||||
}
|
||||
|
||||
numWorkersPerProcess = config.getInt("ray.raylet.config.num_workers_per_process_java");
|
||||
boolean enableMultiTenancy = false;
|
||||
if (config.hasPath("ray.raylet.config.enable_multi_tenancy")) {
|
||||
enableMultiTenancy =
|
||||
Boolean.valueOf(config.getString("ray.raylet.config.enable_multi_tenancy"));
|
||||
}
|
||||
|
||||
if (!enableMultiTenancy) {
|
||||
numWorkersPerProcess = config.getInt("ray.raylet.config.num_workers_per_process_java");
|
||||
} else {
|
||||
final int localNumWorkersPerProcess = config.getInt("ray.job.num-java-workers-per-process");
|
||||
if (localNumWorkersPerProcess <= 0) {
|
||||
numWorkersPerProcess = DEFAULT_NUM_JAVA_WORKER_PER_PROCESS;
|
||||
} else {
|
||||
numWorkersPerProcess = localNumWorkersPerProcess;
|
||||
}
|
||||
}
|
||||
|
||||
// Validate config.
|
||||
validate();
|
||||
|
||||
@@ -29,6 +29,15 @@ ray {
|
||||
// executing tasks from different jobs. E.g. if it's set to '/tmp/job_resources',
|
||||
// the path for job 123 will be '/tmp/job_resources/123'.
|
||||
resource-path: ""
|
||||
/// The number of Java workers per worker process.
|
||||
num-java-workers-per-process: 10
|
||||
/// The jvm options for java workers of the job.
|
||||
jvm-options: []
|
||||
// Environment variables to be set on worker processes.
|
||||
worker-env {
|
||||
// key1 : "value1"
|
||||
// key2 : "value2"
|
||||
}
|
||||
}
|
||||
|
||||
// Configurations about logging.
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
package io.ray.test;
|
||||
|
||||
import io.ray.api.ActorHandle;
|
||||
import io.ray.api.ObjectRef;
|
||||
import io.ray.api.Ray;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterClass;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test(groups = {"cluster"})
|
||||
public class JobConfigTest extends BaseTest {
|
||||
|
||||
@BeforeClass
|
||||
public void setupJobConfig() {
|
||||
System.setProperty("ray.raylet.config.enable_multi_tenancy", "true");
|
||||
System.setProperty("ray.job.num-java-workers-per-process", "3");
|
||||
System.setProperty("ray.job.jvm-options.0", "-DX=999");
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public void tearDownJobConfig() {
|
||||
System.clearProperty("ray.raylet.config.enable_multi_tenancy");
|
||||
System.clearProperty("ray.job.num-java-workers-per-process");
|
||||
System.clearProperty("ray.job.jvm-options.0");
|
||||
}
|
||||
|
||||
public static String getJvmOptions() {
|
||||
return System.getProperty("X");
|
||||
}
|
||||
|
||||
public static Integer getWorkersNum() {
|
||||
return TestUtils.getRuntime().getRayConfig().numWorkersPerProcess;
|
||||
}
|
||||
|
||||
public static class MyActor {
|
||||
|
||||
public Integer getWorkersNum() {
|
||||
return TestUtils.getRuntime().getRayConfig().numWorkersPerProcess;
|
||||
}
|
||||
|
||||
public String getJvmOptions() {
|
||||
return System.getProperty("X");
|
||||
}
|
||||
}
|
||||
|
||||
public void testJvmOptions() {
|
||||
ObjectRef<String> obj = Ray.task(JobConfigTest::getJvmOptions).remote();
|
||||
Assert.assertEquals("999", obj.get());
|
||||
}
|
||||
|
||||
public void testNumJavaWorkerPerProcess() {
|
||||
ObjectRef<Integer> obj = Ray.task(JobConfigTest::getWorkersNum).remote();
|
||||
Assert.assertEquals(3, (int) obj.get());
|
||||
}
|
||||
|
||||
|
||||
public void testInActor() {
|
||||
ActorHandle<MyActor> actor = Ray.actor(MyActor::new).remote();
|
||||
|
||||
// test jvm options.
|
||||
ObjectRef<String> obj1 = actor.task(MyActor::getJvmOptions).remote();
|
||||
Assert.assertEquals("999", obj1.get());
|
||||
|
||||
// test workers number.
|
||||
ObjectRef<Integer> obj2 = actor.task(MyActor::getWorkersNum).remote();
|
||||
Assert.assertEquals(3, (int) obj2.get());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,130 @@
|
||||
package io.ray.test;
|
||||
|
||||
import io.ray.api.ActorHandle;
|
||||
import io.ray.api.ObjectRef;
|
||||
import io.ray.api.Ray;
|
||||
import io.ray.runtime.config.RayConfig;
|
||||
import io.ray.runtime.util.SystemUtil;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.lang.ProcessBuilder.Redirect;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.AfterClass;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
@Test(groups = {"cluster"})
|
||||
public class MultiDriverTest extends BaseTest {
|
||||
|
||||
private static final int DRIVER_COUNT = 10;
|
||||
private static final int NORMAL_TASK_COUNT_PER_DRIVER = 100;
|
||||
private static final int ACTOR_COUNT_PER_DRIVER = 10;
|
||||
private static final String PID_LIST_PREFIX = "PID: ";
|
||||
|
||||
@BeforeClass
|
||||
public void setUpClass() {
|
||||
System.setProperty("ray.raylet.config.enable_multi_tenancy", "true");
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public void tearDownClass() {
|
||||
System.clearProperty("ray.raylet.config.enable_multi_tenancy");
|
||||
}
|
||||
|
||||
static int getPid() {
|
||||
return SystemUtil.pid();
|
||||
}
|
||||
|
||||
public static class Actor {
|
||||
|
||||
public int getPid() {
|
||||
return SystemUtil.pid();
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Ray.init();
|
||||
|
||||
List<ObjectRef<Integer>> pidObjectList = new ArrayList<>();
|
||||
// Submit some normal tasks and get the PIDs of workers which execute the tasks.
|
||||
for (int i = 0; i < NORMAL_TASK_COUNT_PER_DRIVER; ++i) {
|
||||
pidObjectList.add(Ray.task(MultiDriverTest::getPid).remote());
|
||||
}
|
||||
// Create some actors and get the PIDs of actors.
|
||||
for (int i = 0; i < ACTOR_COUNT_PER_DRIVER; ++i) {
|
||||
ActorHandle<Actor> actor = Ray.actor(Actor::new).remote();
|
||||
pidObjectList.add(actor.task(Actor::getPid).remote());
|
||||
}
|
||||
Set<Integer> pids = new HashSet<>();
|
||||
for (ObjectRef<Integer> object : pidObjectList) {
|
||||
pids.add(object.get());
|
||||
}
|
||||
// Write pids to stdout
|
||||
System.out.println(
|
||||
PID_LIST_PREFIX + pids.stream().map(String::valueOf).collect(Collectors.joining(",")));
|
||||
}
|
||||
|
||||
public void testMultiDrivers() throws InterruptedException, IOException {
|
||||
// This test case starts some driver processes. Each driver process submits some tasks and
|
||||
// collect the PIDs of the workers used by the driver. The drivers output the PID list
|
||||
// which will be read by the test case itself. The test case will compare the PIDs used by
|
||||
// different drivers and make sure that all the PIDs don't overlap. If overlapped, it means that
|
||||
// tasks owned by different drivers were scheduled to the same worker process, that is, tasks
|
||||
// of different jobs were not correctly isolated during execution.
|
||||
List<Process> drivers = new ArrayList<>();
|
||||
for (int i = 0; i < DRIVER_COUNT; ++i) {
|
||||
drivers.add(startDriver());
|
||||
}
|
||||
|
||||
// Wait for drivers to finish.
|
||||
for (Process driver : drivers) {
|
||||
driver.waitFor();
|
||||
Assert.assertEquals(driver.exitValue(), 0,
|
||||
"The driver exited with code " + driver.exitValue());
|
||||
}
|
||||
|
||||
// Read driver outputs and check for any PID overlap.
|
||||
Set<Integer> pids = new HashSet<>();
|
||||
for (Process driver : drivers) {
|
||||
try (BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(driver.getInputStream()))) {
|
||||
String line;
|
||||
int previousSize = pids.size();
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.startsWith(PID_LIST_PREFIX)) {
|
||||
for (String pidString : line.substring(PID_LIST_PREFIX.length()).split(",")) {
|
||||
// Make sure the PIDs don't overlap.
|
||||
Assert.assertTrue(pids.add(Integer.valueOf(pidString)),
|
||||
"Worker process with PID " + line + " is shared by multiple drivers.");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
int nowSize = pids.size();
|
||||
Assert.assertTrue(nowSize > previousSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Process startDriver() throws IOException {
|
||||
RayConfig rayConfig = TestUtils.getRuntime().getRayConfig();
|
||||
|
||||
ProcessBuilder builder = new ProcessBuilder(
|
||||
"java",
|
||||
"-cp",
|
||||
System.getProperty("java.class.path"),
|
||||
"-Dray.redis.address=" + rayConfig.getRedisAddress(),
|
||||
"-Dray.object-store.socket-name=" + rayConfig.objectStoreSocketName,
|
||||
"-Dray.raylet.socket-name=" + rayConfig.rayletSocketName,
|
||||
"-Dray.raylet.node-manager-port=" + String.valueOf(rayConfig.getNodeManagerPort()),
|
||||
MultiDriverTest.class.getName());
|
||||
builder.redirectError(Redirect.INHERIT);
|
||||
return builder.start();
|
||||
}
|
||||
}
|
||||
@@ -9,8 +9,8 @@ import org.testng.annotations.Test;
|
||||
|
||||
public class RayletConfigTest extends BaseTest {
|
||||
|
||||
private static final String RAY_CONFIG_KEY = "num_workers_per_process_java";
|
||||
private static final String RAY_CONFIG_VALUE = "2";
|
||||
private static final String RAY_CONFIG_KEY = "get_timeout_milliseconds";
|
||||
private static final String RAY_CONFIG_VALUE = "1234";
|
||||
|
||||
@BeforeClass
|
||||
public void beforeClass() {
|
||||
|
||||
Reference in New Issue
Block a user