[Java] Simplify Ray.init() by invoking ray start internally (#10762)

This commit is contained in:
Kai Yang
2020-12-04 14:33:45 +08:00
committed by GitHub
parent 8cebe1e79c
commit 21fcee28f9
39 changed files with 367 additions and 1085 deletions
@@ -1,30 +1,16 @@
package io.ray.streaming.api.context;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.gson.Gson;
import io.ray.api.Ray;
import io.ray.runtime.config.RayConfig;
import io.ray.runtime.util.NetworkUtil;
import java.io.File;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
class ClusterStarter {
private static final Logger LOG = LoggerFactory.getLogger(ClusterStarter.class);
private static final String PLASMA_STORE_SOCKET_NAME = "/tmp/ray/plasma_store_socket";
private static final String RAYLET_SOCKET_NAME = "/tmp/ray/raylet_socket";
static synchronized void startCluster(boolean isCrossLanguage, boolean isLocal) {
static synchronized void startCluster(boolean isLocal) {
Preconditions.checkArgument(!Ray.isInitialized());
RayConfig.reset();
if (!isLocal) {
System.setProperty("ray.raylet.config.num_workers_per_process_java", "1");
System.setProperty("ray.run-mode", "CLUSTER");
@@ -33,97 +19,13 @@ class ClusterStarter {
System.setProperty("ray.run-mode", "SINGLE_PROCESS");
}
if (!isCrossLanguage) {
Ray.init();
return;
}
// Delete existing socket files.
for (String socket : ImmutableList.of(RAYLET_SOCKET_NAME, PLASMA_STORE_SOCKET_NAME)) {
File file = new File(socket);
if (file.exists()) {
LOG.info("Delete existing socket file {}", file);
file.delete();
}
}
String nodeManagerPort = String.valueOf(NetworkUtil.getUnusedPort());
// Jars in the `ray` wheel don't contain test classes, so we add test classes explicitly.
// Since mvn test classes contain `test` in their path and bazel test classes are located in a
// jar with `test` in its name, we can select classpath entries containing `test` to pick them up.
String classpath = Stream.of(System.getProperty("java.class.path").split(":"))
.filter(s -> !s.contains(" ") && s.contains("test"))
.collect(Collectors.joining(":"));
String workerOptions = new Gson().toJson(ImmutableList.of("-classpath", classpath));
Map<String, String> config = new HashMap<>(RayConfig.create().rayletConfigParameters);
config.put("num_workers_per_process_java", "1");
// Start ray cluster.
List<String> startCommand = ImmutableList.of(
"ray",
"start",
"--head",
"--port=6379",
String.format("--plasma-store-socket-name=%s", PLASMA_STORE_SOCKET_NAME),
String.format("--raylet-socket-name=%s", RAYLET_SOCKET_NAME),
String.format("--node-manager-port=%s", nodeManagerPort),
"--load-code-from-local",
"--java-worker-options=" + workerOptions,
"--system-config=" + new Gson().toJson(config)
);
if (!executeCommand(startCommand, 10)) {
throw new RuntimeException("Couldn't start ray cluster.");
}
// Connect to the cluster.
System.setProperty("ray.address", "127.0.0.1:6379");
System.setProperty("ray.object-store.socket-name", PLASMA_STORE_SOCKET_NAME);
System.setProperty("ray.raylet.socket-name", RAYLET_SOCKET_NAME);
System.setProperty("ray.raylet.node-manager-port", nodeManagerPort);
Ray.init();
}
public static synchronized void stopCluster(boolean isCrossLanguage) {
public static synchronized void stopCluster() {
// Disconnect from the cluster.
Ray.shutdown();
System.clearProperty("ray.address");
System.clearProperty("ray.object-store.socket-name");
System.clearProperty("ray.raylet.socket-name");
System.clearProperty("ray.raylet.node-manager-port");
System.clearProperty("ray.raylet.config.num_workers_per_process_java");
System.clearProperty("ray.run-mode");
if (isCrossLanguage) {
// Stop ray cluster.
final List<String> stopCommand = ImmutableList.of(
"ray",
"stop"
);
if (!executeCommand(stopCommand, 10)) {
throw new RuntimeException("Couldn't stop ray cluster");
}
}
}
/**
 * Execute an external command and wait for it to finish.
 *
 * <p>The child's standard output and standard error are inherited from the current process.
 * If the command does not exit within the timeout it is forcibly destroyed and treated as
 * failed.
 *
 * @param command The command and its arguments, one list element per token.
 * @param waitTimeoutSeconds Maximum number of seconds to wait for the command to exit.
 * @return Whether the command succeeded, i.e. exited within the timeout with exit code 0.
 * @throws RuntimeException If the command could not be started or the wait was interrupted.
 */
private static boolean executeCommand(List<String> command, int waitTimeoutSeconds) {
  LOG.info("Executing command: {}", String.join(" ", command));
  try {
    ProcessBuilder processBuilder = new ProcessBuilder(command)
        .redirectOutput(ProcessBuilder.Redirect.INHERIT)
        .redirectError(ProcessBuilder.Redirect.INHERIT);
    Process process = processBuilder.start();
    boolean exited = process.waitFor(waitTimeoutSeconds, TimeUnit.SECONDS);
    if (!exited) {
      // destroyForcibly() does not wait for termination, so calling exitValue() right after
      // it may throw IllegalThreadStateException. A timed-out command is a failure anyway.
      process.destroyForcibly();
      return false;
    }
    return process.exitValue() == 0;
  } catch (InterruptedException e) {
    // Preserve the interrupt status for callers further up the stack.
    Thread.currentThread().interrupt();
    throw new RuntimeException("Error executing command " + String.join(" ", command), e);
  } catch (Exception e) {
    throw new RuntimeException("Error executing command " + String.join(" ", command), e);
  }
}
}
@@ -65,11 +65,10 @@ public class StreamingContext implements Serializable {
if (!Ray.isInitialized()) {
if (Config.MEMORY_CHANNEL.equalsIgnoreCase(jobConfig.get(Config.CHANNEL_TYPE))) {
Preconditions.checkArgument(!jobGraph.isCrossLanguageGraph());
ClusterStarter.startCluster(false, true);
ClusterStarter.startCluster(true);
LOG.info("Created local cluster for job {}.", jobName);
} else {
ClusterStarter.startCluster(jobGraph.isCrossLanguageGraph(), false);
ClusterStarter.startCluster(false);
LOG.info("Created multi process cluster for job {}.", jobName);
}
Runtime.getRuntime().addShutdownHook(new Thread(StreamingContext.this::stop));
@@ -103,7 +102,7 @@ public class StreamingContext implements Serializable {
public void stop() {
if (Ray.isInitialized()) {
ClusterStarter.stopCluster(jobGraph.isCrossLanguageGraph());
ClusterStarter.stopCluster();
}
}
}
@@ -1,6 +1,5 @@
package io.ray.streaming.jobgraph;
import io.ray.streaming.api.Language;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
@@ -138,14 +137,4 @@ public class JobGraph implements Serializable {
}
}
/**
 * Checks whether the vertices of this job graph use more than one language.
 *
 * @return {@code true} if at least one vertex's language differs from that of the first
 *     vertex; {@code false} otherwise, including when the graph has no vertices.
 */
public boolean isCrossLanguageGraph() {
  // An empty graph has no languages to compare; avoid get(0) throwing on an empty list.
  if (jobVertices.isEmpty()) {
    return false;
  }
  Language language = jobVertices.get(0).getLanguage();
  for (JobVertex jobVertex : jobVertices) {
    if (jobVertex.getLanguage() != language) {
      return true;
    }
  }
  return false;
}
}
@@ -1,6 +1,7 @@
package io.ray.streaming.runtime.transfer;
import io.ray.runtime.RayNativeRuntime;
import io.ray.runtime.util.BinaryFileUtil;
import io.ray.runtime.util.JniUtils;
/**
@@ -10,11 +11,7 @@ import io.ray.runtime.util.JniUtils;
public class TransferHandler {
static {
try {
Class.forName(RayNativeRuntime.class.getName());
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
JniUtils.loadLibrary(BinaryFileUtil.CORE_WORKER_JAVA_LIBRARY, true);
JniUtils.loadLibrary("streaming_java");
}
@@ -1,6 +1,7 @@
package io.ray.streaming.runtime.util;
import io.ray.runtime.RayNativeRuntime;
import io.ray.runtime.util.BinaryFileUtil;
import io.ray.runtime.util.JniUtils;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
@@ -29,13 +30,7 @@ public class EnvUtil {
}
public static void loadNativeLibraries() {
// Explicitly load `RayNativeRuntime`, to make sure `core_worker_library_java`
// is loaded before `streaming_java`.
try {
Class.forName(RayNativeRuntime.class.getName());
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
JniUtils.loadLibrary(BinaryFileUtil.CORE_WORKER_JAVA_LIBRARY, true);
JniUtils.loadLibrary("streaming_java");
}
@@ -58,11 +58,11 @@ public class StreamingQueueTest extends BaseUnitTest implements Serializable {
void beforeMethod() {
LOGGER.info("beforeTest");
Ray.shutdown();
System.setProperty("ray.resources", "CPU:4,RES-A:4");
System.setProperty("ray.head-args.0", "--num-cpus=4");
System.setProperty("ray.head-args.1", "--resources={\"RES-A\":4}");
System.setProperty("ray.raylet.config.num_workers_per_process_java", "1");
System.setProperty("ray.run-mode", "CLUSTER");
System.setProperty("ray.redirect-output", "true");
RayConfig.reset();
Ray.init();
}
@@ -71,6 +71,8 @@ public class StreamingQueueTest extends BaseUnitTest implements Serializable {
LOGGER.info("afterTest");
Ray.shutdown();
System.clearProperty("ray.run-mode");
System.clearProperty("ray.head-args.0");
System.clearProperty("ray.head-args.1");
}
@Test(timeOut = 300000)
@@ -78,7 +80,8 @@ public class StreamingQueueTest extends BaseUnitTest implements Serializable {
LOGGER.info("StreamingQueueTest.testReaderWriter run-mode: {}",
System.getProperty("ray.run-mode"));
Ray.shutdown();
System.setProperty("ray.resources", "CPU:4,RES-A:4");
System.setProperty("ray.head-args.0", "--num-cpus=4");
System.setProperty("ray.head-args.1", "--resources={\"RES-A\":4}");
System.setProperty("ray.raylet.config.num_workers_per_process_java", "1");
System.setProperty("ray.run-mode", "CLUSTER");
@@ -134,7 +137,8 @@ public class StreamingQueueTest extends BaseUnitTest implements Serializable {
@Test(timeOut = 60000)
public void testWordCount() {
Ray.shutdown();
System.setProperty("ray.resources", "CPU:4,RES-A:4");
System.setProperty("ray.head-args.0", "--num-cpus=4");
System.setProperty("ray.head-args.1", "--resources={\"RES-A\":4}");
System.setProperty("ray.raylet.config.num_workers_per_process_java", "1");
System.setProperty("ray.run-mode", "CLUSTER");