mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:22:39 +08:00
[Core]Remove checkpoint table (#12235)
* Delete an actor entry from node manager. * Remove checkpoint table * remote checkpoint interface * remove checkpoint interface * fix ExitActorTest Co-authored-by: chaokunyang <shawn.ck.yang@gmail.com>
This commit is contained in:
@@ -1,100 +0,0 @@
|
||||
package io.ray.api;
|
||||
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
import java.util.List;
|
||||
|
||||
public interface Checkpointable {
|
||||
|
||||
class CheckpointContext {
|
||||
|
||||
/**
|
||||
* Actor's ID.
|
||||
*/
|
||||
public final ActorId actorId;
|
||||
/**
|
||||
* Number of tasks executed since last checkpoint.
|
||||
*/
|
||||
public final int numTasksSinceLastCheckpoint;
|
||||
/**
|
||||
* Time elapsed since last checkpoint, in milliseconds.
|
||||
*/
|
||||
public final long timeElapsedMsSinceLastCheckpoint;
|
||||
|
||||
public CheckpointContext(ActorId actorId, int numTasksSinceLastCheckpoint,
|
||||
long timeElapsedMsSinceLastCheckpoint) {
|
||||
this.actorId = actorId;
|
||||
this.numTasksSinceLastCheckpoint = numTasksSinceLastCheckpoint;
|
||||
this.timeElapsedMsSinceLastCheckpoint = timeElapsedMsSinceLastCheckpoint;
|
||||
}
|
||||
}
|
||||
|
||||
class Checkpoint {
|
||||
|
||||
/**
|
||||
* Checkpoint's ID.
|
||||
*/
|
||||
public final UniqueId checkpointId;
|
||||
/**
|
||||
* Checkpoint's timestamp.
|
||||
*/
|
||||
public final long timestamp;
|
||||
|
||||
public Checkpoint(UniqueId checkpointId, long timestamp) {
|
||||
this.checkpointId = checkpointId;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether this actor needs to be checkpointed.
|
||||
*
|
||||
* This method will be called after every task. You should implement this callback to decide
|
||||
* whether this actor needs to be checkpointed at this time, based on the checkpoint context, or
|
||||
* any other factors.
|
||||
*
|
||||
* @param checkpointContext An object that contains info about last checkpoint.
|
||||
* @return A boolean value that indicates whether this actor needs to be checkpointed.
|
||||
*/
|
||||
boolean shouldCheckpoint(CheckpointContext checkpointContext);
|
||||
|
||||
/**
|
||||
* Save a checkpoint to persistent storage.
|
||||
*
|
||||
* If `shouldCheckpoint` returns true, this method will be called. You should implement this
|
||||
* callback to save actor's checkpoint and the given checkpoint id to persistent storage.
|
||||
*
|
||||
* @param actorId Actor's ID.
|
||||
* @param checkpointId An ID that represents this actor's current state in GCS. You should
|
||||
* save this checkpoint ID together with actor's checkpoint data.
|
||||
*/
|
||||
void saveCheckpoint(ActorId actorId, UniqueId checkpointId);
|
||||
|
||||
/**
|
||||
* Load actor's previous checkpoint, and restore actor's state.
|
||||
*
|
||||
* This method will be called when an actor is restarted, after the actor's constructor. If the
|
||||
* actor needs to restore from previous checkpoint, this function should restore actor's state and
|
||||
* return the checkpoint ID. Otherwise, it should do nothing and return null.
|
||||
*
|
||||
* @param actorId Actor's ID.
|
||||
* @param availableCheckpoints A list of available checkpoint IDs and their timestamps, sorted
|
||||
* by timestamp in descending order. Note, this method must return the ID of one checkpoint in
|
||||
* this list, or null. Otherwise, an exception will be thrown.
|
||||
* @return The ID of the checkpoint from which the actor was resumed, or null if the actor should
|
||||
* restart from the beginning.
|
||||
*/
|
||||
UniqueId loadCheckpoint(ActorId actorId, List<Checkpoint> availableCheckpoints);
|
||||
|
||||
/**
|
||||
* Delete an expired checkpoint;
|
||||
*
|
||||
* This method will be called when an checkpoint is expired. You should implement this method to
|
||||
* delete your application checkpoint data. Note, the maximum number of checkpoints kept in the
|
||||
* backend can be configured at `RayConfig.num_actor_checkpoints_to_keep`.
|
||||
*
|
||||
* @param actorId ID of the actor.
|
||||
* @param checkpointId ID of the checkpoint that has expired.
|
||||
*/
|
||||
void checkpointExpired(ActorId actorId, UniqueId checkpointId);
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package io.ray.runtime.gcs;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import io.ray.api.Checkpointable.Checkpoint;
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.BaseId;
|
||||
import io.ray.api.id.JobId;
|
||||
@@ -12,7 +11,6 @@ import io.ray.api.id.UniqueId;
|
||||
import io.ray.api.placementgroup.PlacementGroup;
|
||||
import io.ray.api.runtimecontext.NodeInfo;
|
||||
import io.ray.runtime.generated.Gcs;
|
||||
import io.ray.runtime.generated.Gcs.ActorCheckpointIdData;
|
||||
import io.ray.runtime.generated.Gcs.GcsNodeInfo;
|
||||
import io.ray.runtime.generated.Gcs.TablePrefix;
|
||||
import io.ray.runtime.placementgroup.PlacementGroupUtils;
|
||||
@@ -175,33 +173,6 @@ public class GcsClient {
|
||||
return client.exists(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the available checkpoints for the given actor ID.
|
||||
*/
|
||||
public List<Checkpoint> getCheckpointsForActor(ActorId actorId) {
|
||||
List<Checkpoint> checkpoints = new ArrayList<>();
|
||||
byte[] result = globalStateAccessor.getActorCheckpointId(actorId);
|
||||
if (result != null) {
|
||||
ActorCheckpointIdData data = null;
|
||||
try {
|
||||
data = ActorCheckpointIdData.parseFrom(result);
|
||||
} catch (InvalidProtocolBufferException e) {
|
||||
throw new RuntimeException("Received invalid protobuf data from GCS.");
|
||||
}
|
||||
UniqueId[] checkpointIds = new UniqueId[data.getCheckpointIdsCount()];
|
||||
for (int i = 0; i < checkpointIds.length; i++) {
|
||||
checkpointIds[i] = UniqueId
|
||||
.fromByteBuffer(data.getCheckpointIds(i).asReadOnlyByteBuffer());
|
||||
}
|
||||
|
||||
for (int i = 0; i < checkpointIds.length; i++) {
|
||||
checkpoints.add(new Checkpoint(checkpointIds[i], data.getTimestamps(i)));
|
||||
}
|
||||
}
|
||||
checkpoints.sort((x, y) -> Long.compare(y.timestamp, x.timestamp));
|
||||
return checkpoints;
|
||||
}
|
||||
|
||||
public JobId nextJobId() {
|
||||
int jobCounter = (int) primary.incr("JobCounter".getBytes());
|
||||
return JobId.fromInt(jobCounter);
|
||||
|
||||
@@ -125,17 +125,6 @@ public class GlobalStateAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return An actor checkpoint id data with ActorCheckpointIdData protobuf schema.
|
||||
*/
|
||||
public byte[] getActorCheckpointId(ActorId actorId) {
|
||||
// Fetch an actor checkpoint id with protobuf bytes format from GCS.
|
||||
synchronized (GlobalStateAccessor.class) {
|
||||
validateGlobalStateAccessorPointer();
|
||||
return this.nativeGetActorCheckpointId(globalStateAccessorNativePointer, actorId.getBytes());
|
||||
}
|
||||
}
|
||||
|
||||
private void destroyGlobalStateAccessor() {
|
||||
synchronized (GlobalStateAccessor.class) {
|
||||
if (0 == globalStateAccessorNativePointer) {
|
||||
@@ -164,8 +153,6 @@ public class GlobalStateAccessor {
|
||||
|
||||
private native byte[] nativeGetActorInfo(long nativePtr, byte[] actorId);
|
||||
|
||||
private native byte[] nativeGetActorCheckpointId(long nativePtr, byte[] actorId);
|
||||
|
||||
private native byte[] nativeGetPlacementGroupInfo(long nativePtr,
|
||||
byte[] placementGroupId);
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
package io.ray.runtime.task;
|
||||
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
import io.ray.runtime.RayRuntimeInternal;
|
||||
|
||||
@@ -34,11 +33,4 @@ public class LocalModeTaskExecutor extends TaskExecutor<LocalModeTaskExecutor.Lo
|
||||
return new LocalActorContext(runtime.getWorkerContext().getCurrentWorkerId());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void maybeSaveCheckpoint(Object actor, ActorId actorId) {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void maybeLoadCheckpoint(Object actor, ActorId actorId) {
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,39 +1,14 @@
|
||||
package io.ray.runtime.task;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import io.ray.api.Checkpointable;
|
||||
import io.ray.api.Checkpointable.Checkpoint;
|
||||
import io.ray.api.Checkpointable.CheckpointContext;
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
import io.ray.runtime.RayRuntimeInternal;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Task executor for cluster mode.
|
||||
*/
|
||||
public class NativeTaskExecutor extends TaskExecutor<NativeTaskExecutor.NativeActorContext> {
|
||||
|
||||
// TODO(hchen): Use the C++ config.
|
||||
private static final int NUM_ACTOR_CHECKPOINTS_TO_KEEP = 20;
|
||||
|
||||
static class NativeActorContext extends TaskExecutor.ActorContext {
|
||||
|
||||
/**
|
||||
* Number of tasks executed since last actor checkpoint.
|
||||
*/
|
||||
private int numTasksSinceLastCheckpoint = 0;
|
||||
|
||||
/**
|
||||
* IDs of this actor's previous checkpoints.
|
||||
*/
|
||||
private List<UniqueId> checkpointIds;
|
||||
|
||||
/**
|
||||
* Timestamp of the last actor checkpoint.
|
||||
*/
|
||||
private long lastCheckpointTimestamp = 0;
|
||||
}
|
||||
|
||||
public NativeTaskExecutor(RayRuntimeInternal runtime) {
|
||||
@@ -49,63 +24,4 @@ public class NativeTaskExecutor extends TaskExecutor<NativeTaskExecutor.NativeAc
|
||||
// This is to make sure no memory leak when `Ray.exitActor()` is called.
|
||||
removeActorContext(new UniqueId(workerIdBytes));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void maybeSaveCheckpoint(Object actor, ActorId actorId) {
|
||||
if (!(actor instanceof Checkpointable)) {
|
||||
return;
|
||||
}
|
||||
NativeActorContext actorContext = getActorContext();
|
||||
CheckpointContext checkpointContext = new CheckpointContext(actorId,
|
||||
++actorContext.numTasksSinceLastCheckpoint,
|
||||
System.currentTimeMillis() - actorContext.lastCheckpointTimestamp);
|
||||
Checkpointable checkpointable = (Checkpointable) actor;
|
||||
if (!checkpointable.shouldCheckpoint(checkpointContext)) {
|
||||
return;
|
||||
}
|
||||
actorContext.numTasksSinceLastCheckpoint = 0;
|
||||
actorContext.lastCheckpointTimestamp = System.currentTimeMillis();
|
||||
UniqueId checkpointId = new UniqueId(nativePrepareCheckpoint());
|
||||
List<UniqueId> checkpointIds = actorContext.checkpointIds;
|
||||
checkpointIds.add(checkpointId);
|
||||
if (checkpointIds.size() > NUM_ACTOR_CHECKPOINTS_TO_KEEP) {
|
||||
((Checkpointable) actor).checkpointExpired(actorId, checkpointIds.get(0));
|
||||
checkpointIds.remove(0);
|
||||
}
|
||||
checkpointable.saveCheckpoint(actorId, checkpointId);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void maybeLoadCheckpoint(Object actor, ActorId actorId) {
|
||||
if (!(actor instanceof Checkpointable)) {
|
||||
return;
|
||||
}
|
||||
NativeActorContext actorContext = getActorContext();
|
||||
actorContext.numTasksSinceLastCheckpoint = 0;
|
||||
actorContext.lastCheckpointTimestamp = System.currentTimeMillis();
|
||||
actorContext.checkpointIds = new ArrayList<>();
|
||||
List<Checkpoint> availableCheckpoints
|
||||
= runtime.getGcsClient().getCheckpointsForActor(actorId);
|
||||
if (availableCheckpoints.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
UniqueId checkpointId = ((Checkpointable) actor).loadCheckpoint(actorId, availableCheckpoints);
|
||||
if (checkpointId != null) {
|
||||
boolean checkpointValid = false;
|
||||
for (Checkpoint checkpoint : availableCheckpoints) {
|
||||
if (checkpoint.checkpointId.equals(checkpointId)) {
|
||||
checkpointValid = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Preconditions.checkArgument(checkpointValid,
|
||||
"'loadCheckpoint' must return a checkpoint ID that exists in the "
|
||||
+ "'availableCheckpoints' list, or null.");
|
||||
nativeNotifyActorResumedFromCheckpoint(checkpointId.getBytes());
|
||||
}
|
||||
}
|
||||
|
||||
private static native byte[] nativePrepareCheckpoint();
|
||||
|
||||
private static native void nativeNotifyActorResumedFromCheckpoint(byte[] checkpointId);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package io.ray.runtime.task;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.JobId;
|
||||
import io.ray.api.id.TaskId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
@@ -150,16 +149,10 @@ public abstract class TaskExecutor<T extends TaskExecutor.ActorContext> {
|
||||
}
|
||||
// Set result
|
||||
if (taskType != TaskType.ACTOR_CREATION_TASK) {
|
||||
if (taskType == TaskType.ACTOR_TASK) {
|
||||
// TODO (kfstorm): handle checkpoint in core worker.
|
||||
maybeSaveCheckpoint(actor, runtime.getWorkerContext().getCurrentActorId());
|
||||
}
|
||||
if (rayFunction.hasReturn()) {
|
||||
returnObjects.add(ObjectSerializer.serialize(result));
|
||||
}
|
||||
} else {
|
||||
// TODO (kfstorm): handle checkpoint in core worker.
|
||||
maybeLoadCheckpoint(result, runtime.getWorkerContext().getCurrentActorId());
|
||||
actorContext.currentActor = result;
|
||||
}
|
||||
LOGGER.debug("Finished executing task {}", taskId);
|
||||
@@ -195,7 +188,4 @@ public abstract class TaskExecutor<T extends TaskExecutor.ActorContext> {
|
||||
rayFunctionInfo.get(2));
|
||||
}
|
||||
|
||||
protected abstract void maybeSaveCheckpoint(Object actor, ActorId actorId);
|
||||
|
||||
protected abstract void maybeLoadCheckpoint(Object actor, ActorId actorId);
|
||||
}
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
package io.ray.test;
|
||||
|
||||
import io.ray.api.ActorHandle;
|
||||
import io.ray.api.Checkpointable;
|
||||
import io.ray.api.Ray;
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
import io.ray.runtime.exception.RayActorException;
|
||||
import io.ray.runtime.util.SystemUtil;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
@@ -73,68 +69,5 @@ public class ActorRestartTest extends BaseTest {
|
||||
// We should receive a RayActorException because the actor is dead.
|
||||
}
|
||||
}
|
||||
|
||||
public static class CheckpointableCounter extends Counter implements Checkpointable {
|
||||
|
||||
private boolean resumedFromCheckpoint = false;
|
||||
private boolean increaseCalled = false;
|
||||
|
||||
@Override
|
||||
public int increase() {
|
||||
increaseCalled = true;
|
||||
return super.increase();
|
||||
}
|
||||
|
||||
public boolean wasResumedFromCheckpoint() {
|
||||
return resumedFromCheckpoint;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldCheckpoint(CheckpointContext checkpointContext) {
|
||||
// Checkpoint the actor when value is increased to 3.
|
||||
boolean shouldCheckpoint = increaseCalled && value == 3;
|
||||
increaseCalled = false;
|
||||
return shouldCheckpoint;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void saveCheckpoint(ActorId actorId, UniqueId checkpointId) {
|
||||
// In practice, user should save the checkpoint id and data to a persistent store.
|
||||
// But for simplicity, we don't do that in this unit test.
|
||||
}
|
||||
|
||||
@Override
|
||||
public UniqueId loadCheckpoint(ActorId actorId, List<Checkpoint> availableCheckpoints) {
|
||||
// Restore previous value and return checkpoint id.
|
||||
this.value = 3;
|
||||
this.resumedFromCheckpoint = true;
|
||||
return availableCheckpoints.get(availableCheckpoints.size() - 1).checkpointId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkpointExpired(ActorId actorId, UniqueId checkpointId) {
|
||||
}
|
||||
}
|
||||
|
||||
public void testActorCheckpointing() throws IOException, InterruptedException {
|
||||
ActorHandle<CheckpointableCounter> actor = Ray.actor(CheckpointableCounter::new)
|
||||
.setMaxRestarts(1).remote();
|
||||
// Call increase 3 times.
|
||||
for (int i = 0; i < 3; i++) {
|
||||
actor.task(CheckpointableCounter::increase).remote().get();
|
||||
}
|
||||
// Assert that the actor wasn't resumed from a checkpoint.
|
||||
Assert.assertFalse(actor.task(CheckpointableCounter::wasResumedFromCheckpoint).remote().get());
|
||||
int pid = actor.task(CheckpointableCounter::getPid).remote().get();
|
||||
Runtime.getRuntime().exec("kill -9 " + pid);
|
||||
// Wait for the actor to be killed.
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
|
||||
// Try calling increase on this actor again and check the value is now 4.
|
||||
int value = actor.task(CheckpointableCounter::increase).remote().get();
|
||||
Assert.assertEquals(value, 4);
|
||||
// Assert that the actor was resumed from a checkpoint.
|
||||
Assert.assertTrue(actor.task(CheckpointableCounter::wasResumedFromCheckpoint).remote().get());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,17 +3,13 @@ package io.ray.test;
|
||||
import static io.ray.runtime.util.SystemUtil.pid;
|
||||
|
||||
import io.ray.api.ActorHandle;
|
||||
import io.ray.api.Checkpointable;
|
||||
import io.ray.api.ObjectRef;
|
||||
import io.ray.api.Ray;
|
||||
import io.ray.api.id.ActorId;
|
||||
import io.ray.api.id.UniqueId;
|
||||
import io.ray.runtime.exception.RayActorException;
|
||||
import io.ray.runtime.task.TaskExecutor;
|
||||
import io.ray.runtime.util.SystemUtil;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.testng.Assert;
|
||||
@@ -22,7 +18,7 @@ import org.testng.annotations.Test;
|
||||
@Test(groups = {"cluster"})
|
||||
public class ExitActorTest extends BaseTest {
|
||||
|
||||
private static class ExitingActor implements Checkpointable {
|
||||
private static class ExitingActor {
|
||||
|
||||
int counter = 0;
|
||||
|
||||
@@ -45,26 +41,6 @@ public class ExitActorTest extends BaseTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldCheckpoint(CheckpointContext checkpointContext) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void saveCheckpoint(ActorId actorId, UniqueId checkpointId) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public UniqueId loadCheckpoint(ActorId actorId, List<Checkpoint> availableCheckpoints) {
|
||||
// Dummy load checkpoint.
|
||||
this.counter = 1;
|
||||
return availableCheckpoints.get(availableCheckpoints.size() - 1).checkpointId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkpointExpired(ActorId actorId, UniqueId checkpointId) {
|
||||
}
|
||||
|
||||
public boolean exit() {
|
||||
Ray.exitActor();
|
||||
return false;
|
||||
@@ -79,7 +55,7 @@ public class ExitActorTest extends BaseTest {
|
||||
Runtime.getRuntime().exec("kill -9 " + pid);
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
// Make sure this actor can be reconstructed.
|
||||
Assert.assertEquals(2, (int) actor.task(ExitingActor::incr).remote().get());
|
||||
Assert.assertEquals(1, (int) actor.task(ExitingActor::incr).remote().get());
|
||||
|
||||
// `exitActor` will exit the actor without reconstructing.
|
||||
ObjectRef<Boolean> obj = actor.task(ExitingActor::exit).remote();
|
||||
|
||||
Reference in New Issue
Block a user