mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 10:45:02 +08:00
Implement actor checkpointing (#3839)
* Implement Actor checkpointing
* docs
* fix
* fix
* fix
* move restore-from-checkpoint to HandleActorStateTransition
* Revert "move restore-from-checkpoint to HandleActorStateTransition"

  This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12.
* resubmit waiting tasks when actor frontier restored
* add doc about num_actor_checkpoints_to_keep=1
* add num_actor_checkpoints_to_keep to Cython
* add checkpoint_expired api
* check if actor class is abstract
* change checkpoint_ids to long string
* implement java
* Refactor to delay actor creation publish until checkpoint is resumed
* debug, lint
* Erase from checkpoints to restore if task fails
* fix lint
* update comments
* avoid duplicated actor notification log
* fix unintended change
* add actor_id to checkpoint_expired
* small java updates
* make checkpoint info per actor
* lint
* Remove logging
* Remove old actor checkpointing Python code, move new checkpointing code to FunctionActionManager
* Replace old actor checkpointing tests
* Fix test and lint
* address comments
* consolidate kill_actor
* Remove __ray_checkpoint__
* fix non-ascii char
* Loosen test checks
* fix java
* fix sphinx-build
This commit is contained in:
@@ -4,10 +4,13 @@ import static org.ray.runtime.util.SystemUtil.pid;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.ray.api.Checkpointable;
|
||||
import org.ray.api.Ray;
|
||||
import org.ray.api.RayActor;
|
||||
import org.ray.api.annotation.RayRemote;
|
||||
import org.ray.api.id.UniqueId;
|
||||
import org.ray.api.options.ActorCreationOptions;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
@@ -17,10 +20,10 @@ public class ActorReconstructionTest extends BaseTest {
|
||||
@RayRemote()
|
||||
public static class Counter {
|
||||
|
||||
private int value = 0;
|
||||
protected int value = 0;
|
||||
|
||||
public int increase(int delta) {
|
||||
value += delta;
|
||||
public int increase() {
|
||||
value += 1;
|
||||
return value;
|
||||
}
|
||||
|
||||
@@ -35,7 +38,7 @@ public class ActorReconstructionTest extends BaseTest {
|
||||
RayActor<Counter> actor = Ray.createActor(Counter::new, options);
|
||||
// Call increase 3 times.
|
||||
for (int i = 0; i < 3; i++) {
|
||||
Ray.call(Counter::increase, actor, 1).get();
|
||||
Ray.call(Counter::increase, actor).get();
|
||||
}
|
||||
|
||||
// Kill the actor process.
|
||||
@@ -45,7 +48,7 @@ public class ActorReconstructionTest extends BaseTest {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
|
||||
// Try calling increase on this actor again and check the value is now 4.
|
||||
int value = Ray.call(Counter::increase, actor, 1).get();
|
||||
int value = Ray.call(Counter::increase, actor).get();
|
||||
Assert.assertEquals(value, 4);
|
||||
|
||||
// Kill the actor process again.
|
||||
@@ -55,7 +58,7 @@ public class ActorReconstructionTest extends BaseTest {
|
||||
|
||||
// Try calling increase on this actor again and this should fail.
|
||||
try {
|
||||
Ray.call(Counter::increase, actor, 1).get();
|
||||
Ray.call(Counter::increase, actor).get();
|
||||
Assert.fail("The above task didn't fail.");
|
||||
} catch (StringIndexOutOfBoundsException e) {
|
||||
// Raylet backend will put invalid data in task's result to indicate the task has failed.
|
||||
@@ -64,4 +67,71 @@ public class ActorReconstructionTest extends BaseTest {
|
||||
// instead of throwing this exception.
|
||||
}
|
||||
}
|
||||
|
||||
public static class CheckpointableCounter extends Counter implements Checkpointable {
|
||||
|
||||
private boolean resumedFromCheckpoint = false;
|
||||
private boolean increaseCalled = false;
|
||||
|
||||
@Override
|
||||
public int increase() {
|
||||
increaseCalled = true;
|
||||
return super.increase();
|
||||
}
|
||||
|
||||
public boolean wasResumedFromCheckpoint() {
|
||||
return resumedFromCheckpoint;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldCheckpoint(CheckpointContext checkpointContext) {
|
||||
// Checkpoint the actor when value is increased to 3.
|
||||
boolean shouldCheckpoint = increaseCalled && value == 3;
|
||||
increaseCalled = false;
|
||||
return shouldCheckpoint;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void saveCheckpoint(UniqueId actorId, UniqueId checkpointId) {
|
||||
// In practice, user should save the checkpoint id and data to a persistent store.
|
||||
// But for simplicity, we don't do that in this unit test.
|
||||
}
|
||||
|
||||
@Override
|
||||
public UniqueId loadCheckpoint(UniqueId actorId, List<Checkpoint> availableCheckpoints) {
|
||||
// Restore previous value and return checkpoint id.
|
||||
this.value = 3;
|
||||
this.resumedFromCheckpoint = true;
|
||||
return availableCheckpoints.get(availableCheckpoints.size() - 1).checkpointId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkpointExpired(UniqueId actorId, UniqueId checkpointId) {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testActorCheckpointing() throws IOException, InterruptedException {
|
||||
ActorCreationOptions options = new ActorCreationOptions(new HashMap<>(), 1);
|
||||
RayActor<CheckpointableCounter> actor = Ray.createActor(CheckpointableCounter::new, options);
|
||||
// Call increase 3 times.
|
||||
for (int i = 0; i < 3; i++) {
|
||||
Ray.call(CheckpointableCounter::increase, actor).get();
|
||||
}
|
||||
// Assert that the actor wasn't resumed from a checkpoint.
|
||||
Assert.assertFalse(Ray.call(CheckpointableCounter::wasResumedFromCheckpoint, actor).get());
|
||||
|
||||
// Kill the actor process.
|
||||
int pid = Ray.call(CheckpointableCounter::getPid, actor).get();
|
||||
Runtime.getRuntime().exec("kill -9 " + pid);
|
||||
// Wait for the actor to be killed.
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
|
||||
// Try calling increase on this actor again and check the value is now 4.
|
||||
int value = Ray.call(CheckpointableCounter::increase, actor).get();
|
||||
Assert.assertEquals(value, 4);
|
||||
// Assert that the actor was resumed from a checkpoint.
|
||||
Assert.assertTrue(Ray.call(CheckpointableCounter::wasResumedFromCheckpoint, actor).get());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user