Implement actor checkpointing (#3839)

* Implement Actor checkpointing

* docs

* fix

* fix

* fix

* move restore-from-checkpoint to HandleActorStateTransition

* Revert "move restore-from-checkpoint to HandleActorStateTransition"

This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12.

* resubmit waiting tasks when actor frontier restored

* add doc about num_actor_checkpoints_to_keep=1

* add num_actor_checkpoints_to_keep to Cython

* add checkpoint_expired api

* check if actor class is abstract

* change checkpoint_ids to long string

* implement java

* Refactor to delay actor creation publish until checkpoint is resumed

* debug, lint

* Erase from checkpoints to restore if task fails

* fix lint

* update comments

* avoid duplicated actor notification log

* fix unintended change

* add actor_id to checkpoint_expired

* small java updates

* make checkpoint info per actor

* lint

* Remove logging

* Remove old actor checkpointing Python code, move new checkpointing code to FunctionActorManager

* Replace old actor checkpointing tests

* Fix test and lint

* address comments

* consolidate kill_actor

* Remove __ray_checkpoint__

* fix non-ascii char

* Loosen test checks

* fix java

* fix sphinx-build
This commit is contained in:
Hao Chen
2019-02-13 19:39:02 +08:00
committed by GitHub
parent 57dcd3033e
commit f31a79f3f7
41 changed files with 1708 additions and 490 deletions
@@ -4,10 +4,13 @@ import static org.ray.runtime.util.SystemUtil.pid;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.ray.api.Checkpointable;
import org.ray.api.Ray;
import org.ray.api.RayActor;
import org.ray.api.annotation.RayRemote;
import org.ray.api.id.UniqueId;
import org.ray.api.options.ActorCreationOptions;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -17,10 +20,10 @@ public class ActorReconstructionTest extends BaseTest {
@RayRemote()
public static class Counter {
private int value = 0;
protected int value = 0;
public int increase(int delta) {
value += delta;
public int increase() {
value += 1;
return value;
}
@@ -35,7 +38,7 @@ public class ActorReconstructionTest extends BaseTest {
RayActor<Counter> actor = Ray.createActor(Counter::new, options);
// Call increase 3 times.
for (int i = 0; i < 3; i++) {
Ray.call(Counter::increase, actor, 1).get();
Ray.call(Counter::increase, actor).get();
}
// Kill the actor process.
@@ -45,7 +48,7 @@ public class ActorReconstructionTest extends BaseTest {
TimeUnit.SECONDS.sleep(1);
// Try calling increase on this actor again and check the value is now 4.
int value = Ray.call(Counter::increase, actor, 1).get();
int value = Ray.call(Counter::increase, actor).get();
Assert.assertEquals(value, 4);
// Kill the actor process again.
@@ -55,7 +58,7 @@ public class ActorReconstructionTest extends BaseTest {
// Try calling increase on this actor again and this should fail.
try {
Ray.call(Counter::increase, actor, 1).get();
Ray.call(Counter::increase, actor).get();
Assert.fail("The above task didn't fail.");
} catch (StringIndexOutOfBoundsException e) {
// Raylet backend will put invalid data in task's result to indicate the task has failed.
@@ -64,4 +67,71 @@ public class ActorReconstructionTest extends BaseTest {
// instead of throwing this exception.
}
}
/**
 * A {@link Counter} that also implements {@link Checkpointable}, used by the checkpointing test
 * to observe whether the actor's state was restored through {@code loadCheckpoint}.
 */
public static class CheckpointableCounter extends Counter implements Checkpointable {

  /** Set to true once {@code loadCheckpoint} has run on this instance. */
  private boolean resumedFromCheckpoint = false;

  /** Tracks whether {@code increase} was invoked since the last {@code shouldCheckpoint}. */
  private boolean increaseCalled = false;

  /** Records that an increase happened, then delegates to the parent implementation. */
  @Override
  public int increase() {
    increaseCalled = true;
    return super.increase();
  }

  /** Returns whether this instance went through {@code loadCheckpoint}. */
  public boolean wasResumedFromCheckpoint() {
    return resumedFromCheckpoint;
  }

  /**
   * Requests a checkpoint only when the counter equals 3 and {@code increase} has been called
   * since the previous invocation of this method. The "increase called" flag is always reset.
   */
  @Override
  public boolean shouldCheckpoint(CheckpointContext checkpointContext) {
    final boolean increasedSinceLastCheck = increaseCalled;
    final boolean reachedThree = value == 3;
    increaseCalled = false;
    return increasedSinceLastCheck && reachedThree;
  }

  /** No-op for this test. */
  @Override
  public void saveCheckpoint(UniqueId actorId, UniqueId checkpointId) {
    // A real application would persist the checkpoint id together with the actor's data in
    // durable storage. This unit test skips that to stay simple.
  }

  /**
   * Puts the counter back to 3, marks the instance as resumed, and returns the id of the last
   * entry in {@code availableCheckpoints}.
   */
  @Override
  public UniqueId loadCheckpoint(UniqueId actorId, List<Checkpoint> availableCheckpoints) {
    // State is restored first; the list is only consulted afterwards to pick the id to return.
    value = 3;
    resumedFromCheckpoint = true;
    final int lastIndex = availableCheckpoints.size() - 1;
    final Checkpoint selected = availableCheckpoints.get(lastIndex);
    return selected.checkpointId;
  }

  /** No-op for this test. */
  @Override
  public void checkpointExpired(UniqueId actorId, UniqueId checkpointId) {
  }
}
/**
 * Drives a {@code CheckpointableCounter} actor to the value 3, kills its process, and then
 * verifies that the next call yields 4 and that the actor reports having been resumed from a
 * checkpoint.
 */
@Test
public void testActorCheckpointing() throws IOException, InterruptedException {
  ActorCreationOptions creationOptions = new ActorCreationOptions(new HashMap<>(), 1);
  RayActor<CheckpointableCounter> counter =
      Ray.createActor(CheckpointableCounter::new, creationOptions);

  // Three increases bring the counter to the value at which the actor asks for a checkpoint.
  for (int remaining = 3; remaining > 0; remaining--) {
    Ray.call(CheckpointableCounter::increase, counter).get();
  }

  // Before the kill, the actor must not claim it was resumed from a checkpoint.
  boolean resumedBeforeKill =
      Ray.call(CheckpointableCounter::wasResumedFromCheckpoint, counter).get();
  Assert.assertFalse(resumedBeforeKill);

  // Terminate the actor's process.
  int actorPid = Ray.call(CheckpointableCounter::getPid, counter).get();
  Runtime.getRuntime().exec("kill -9 " + actorPid);
  // Give the process a moment to actually die.
  TimeUnit.SECONDS.sleep(1);

  // The next increase should produce 4.
  int valueAfterKill = Ray.call(CheckpointableCounter::increase, counter).get();
  Assert.assertEquals(valueAfterKill, 4);

  // After the kill, the actor must report that it was resumed from a checkpoint.
  boolean resumedAfterKill =
      Ray.call(CheckpointableCounter::wasResumedFromCheckpoint, counter).get();
  Assert.assertTrue(resumedAfterKill);
}
}