From f95ab4f50663a61dfd6e9ff7664275ac1b7a784f Mon Sep 17 00:00:00 2001
From: Amog Kamsetty <amogkam@users.noreply.github.com>
Date: Mon, 22 Jun 2020 14:49:16 -0700
Subject: [PATCH] [Testing] Multi-node Training+Tune Long Running Test (#8966)

---
 ci/jenkins_tests/run_tune_tests.sh            |   4 +
 ci/long_running_distributed_tests/README.rst  |  45 +++++
 .../ray-project/cluster.yaml                  |  86 ++++++++++
 .../ray-project/project.yaml                  |  34 ++++
 .../ray-project/requirements.txt              |   1 +
 .../workloads/pytorch_pbt_failure.py          | 155 ++++++++++++++++++
 ci/long_running_tests/README.rst              |  40 ++---
 .../ray-project/project.yaml                  |   2 +-
 doc/dev/RELEASE_PROCESS.rst                   |  16 +-
 9 files changed, 359 insertions(+), 24 deletions(-)
 create mode 100644 ci/long_running_distributed_tests/README.rst
 create mode 100644 ci/long_running_distributed_tests/ray-project/cluster.yaml
 create mode 100644 ci/long_running_distributed_tests/ray-project/project.yaml
 create mode 100644 ci/long_running_distributed_tests/ray-project/requirements.txt
 create mode 100644 ci/long_running_distributed_tests/workloads/pytorch_pbt_failure.py

diff --git a/ci/jenkins_tests/run_tune_tests.sh b/ci/jenkins_tests/run_tune_tests.sh
index 41602008f..00d792f87 100755
--- a/ci/jenkins_tests/run_tune_tests.sh
+++ b/ci/jenkins_tests/run_tune_tests.sh
@@ -153,6 +153,10 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
     python /ray/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py \
     --smoke-test
 
+$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
+    python /ray/ci/long_running_distributed_tests/workloads/pytorch_pbt_failure.py \
+    --smoke-test
+
 # uncomment once statsmodels is updated.
 $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
     python /ray/python/ray/tune/examples/bohb_example.py
diff --git a/ci/long_running_distributed_tests/README.rst b/ci/long_running_distributed_tests/README.rst
new file mode 100644
index 000000000..24479f365
--- /dev/null
+++ b/ci/long_running_distributed_tests/README.rst
@@ -0,0 +1,45 @@
+Long Running Distributed Tests
+==============================
+
+This directory contains the long-running multi-node workloads which are intended to run
+forever until they fail. To set up the project you need to run
+
+.. code-block:: bash
+
+    $ pip install anyscale
+    $ anyscale init
+
+Running the Workloads
+---------------------
+Easiest approach is to use the `Anyscale UI <https://www.anyscale.dev/>`_. First run ``anyscale snapshot create`` from the command line to create a project snapshot. Then from the UI, you can launch an individual session and execute the test_workload command for each test. 
+
+You can also start the workloads using the CLI with:
+
+.. code-block:: bash
+
+    $ anyscale start --ray-wheel=<RAY_WHEEL_LINK>
+    $ anyscale run test_workload --workload=<WORKLOAD_NAME>
+
+
+Doing this for each workload will start one EC2 instance per workload and will start the workloads
+running (one per instance). A list of
+available workload options is available in the `ray_projects/project.yaml` file.
+
+
+Debugging
+---------
+The primary method to debug the test while it is running is to view the logs and the dashboard from the UI. After the test has failed, you can still view the stdout logs in the UI and also inspect
+the logs under ``/tmp/ray/session*/logs/`` and
+``/tmp/ray/session*/debug_state.txt``.
+
+Shut Down the Workloads
+-----------------------
+
+The instances running the workloads can all be killed by running
+``anyscale stop <SESSION_NAME>``.
+
+Adding a Workload
+-----------------
+
+To create a new workload, simply add a new Python file under ``workloads/`` and
+add the workload in the run command in `ray-project/project.yaml`.
diff --git a/ci/long_running_distributed_tests/ray-project/cluster.yaml b/ci/long_running_distributed_tests/ray-project/cluster.yaml
new file mode 100644
index 000000000..ea66d51ad
--- /dev/null
+++ b/ci/long_running_distributed_tests/ray-project/cluster.yaml
@@ -0,0 +1,86 @@
+# This file is generated by `ray project create`.
+
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: long-running-distributed-tests
+
+# The minimum number of workers nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 3
+# The maximum number of workers nodes to launch in addition to the head
+# node. This takes precedence over min_workers. min_workers defaults to 0.
+max_workers: 3
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    availability_zone: us-west-2a
+    cache_stopped_nodes: False
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+    InstanceType: g3.8xlarge
+    ImageId: ami-0888a3b5189309429  # DLAMI 7/1/19
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 150
+
+worker_nodes:
+  InstanceType: g3.8xlarge
+  ImageId: ami-0888a3b5189309429  # DLAMI 7/1/19
+  BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 150
+  InstanceMarketOptions:
+    MarketType: spot
+
+setup_commands:
+  # Install ray.
+  - pip install -U pip
+  - ray || pip install -U {{ray-wheel}}
+  # Installing this without -U to make sure we don't replace the existing Ray installation
+  - pip install ray[rllib]
+  - pip install -U ipdb
+  # There have been some recent problems with torch 1.5 and torchvision 0.6
+  # not recognizing GPUs.
+  # So, we force install torch 1.4 and torchvision 0.5. 
+  # https://github.com/pytorch/pytorch/issues/37212#issuecomment-623198624.
+  - pip install torch==1.4.0 torchvision==0.5.0
+  - echo set-window-option -g mouse on > ~/.tmux.conf
+  - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - export RAY_BACKEND_LOG_LEVEL=debug
+    - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - export RAY_BACKEND_LOG_LEVEL=debug
+    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
diff --git a/ci/long_running_distributed_tests/ray-project/project.yaml b/ci/long_running_distributed_tests/ray-project/project.yaml
new file mode 100644
index 000000000..e70a4d2ae
--- /dev/null
+++ b/ci/long_running_distributed_tests/ray-project/project.yaml
@@ -0,0 +1,34 @@
+# This file is generated by `ray project create`.
+
+name: long-running-distributed-tests
+
+cluster:
+  config: ray-project/cluster.yaml
+  params:
+    - name: ray-wheel
+      help: "URL to the ray wheel to test (defaults to latest)"
+      default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
+
+commands:
+  - name: test_workload
+    help: "Start a long running distributed test."
+    command: |
+      python workloads/{{workload}}.py
+    params:
+      - name: workload
+        help: "Name of workload to run."
+        choices:
+          [
+            "pytorch_pbt_failure"
+          ]
+  
+# Pathnames for files and directories that should be saved
+# in a snapshot but that should not be synced with a# session. Pathnames can be relative to the project
+# directory or absolute. Generally, this should be files
+# that were created by an active session, such as
+# application checkpoints and logs.
+output_files: [
+  # For example, uncomment this to save the logs from the
+  # last ray job.
+  # "/tmp/ray/session_latest",
+]
diff --git a/ci/long_running_distributed_tests/ray-project/requirements.txt b/ci/long_running_distributed_tests/ray-project/requirements.txt
new file mode 100644
index 000000000..0f026d879
--- /dev/null
+++ b/ci/long_running_distributed_tests/ray-project/requirements.txt
@@ -0,0 +1 @@
+ray[debug]
\ No newline at end of file
diff --git a/ci/long_running_distributed_tests/workloads/pytorch_pbt_failure.py b/ci/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
new file mode 100644
index 000000000..94eafa2de
--- /dev/null
+++ b/ci/long_running_distributed_tests/workloads/pytorch_pbt_failure.py
@@ -0,0 +1,155 @@
+import argparse
+import numpy as np
+import os
+import random
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, Subset
+from torchvision.datasets import CIFAR10
+import torchvision.transforms as transforms
+
+import ray
+from ray import tune
+from ray.autoscaler.commands import kill_node
+from ray.tune import CLIReporter
+from ray.tune.ray_trial_executor import RayTrialExecutor
+from ray.tune.schedulers import PopulationBasedTraining
+from ray.tune.utils.util import merge_dicts
+from ray.util.sgd.torch import TorchTrainer
+from ray.util.sgd.torch.resnet import ResNet18
+from ray.util.sgd.utils import BATCH_SIZE
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--smoke-test",
+    action="store_true",
+    default=False,
+    help="Finish quickly for training.")
+args = parser.parse_args()
+
+
+class FailureInjectorExecutor(RayTrialExecutor):
+    """Adds random failure injection to the TrialExecutor."""
+
+    def on_step_begin(self, trial_runner):
+        """Before step(), update available resources and inject failure."""
+        self._update_avail_resources()
+        # With 10% probability inject failure to a worker.
+        if random.random() < 0.1 and not args.smoke_test:
+            # With 10% probability fully terminate the node.
+            should_terminate = random.random() < 0.1
+            kill_node(
+                "/home/ubuntu/ray_bootstrap_config.yaml",
+                yes=True,
+                hard=should_terminate,
+                override_cluster_name=None)
+
+
+def initialization_hook():
+    # Need this for avoiding a connection restart issue on AWS.
+    os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
+    os.environ["NCCL_LL_THRESHOLD"] = "0"
+
+    # set the below if needed
+    # print("NCCL DEBUG SET")
+    # os.environ["NCCL_DEBUG"] = "INFO"
+
+
+def cifar_creator(config):
+    transform_train = transforms.Compose([
+        transforms.RandomCrop(32, padding=4),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465),
+                             (0.2023, 0.1994, 0.2010)),
+    ])  # meanstd transformation
+
+    transform_test = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.4914, 0.4822, 0.4465),
+                             (0.2023, 0.1994, 0.2010)),
+    ])
+    train_dataset = CIFAR10(
+        root="~/data", train=True, download=True, transform=transform_train)
+    validation_dataset = CIFAR10(
+        root="~/data", train=False, download=False, transform=transform_test)
+
+    if config.get("test_mode"):
+        train_dataset = Subset(train_dataset, list(range(64)))
+        validation_dataset = Subset(validation_dataset, list(range(64)))
+
+    train_loader = DataLoader(
+        train_dataset, batch_size=config[BATCH_SIZE], num_workers=2)
+    validation_loader = DataLoader(
+        validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2)
+    return train_loader, validation_loader
+
+
+def optimizer_creator(model, config):
+    """Returns optimizer"""
+    return torch.optim.SGD(
+        model.parameters(),
+        lr=config.get("lr", 0.1),
+        momentum=config.get("momentum", 0.9))
+
+
+ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
+num_training_workers = 1 if args.smoke_test else 3
+
+executor = FailureInjectorExecutor(queue_trials=True)
+
+TorchTrainable = TorchTrainer.as_trainable(
+    model_creator=ResNet18,
+    data_creator=cifar_creator,
+    optimizer_creator=optimizer_creator,
+    loss_creator=nn.CrossEntropyLoss,
+    initialization_hook=initialization_hook,
+    num_workers=num_training_workers,
+    config={
+        "test_mode": args.smoke_test,
+        BATCH_SIZE: 128 * num_training_workers,
+    },
+    use_gpu=not args.smoke_test)
+
+
+class NoFaultToleranceTrainable(TorchTrainable):
+    def _train(self):
+        train_stats = self.trainer.train(max_retries=0, profile=True)
+        validation_stats = self.trainer.validate(profile=True)
+        stats = merge_dicts(train_stats, validation_stats)
+        return stats
+
+
+pbt_scheduler = PopulationBasedTraining(
+    time_attr="training_iteration",
+    metric="val_loss",
+    mode="min",
+    perturbation_interval=1,
+    hyperparam_mutations={
+        # distribution for resampling
+        "lr": lambda: np.random.uniform(0.001, 1),
+        # allow perturbations within this set of categorical values
+        "momentum": [0.8, 0.9, 0.99],
+    })
+
+reporter = CLIReporter()
+reporter.add_metric_column("val_loss", "loss")
+reporter.add_metric_column("val_accuracy", "acc")
+
+analysis = tune.run(
+    NoFaultToleranceTrainable,
+    num_samples=4,
+    config={
+        "lr": tune.choice([0.001, 0.01, 0.1]),
+        "momentum": 0.8,
+        "head_location": None,
+        "worker_locations": None
+    },
+    max_failures=-1,  # used for fault tolerance
+    checkpoint_freq=2,  # used for fault tolerance
+    progress_reporter=reporter,
+    scheduler=pbt_scheduler,
+    trial_executor=executor,
+    stop={"training_iteration": 1} if args.smoke_test else None)
+
+print(analysis.get_best_config(metric="val_loss", mode="min"))
diff --git a/ci/long_running_tests/README.rst b/ci/long_running_tests/README.rst
index 895878334..e255fcf50 100644
--- a/ci/long_running_tests/README.rst
+++ b/ci/long_running_tests/README.rst
@@ -6,45 +6,45 @@ forever until they fail. To set up the project you need to run
 
 .. code-block:: bash
 
-    pip install anyscale
-    anyscale project create
+    $ pip install anyscale
+    $ anyscale init
 
 Note that all the long running test is running inside virtual environment, tensorflow_p36
 
 Running the Workloads
 ---------------------
+Easiest approach is to use the `Anyscale UI <https://www.anyscale.dev/>`. First run ``anyscale snapshot create`` from the command line to create a project snapshot. Then from the UI, you can launch an individual session and execute the run command for each test. 
 
-You can start all the workloads with:
+You can also start the workloads using the CLI with:
 
 .. code-block:: bash
 
-    anyscale session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl
-
-This will start one EC2 instance per workload and will start the workloads
-running (one per instance). You can start a specific workload by specifying
-its name as an argument ``--workload=`` instead of ``"*"``. A list of
-available options is available via `any session start run --help`.
+    $ anyscale start
+    $ anyscale run test_workload --workload=<WORKLOAD_NAME> --wheel=<RAY_WHEEL_LINK>
 
 
-Check Workload Statuses
------------------------
+Doing this for each workload will start one EC2 instance per workload and will start the workloads
+running (one per instance). A list of
+available workload options is available in the `ray_projects/project.yaml` file.
 
-To check up on the workloads, run either
-``anyscale session --name="*" execute check-load``, which
-will print the load on each machine, or
-``anyscale session --name="*" execute show-output``, which
-will print the tail of the output for each workload.
 
-To debug workloads that have failed, you may find it useful to ssh to the
-relevant machine, attach to the tmux session (usually ``tmux a -t 0``), inspect
-the logs under ``/tmp/ray/session*/logs/``, and also inspect
+Debugging
+---------
+The primary method to debug the test while it is running is to view the logs and the dashboard from the UI. After the test has failed, you can still view the stdout logs in the UI and also inspect
+the logs under ``/tmp/ray/session*/logs/`` and
 ``/tmp/ray/session*/debug_state.txt``.
 
+.. To check up on the workloads, run either
+.. ``anyscale session --name="*" execute check-load``, which
+.. will print the load on each machine, or
+.. ``anyscale session --name="*" execute show-output``, which
+.. will print the tail of the output for each workload.
+
 Shut Down the Workloads
 -----------------------
 
 The instances running the workloads can all be killed by running
-``anyscale session stop --name "*"``.
+``anyscale stop <SESSION_NAME>``.
 
 Adding a Workload
 -----------------
diff --git a/ci/long_running_tests/ray-project/project.yaml b/ci/long_running_tests/ray-project/project.yaml
index 0b41b1be6..160e61a19 100644
--- a/ci/long_running_tests/ray-project/project.yaml
+++ b/ci/long_running_tests/ray-project/project.yaml
@@ -5,7 +5,7 @@ cluster:
   config: ray-project/cluster.yaml
 
 commands:
-  - name: run
+  - name: test_workload
     help: "Start a long running stress test."
     command: |
       # Install nightly Ray wheels.
diff --git a/doc/dev/RELEASE_PROCESS.rst b/doc/dev/RELEASE_PROCESS.rst
index 89ec1a499..98ee46264 100644
--- a/doc/dev/RELEASE_PROCESS.rst
+++ b/doc/dev/RELEASE_PROCESS.rst
@@ -26,7 +26,7 @@ This document describes the process for creating new releases.
 
 4. **Testing:** Before releasing, the following sets of tests should be run.
    The results of each of these tests for previous releases are checked in
-   under ``doc/dev/release_tests``, and should be compared against to identify
+   under ``doc/dev/release_logs``, and should be compared against to identify
    any regressions.
 
    1. Long-running tests
@@ -35,10 +35,20 @@ This document describes the process for creating new releases.
 
        ray/ci/long_running_tests/README.rst
 
-   Follow the instructions to kick off the tests and check the status of the workloads
-   These tests should run for at least 24 hours (printing new iterations and CPU load
+   Follow the instructions to kick off the tests and check the status of the workloads.
+   These tests should run for at least 24 hours without erroring or hanging (ensure that it is printing new iterations and CPU load is
    stable in the AWS console).
 
+   2. Long-running multi-node tests
+
+   .. code-block:: bash
+
+      ray/ci/long_running_distributed_tests/README.rst
+
+   Follow the instructions to kick off the tests and check the status of the workloads.
+   These suite of tests are similar to the standard long running tests, except these actually run in a multi-node cluster instead of just a simulated one.
+   These tests should also run for at least 24 hours without erroring or hanging.
+
    2. Multi-node regression tests
 
    Follow the same instruction as long running stress tests. The large scale distributed