diff --git a/ci/jenkins_tests/run_sgd_tests.sh b/ci/jenkins_tests/run_sgd_tests.sh index 6a8600782..d0eb03450 100755 --- a/ci/jenkins_tests/run_sgd_tests.sh +++ b/ci/jenkins_tests/run_sgd_tests.sh @@ -64,6 +64,9 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \ python /ray/python/ray/util/sgd/torch/examples/dcgan.py --smoke-test --num-workers=2 +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \ + python /ray/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py --smoke-test + $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \ python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py diff --git a/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py b/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py index fb743898e..acbef0276 100644 --- a/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py +++ b/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py @@ -49,6 +49,8 @@ beta1 = 0.5 # iterations of actual training in each Trainable _train train_iterations_per_step = 5 +MODEL_PATH = os.path.expanduser("~/.ray/models/mnist_cnn.pt") + def get_data_loader(): dataset = dset.MNIST( @@ -305,6 +307,16 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() ray.init() + import urllib.request + # Download a pre-trained MNIST model for inception score calculation. + # This is a tiny model (<100kb). + if not os.path.exists(MODEL_PATH): + print("downloading model") + os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True) + urllib.request.urlretrieve( + "https://github.com/ray-project/ray/raw/master/python/ray/tune/" + "examples/pbt_dcgan_mnist/mnist_cnn.pt", MODEL_PATH) + dataloader = get_data_loader() if not args.smoke_test: # Plot some training images @@ -322,10 +334,7 @@ if __name__ == "__main__": # load the pretrained mnist classification model for inception_score mnist_cnn = Net() - model_path = os.path.join( - os.path.dirname(ray.__file__), - "tune/examples/pbt_dcgan_mnist/mnist_cnn.pt") - mnist_cnn.load_state_dict(torch.load(model_path)) + mnist_cnn.load_state_dict(torch.load(MODEL_PATH)) mnist_cnn.eval() mnist_model_ref = ray.put(mnist_cnn) diff --git a/python/ray/util/sgd/torch/examples/__init__.py b/python/ray/util/sgd/torch/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py index 6155d9e21..b57bd1011 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py +++ b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py @@ -16,6 +16,8 @@ from ray.util.sgd.torch import TrainingOperator parser = argparse.ArgumentParser( description="PyTorch Synthetic Benchmark", formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument( + "--smoke-test", action="store_true", default=False, help="finish quickly.") parser.add_argument( "--fp16", action="store_true", default=False, help="use fp16 training") @@ -49,6 +51,16 @@ parser.add_argument( help="Disables cluster training") args = parser.parse_args() + +if args.smoke_test: + args.model = "resnet18" + args.batch_size = 1 + args.num_iters = 1 + args.num_batches_per_iter = 2 + args.num_warmup_batches = 2 + args.local = True + args.no_cuda = True + args.cuda = not args.no_cuda and torch.cuda.is_available() device = "GPU" if args.cuda else "CPU" @@ -68,7 +80,6 @@ class Training(TrainingOperator): self.data, self.target = data, target def train_epoch(self, *pargs, **kwargs): - # print(self.model) def benchmark(): self.optimizer.zero_grad() output = self.model(self.data) @@ -76,11 +87,11 @@ class Training(TrainingOperator): loss.backward() self.optimizer.step() - # print("Running warmup...") + print("Running warmup...") if self.global_step == 0: timeit.timeit(benchmark, number=args.num_warmup_batches) self.global_step += 1 - # print("Running benchmark...") + print("Running benchmark...") time = timeit.timeit(benchmark, number=args.num_batches_per_iter) img_sec = args.batch_size * args.num_batches_per_iter / time return {"img_sec": img_sec} @@ -99,7 +110,7 @@ if __name__ == "__main__": model_creator=lambda cfg: getattr(models, args.model)(), optimizer_creator=lambda model, cfg: optim.SGD( model.parameters(), lr=0.01 * cfg.get("lr_scaler")), - data_creator=lambda cfg: LinearDataset(4, 2), + data_creator=lambda cfg: LinearDataset(4, 2), # Mock dataset. initialization_hook=init_hook, config=dict( lr_scaler=num_workers), diff --git a/python/ray/util/sgd/torch/examples/dcgan.py b/python/ray/util/sgd/torch/examples/dcgan.py index 9eec9348e..4669d266b 100644 --- a/python/ray/util/sgd/torch/examples/dcgan.py +++ b/python/ray/util/sgd/torch/examples/dcgan.py @@ -21,6 +21,8 @@ from ray.util.sgd import TorchTrainer from ray.util.sgd.utils import override from ray.util.sgd.torch import TrainingOperator +MODEL_PATH = os.path.expanduser("~/.ray/models/mnist_cnn.pt") + def data_creator(config): dataset = datasets.MNIST( @@ -227,9 +229,7 @@ def train_example(num_workers=1, use_gpu=False, test_mode=False): config = { "test_mode": test_mode, "batch_size": 16 if test_mode else 512 // num_workers, - "classification_model_path": os.path.join( - os.path.dirname(ray.__file__), - "util/sgd/torch/examples/mnist_cnn.pt") + "classification_model_path": MODEL_PATH } trainer = TorchTrainer( model_creator=model_creator, @@ -256,6 +256,16 @@ def train_example(num_workers=1, use_gpu=False, test_mode=False): if __name__ == "__main__": + import urllib.request + # Download a pre-trained MNIST model for inception score calculation. + # This is a tiny model (<100kb). + if not os.path.exists(MODEL_PATH): + print("downloading model") + os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True) + urllib.request.urlretrieve( + "https://github.com/ray-project/ray/raw/master/python/ray/tune/" + "examples/pbt_dcgan_mnist/mnist_cnn.pt", MODEL_PATH) + parser = argparse.ArgumentParser() parser.add_argument( "--smoke-test", action="store_true", help="Finish quickly for testing") diff --git a/python/ray/util/sgd/torch/examples/mnist_cnn.pt b/python/ray/util/sgd/torch/examples/mnist_cnn.pt deleted file mode 100644 index 1c4364e16..000000000 Binary files a/python/ray/util/sgd/torch/examples/mnist_cnn.pt and /dev/null differ