diff --git a/.travis.yml b/.travis.yml index 1bf9f7dda..cde858639 100644 --- a/.travis.yml +++ b/.travis.yml @@ -261,6 +261,8 @@ matrix: - os: linux env: - RLLIB_TESTING=1 RLLIB_QUICK_TRAIN_AND_MISC_TESTS=1 + # TODO (sven): Remove this after fixing rllib tests num_cpus. + - RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - PYTHON=3.6 - TF_VERSION=2.1.0 - TFP_VERSION=0.8 @@ -271,15 +273,17 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=quick_train rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=quick_train --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... # Test everything that does not have any of the "main" labels: # "learning_tests|quick_train|examples|tests_dir". - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... # RLlib: Everything in rllib/examples/ directory. - os: linux env: - RLLIB_TESTING=1 RLLIB_EXAMPLE_DIR_TESTS=1 + # TODO (sven): Remove this after fixing rllib tests num_cpus. + - RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - PYTHON=3.6 - TF_VERSION=2.1.0 - TFP_VERSION=0.8 @@ -290,15 +294,17 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_A,examples_B rllib/... - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_C,examples_D rllib/... - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P rllib/... - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... # RLlib: tests_dir: Everything in rllib/tests/ directory (A-L). - os: linux env: - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_L=1 + # TODO (sven): Remove this after fixing rllib tests num_cpus. + - RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - PYTHON=3.6 - TF_VERSION=2.1.0 - TFP_VERSION=0.8 @@ -309,12 +315,14 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... # RLlib: tests_dir: Everything in rllib/tests/ directory (M-Z). - os: linux env: - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_M_TO_Z=1 + # TODO (sven): Remove this after fixing rllib tests num_cpus. + - RAY_USE_MULTIPROCESSING_CPU_COUNT=1 - PYTHON=3.6 - TF_VERSION=2.1.0 - TFP_VERSION=0.8 @@ -325,7 +333,7 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... # Tune: Tests and examples. diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst index a7b74ced0..b73bf6822 100644 --- a/doc/source/serve/index.rst +++ b/doc/source/serve/index.rst @@ -50,11 +50,13 @@ Serve a function by defining a function, an endpoint, and a backend (in this cas connecting the two by setting traffic from the endpoint to the backend. .. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_function.py + :lines: 2-4,6-7,9- Serve a stateful class by defining a class (``Counter``), creating an endpoint and a backend, then connecting the two by setting traffic from the endpoint to the backend. .. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_class.py + :lines: 2-4,6-7,9- See :doc:`key-concepts` for more exhaustive coverage about Ray Serve and its core concepts. diff --git a/doc/source/tune/_tutorials/tune-serve-integration-mnist.py b/doc/source/tune/_tutorials/tune-serve-integration-mnist.py index 04690129a..2c703cf0d 100644 --- a/doc/source/tune/_tutorials/tune-serve-integration-mnist.py +++ b/doc/source/tune/_tutorials/tune-serve-integration-mnist.py @@ -97,6 +97,7 @@ import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +import ray from ray import tune, serve from ray.serve.exceptions import RayServeException from ray.tune import CLIReporter @@ -613,6 +614,9 @@ if __name__ == "__main__": args = parser.parse_args() + if args.smoke_test: + ray.init(num_cpus=2) + model_dir = os.path.expanduser(args.model_dir) if args.query >= 0: diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py index 92845518f..0bc753c42 100644 --- a/python/ray/resource_spec.py +++ b/python/ray/resource_spec.py @@ -1,7 +1,6 @@ import math from collections import namedtuple import logging -import multiprocessing import os import re import subprocess @@ -105,7 +104,9 @@ class ResourceSpec( # Check types. for resource_label, resource_quantity in resources.items(): assert (isinstance(resource_quantity, int) - or isinstance(resource_quantity, float)) + or isinstance(resource_quantity, float)), ( + f"{resource_label} ({type(resource_quantity)}): " + f"{resource_quantity}") if (isinstance(resource_quantity, float) and not resource_quantity.is_integer()): raise ValueError( @@ -148,7 +149,7 @@ class ResourceSpec( num_cpus = self.num_cpus if num_cpus is None: - num_cpus = multiprocessing.cpu_count() + num_cpus = ray.utils.get_num_cpus() num_gpus = self.num_gpus gpu_ids = ray.utils.get_cuda_visible_devices() diff --git a/python/ray/serve/examples/doc/quickstart_class.py b/python/ray/serve/examples/doc/quickstart_class.py index f8aa26aee..890aed094 100644 --- a/python/ray/serve/examples/doc/quickstart_class.py +++ b/python/ray/serve/examples/doc/quickstart_class.py @@ -1,6 +1,8 @@ +import ray from ray import serve import requests +ray.init(num_cpus=8) client = serve.start() diff --git a/python/ray/serve/examples/doc/quickstart_function.py b/python/ray/serve/examples/doc/quickstart_function.py index 61ea3816a..a0e26d3f2 100644 --- a/python/ray/serve/examples/doc/quickstart_function.py +++ b/python/ray/serve/examples/doc/quickstart_function.py @@ -1,6 +1,8 @@ +import ray from ray import serve import requests +ray.init(num_cpus=8) client = serve.start() diff --git a/python/ray/serve/examples/doc/snippet_model_composition.py b/python/ray/serve/examples/doc/snippet_model_composition.py index 25705a7f8..67a1890bf 100644 --- a/python/ray/serve/examples/doc/snippet_model_composition.py +++ b/python/ray/serve/examples/doc/snippet_model_composition.py @@ -3,7 +3,7 @@ import requests import ray from ray import serve -ray.init(num_cpus=10) +ray.init(num_cpus=8) client = serve.start() # Our pipeline will be structured as follows: diff --git a/python/ray/serve/examples/doc/tutorial_batch.py b/python/ray/serve/examples/doc/tutorial_batch.py index 271e69afa..3f8129a05 100644 --- a/python/ray/serve/examples/doc/tutorial_batch.py +++ b/python/ray/serve/examples/doc/tutorial_batch.py @@ -28,8 +28,8 @@ def batch_adder_v0(flask_requests: List): # __doc_define_servable_v0_end__ +ray.init(num_cpus=8) # __doc_deploy_begin__ -ray.init(num_cpus=10) client = serve.start() client.create_backend("adder:v0", batch_adder_v0, config={"max_batch_size": 4}) client.create_endpoint( diff --git a/python/ray/serve/examples/doc/tutorial_pytorch.py b/python/ray/serve/examples/doc/tutorial_pytorch.py index c7e8593a6..80e3b9d32 100644 --- a/python/ray/serve/examples/doc/tutorial_pytorch.py +++ b/python/ray/serve/examples/doc/tutorial_pytorch.py @@ -1,4 +1,5 @@ # yapf: disable +import ray # __doc_import_begin__ from ray import serve @@ -45,6 +46,7 @@ class ImageModel: # __doc_define_servable_end__ +ray.init(num_cpus=8) # __doc_deploy_begin__ client = serve.start() client.create_backend("resnet18:v0", ImageModel) diff --git a/python/ray/serve/examples/doc/tutorial_sklearn.py b/python/ray/serve/examples/doc/tutorial_sklearn.py index 976f883a2..69f17953b 100644 --- a/python/ray/serve/examples/doc/tutorial_sklearn.py +++ b/python/ray/serve/examples/doc/tutorial_sklearn.py @@ -1,4 +1,5 @@ # yapf: disable +import ray # __doc_import_begin__ from ray import serve @@ -70,6 +71,7 @@ class BoostingModel: # __doc_define_servable_end__ +ray.init(num_cpus=8) # __doc_deploy_begin__ client = serve.start() client.create_backend("lr:v1", BoostingModel) diff --git a/python/ray/serve/examples/doc/tutorial_tensorflow.py b/python/ray/serve/examples/doc/tutorial_tensorflow.py index 71f974697..07fb36381 100644 --- a/python/ray/serve/examples/doc/tutorial_tensorflow.py +++ b/python/ray/serve/examples/doc/tutorial_tensorflow.py @@ -1,4 +1,5 @@ # yapf: disable +import ray # __doc_import_begin__ from ray import serve @@ -68,6 +69,7 @@ class TFMnistModel: # __doc_define_servable_end__ +ray.init(num_cpus=8) # __doc_deploy_begin__ client = serve.start() client.create_backend("tf:v1", TFMnistModel, TRAINED_MODEL_PATH) diff --git a/python/ray/tests/test_memory_scheduling.py b/python/ray/tests/test_memory_scheduling.py index fa642a4b7..8da6026d8 100644 --- a/python/ray/tests/test_memory_scheduling.py +++ b/python/ray/tests/test_memory_scheduling.py @@ -70,6 +70,7 @@ class TestMemoryScheduling(unittest.TestCase): def testTuneDriverHeapLimit(self): try: + ray.init(num_cpus=4, _memory=100 * MB) _register_all() result = tune.run( "PG", @@ -89,6 +90,11 @@ class TestMemoryScheduling(unittest.TestCase): def testTuneDriverStoreLimit(self): try: + ray.init( + num_cpus=4, + _memory=100 * MB, + object_store_memory=100 * MB, + ) _register_all() self.assertRaisesRegexp( ray.tune.error.TuneError, @@ -107,6 +113,7 @@ class TestMemoryScheduling(unittest.TestCase): def testTuneWorkerHeapLimit(self): try: + ray.init(num_cpus=4, _memory=100 * MB) _register_all() result = tune.run( "PG", @@ -127,6 +134,11 @@ class TestMemoryScheduling(unittest.TestCase): def testTuneWorkerStoreLimit(self): try: + ray.init( + num_cpus=4, + _memory=100 * MB, + object_store_memory=100 * MB, + ) _register_all() self.assertRaisesRegexp( ray.tune.error.TuneError, @@ -144,6 +156,7 @@ class TestMemoryScheduling(unittest.TestCase): def testTuneObjectLimitApplied(self): try: + ray.init(num_cpus=2, object_store_memory=500 * MB) result = tune.run( train_oom, resources_per_trial={"object_store_memory": 150 * 1024 * 1024}, diff --git a/python/ray/tune/examples/cifar10_pytorch.py b/python/ray/tune/examples/cifar10_pytorch.py index 1b9b75afb..2e1b1cbe2 100644 --- a/python/ray/tune/examples/cifar10_pytorch.py +++ b/python/ray/tune/examples/cifar10_pytorch.py @@ -12,6 +12,7 @@ import torch.optim as optim from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms +import ray from ray import tune from ray.tune import CLIReporter from ray.tune.schedulers import ASHAScheduler @@ -231,6 +232,7 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() if args.smoke_test: + ray.init(num_cpus=2) main(num_samples=1, max_num_epochs=1, gpus_per_trial=0) else: # Change this to activate training on GPUs diff --git a/python/ray/tune/examples/horovod_simple.py b/python/ray/tune/examples/horovod_simple.py index e566edf0f..f2a141529 100644 --- a/python/ray/tune/examples/horovod_simple.py +++ b/python/ray/tune/examples/horovod_simple.py @@ -97,6 +97,9 @@ if __name__ == "__main__": parser.add_argument("--hosts-per-trial", type=int, default=1) parser.add_argument("--slots-per-host", type=int, default=2) args = parser.parse_args() + if args.smoke_test: + import ray + ray.init(num_cpus=2) # import ray # ray.init(address="auto") # assumes ray is started with ray up diff --git a/python/ray/tune/examples/pbt_convnet_example.py b/python/ray/tune/examples/pbt_convnet_example.py index d996d0edd..f54f036b9 100644 --- a/python/ray/tune/examples/pbt_convnet_example.py +++ b/python/ray/tune/examples/pbt_convnet_example.py @@ -76,7 +76,7 @@ if __name__ == "__main__": "--smoke-test", action="store_true", help="Finish quickly for testing") args, _ = parser.parse_known_args() - ray.init() + ray.init(num_cpus=2) datasets.MNIST("~/data", train=True, download=True) # check if PytorchTrainble will save/restore correctly before execution diff --git a/python/ray/tune/tests/ext_pytorch.py b/python/ray/tune/tests/ext_pytorch.py index cc9759c07..5b9db0193 100644 --- a/python/ray/tune/tests/ext_pytorch.py +++ b/python/ray/tune/tests/ext_pytorch.py @@ -58,6 +58,7 @@ import torch.optim as optim from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms +import ray from ray import tune from ray.tune import CLIReporter from ray.tune.schedulers import ASHAScheduler @@ -434,6 +435,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": # You can change the number of GPUs per trial here: + ray.init(num_cpus=2) # for testing purposes only main(num_samples=2, max_num_epochs=2, gpus_per_trial=0) diff --git a/python/ray/tune/tests/test_dependency.py b/python/ray/tune/tests/test_dependency.py index ce5d7a28b..c2e2f2c7c 100644 --- a/python/ray/tune/tests/test_dependency.py +++ b/python/ray/tune/tests/test_dependency.py @@ -11,7 +11,8 @@ def f(config, reporter): if __name__ == "__main__": - ray.init() + ray.init(num_cpus=2) + register_trainable("my_class", f) run_experiments({ "test": { diff --git a/python/ray/tune/tests/test_ray_trial_executor.py b/python/ray/tune/tests/test_ray_trial_executor.py index fc06b7aeb..e2ee69013 100644 --- a/python/ray/tune/tests/test_ray_trial_executor.py +++ b/python/ray/tune/tests/test_ray_trial_executor.py @@ -16,7 +16,7 @@ from ray.cluster_utils import Cluster class RayTrialExecutorTest(unittest.TestCase): def setUp(self): self.trial_executor = RayTrialExecutor(queue_trials=False) - ray.init(ignore_reinit_error=True) + ray.init(num_cpus=2, ignore_reinit_error=True) _register_all() # Needed for flaky tests def tearDown(self): diff --git a/python/ray/tune/tests/test_sync.py b/python/ray/tune/tests/test_sync.py index 229fb1afc..e51b518bd 100644 --- a/python/ray/tune/tests/test_sync.py +++ b/python/ray/tune/tests/test_sync.py @@ -17,7 +17,7 @@ from ray.tune.syncer import CommandBasedClient class TestSyncFunctionality(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/tests/test_track.py b/python/ray/tune/tests/test_track.py index 7abb8df4e..346892865 100644 --- a/python/ray/tune/tests/test_track.py +++ b/python/ray/tune/tests/test_track.py @@ -20,7 +20,7 @@ class TrackApiTest(unittest.TestCase): def testSoftDeprecation(self): """Checks that tune.track.log code does not break.""" from ray.tune import track - ray.init() + ray.init(num_cpus=2) def testme(config): for i in range(config["iters"]): diff --git a/python/ray/tune/tests/test_trial_runner.py b/python/ray/tune/tests/test_trial_runner.py index 9a50a88d5..c947a1dae 100644 --- a/python/ray/tune/tests/test_trial_runner.py +++ b/python/ray/tune/tests/test_trial_runner.py @@ -23,7 +23,8 @@ class TrialRunnerTest(unittest.TestCase): ray.shutdown() def testTrialStatus(self): - ray.init() + ray.init(num_cpus=2) + trial = Trial("__fake") trial_executor = RayTrialExecutor() self.assertEqual(trial.status, Trial.PENDING) @@ -35,7 +36,7 @@ class TrialRunnerTest(unittest.TestCase): self.assertEqual(trial.status, Trial.ERROR) def testExperimentTagTruncation(self): - ray.init() + ray.init(num_cpus=2) def train(config, reporter): reporter(timesteps_total=1) diff --git a/python/ray/tune/tests/test_trial_runner_3.py b/python/ray/tune/tests/test_trial_runner_3.py index ed07b0504..a3fc80479 100644 --- a/python/ray/tune/tests/test_trial_runner_3.py +++ b/python/ray/tune/tests/test_trial_runner_3.py @@ -540,7 +540,8 @@ class TrialRunnerTest3(unittest.TestCase): runner2.step() def testCheckpointWithFunction(self): - ray.init() + ray.init(num_cpus=2) + trial = Trial( "__fake", config={"callbacks": { @@ -565,7 +566,8 @@ class TrialRunnerTest3(unittest.TestCase): and fname.endswith(".json")) for fname in os.listdir(cdir)) - ray.init() + ray.init(num_cpus=2) + trial = Trial("__fake", checkpoint_freq=1) tmpdir = tempfile.mkdtemp() runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0) diff --git a/python/ray/tune/tests/test_trial_scheduler.py b/python/ray/tune/tests/test_trial_scheduler.py index 507ae81f0..2588a2e64 100644 --- a/python/ray/tune/tests/test_trial_scheduler.py +++ b/python/ray/tune/tests/test_trial_scheduler.py @@ -38,7 +38,7 @@ def mock_trial_runner(trials=None): class EarlyStoppingSuite(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() @@ -782,7 +782,7 @@ class _MockTrial(Trial): class PopulationBasedTestingSuite(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() @@ -1802,7 +1802,7 @@ class E2EPopulationBasedTestingSuite(unittest.TestCase): class AsyncHyperBandSuite(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/tests/test_trial_scheduler_pbt.py b/python/ray/tune/tests/test_trial_scheduler_pbt.py index 5af7cb467..09831fa01 100644 --- a/python/ray/tune/tests/test_trial_scheduler_pbt.py +++ b/python/ray/tune/tests/test_trial_scheduler_pbt.py @@ -104,7 +104,7 @@ class PopulationBasedTrainingSynchTest(unittest.TestCase): class PopulationBasedTrainingConfigTest(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() @@ -145,7 +145,7 @@ class PopulationBasedTrainingConfigTest(unittest.TestCase): class PopulationBasedTrainingResumeTest(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py index 89077da90..d94e1a7ae 100644 --- a/python/ray/tune/tests/test_tune_restore.py +++ b/python/ray/tune/tests/test_tune_restore.py @@ -85,7 +85,7 @@ class TuneRestoreTest(unittest.TestCase): class TuneExampleTest(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/tests/test_var.py b/python/ray/tune/tests/test_var.py index 082a9fe03..bf9e0c4f9 100644 --- a/python/ray/tune/tests/test_var.py +++ b/python/ray/tune/tests/test_var.py @@ -15,7 +15,7 @@ from ray.tune.suggest.variant_generator import (RecursiveDependencyError, class VariantGeneratorTest(unittest.TestCase): def setUp(self): - ray.init() + ray.init(num_cpus=2) def tearDown(self): ray.shutdown() diff --git a/python/ray/util/sgd/BUILD b/python/ray/util/sgd/BUILD index 784016de9..664ac40a3 100644 --- a/python/ray/util/sgd/BUILD +++ b/python/ray/util/sgd/BUILD @@ -67,7 +67,7 @@ py_test( srcs = ["tf/examples/tensorflow_train_example.py"], tags = ["exclusive", "tf"], deps = [":sgd_lib"], - args = ["--num-replicas=1"] + args = ["--num-replicas=1", "--smoke-test"] ) py_test( @@ -77,7 +77,7 @@ py_test( srcs = ["tf/examples/tensorflow_train_example.py"], tags = ["exclusive", "tf"], deps = [":sgd_lib"], - args = ["--num-replicas=2"] + args = ["--num-replicas=2", "--smoke-test"] ) py_test( @@ -87,7 +87,7 @@ py_test( srcs = ["tf/examples/tensorflow_train_example.py"], tags = ["exclusive", "tf"], deps = [":sgd_lib"], - args = ["--tune"] + args = ["--tune", "--smoke-test"] ) # -------------------------------------------------------------------- @@ -148,7 +148,7 @@ py_test( srcs = ["torch/examples/train_example.py"], tags = ["exclusive", "pytorch"], deps = [":sgd_lib"], - args = ["--num-workers=1"] + args = ["--num-workers=1", "--smoke-test"] ) py_test( @@ -158,7 +158,7 @@ py_test( srcs = ["torch/examples/train_example.py"], tags = ["exclusive", "pytorch"], deps = [":sgd_lib"], - args = ["--num-workers=2"] + args = ["--num-workers=2", "--smoke-test"] ) py_test( @@ -168,7 +168,7 @@ py_test( srcs = ["torch/examples/tune_example.py"], tags = ["exclusive", "pytorch"], deps = [":sgd_lib"], - args = ["--num-workers=1"] + args = ["--num-workers=1", "--smoke-test"] ) py_test( @@ -178,7 +178,7 @@ py_test( srcs = ["torch/examples/tune_example.py"], tags = ["exclusive", "pytorch"], deps = [":sgd_lib"], - args = ["--num-workers=2"] + args = ["--num-workers=2", "--smoke-test"] ) diff --git a/python/ray/util/sgd/tf/examples/cifar_tf_example.py b/python/ray/util/sgd/tf/examples/cifar_tf_example.py index 407ed27a5..a348ed882 100644 --- a/python/ray/util/sgd/tf/examples/cifar_tf_example.py +++ b/python/ray/util/sgd/tf/examples/cifar_tf_example.py @@ -177,7 +177,10 @@ if __name__ == "__main__": help="Finish quickly for testing. Assume False for users.") args, _ = parser.parse_known_args() - ray.init(address=args.address) + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) data_size = 60000 test_size = 10000 batch_size = args.batch_size diff --git a/python/ray/util/sgd/tf/examples/tensorflow_train_example.py b/python/ray/util/sgd/tf/examples/tensorflow_train_example.py index 41f4505ea..b872594a0 100644 --- a/python/ray/util/sgd/tf/examples/tensorflow_train_example.py +++ b/python/ray/util/sgd/tf/examples/tensorflow_train_example.py @@ -114,6 +114,8 @@ def tune_example(num_replicas=1, use_gpu=False): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", action="store_true", help="Finish quickly for testing") parser.add_argument( "--address", required=False, @@ -135,7 +137,10 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() - ray.init(address=args.address) + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) if args.tune: tune_example(num_replicas=args.num_replicas, use_gpu=args.use_gpu) diff --git a/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py index 2cae22f7d..d40d14e2f 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py +++ b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py @@ -109,7 +109,10 @@ class Training(TrainingOperator): if __name__ == "__main__": - ray.init(address=None if args.local else "auto") + if args.local: + ray.init(num_cpus=2) + else: + ray.init(address="auto") num_workers = 2 if args.local else int(ray.cluster_resources().get(device)) from ray.util.sgd.torch.examples.train_example import LinearDataset diff --git a/python/ray/util/sgd/torch/examples/dcgan.py b/python/ray/util/sgd/torch/examples/dcgan.py index 90f59458f..9bbf34f41 100644 --- a/python/ray/util/sgd/torch/examples/dcgan.py +++ b/python/ray/util/sgd/torch/examples/dcgan.py @@ -289,7 +289,10 @@ if __name__ == "__main__": default=False, help="Enables GPU training") args = parser.parse_args() - ray.init(address=args.address) + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) trainer = train_example( num_workers=args.num_workers, diff --git a/python/ray/util/sgd/torch/examples/image_models/train.py b/python/ray/util/sgd/torch/examples/image_models/train.py index e9fe05176..48fe4579c 100644 --- a/python/ray/util/sgd/torch/examples/image_models/train.py +++ b/python/ray/util/sgd/torch/examples/image_models/train.py @@ -128,8 +128,10 @@ def main(): setup_default_logging() args, args_text = parse_args() - - ray.init(address=args.ray_address) + if args.smoke_test: + ray.init(num_cpus=int(args.ray_num_workers)) + else: + ray.init(address=args.ray_address) CustomTrainingOperator = TrainingOperator.from_creators( model_creator=model_creator, diff --git a/python/ray/util/sgd/torch/examples/train_example.py b/python/ray/util/sgd/torch/examples/train_example.py index 71e8446ec..d0eaf8d6b 100644 --- a/python/ray/util/sgd/torch/examples/train_example.py +++ b/python/ray/util/sgd/torch/examples/train_example.py @@ -115,10 +115,17 @@ if __name__ == "__main__": help="Enables GPU training") parser.add_argument( "--tune", action="store_true", default=False, help="Tune training") + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.") args, _ = parser.parse_known_args() import ray - - ray.init(address=args.address) + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) train_example(num_workers=args.num_workers, use_gpu=args.use_gpu) diff --git a/python/ray/util/sgd/torch/examples/tune_example.py b/python/ray/util/sgd/torch/examples/tune_example.py index ef6c32893..da04bc455 100644 --- a/python/ray/util/sgd/torch/examples/tune_example.py +++ b/python/ray/util/sgd/torch/examples/tune_example.py @@ -59,6 +59,8 @@ def tune_example(operator_cls, num_workers=1, use_gpu=False): if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", action="store_true", help="Finish quickly for testing") parser.add_argument( "--address", type=str, @@ -77,7 +79,10 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() - ray.init(address=args.address) + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) CustomTrainingOperator = TrainingOperator.from_creators( model_creator=model_creator, optimizer_creator=optimizer_creator, data_creator=data_creator, loss_creator=nn.MSELoss) diff --git a/python/ray/utils.py b/python/ray/utils.py index 3dd4379fd..c06352b1a 100644 --- a/python/ray/utils.py +++ b/python/ray/utils.py @@ -3,6 +3,7 @@ import errno import hashlib import inspect import logging +import multiprocessing import numpy as np import os import signal @@ -498,6 +499,56 @@ def get_system_memory(): return psutil_memory_in_bytes +def _get_docker_cpus(): + # 1. Try using CFS Quota (https://bugs.openjdk.java.net/browse/JDK-8146115) + # 2. Try Nproc (CPU sets) + cpu_quota_file_name = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" + cpu_share_file_name = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" + num_cpus = 0 + if os.path.exists(cpu_quota_file_name) and os.path.exists( + cpu_quota_file_name): + with open(cpu_quota_file_name, "r") as f: + num_cpus = int(f.read()) + if num_cpus != -1: + with open(cpu_share_file_name, "r") as f: + num_cpus /= int(f.read()) + return num_cpus + + return int(subprocess.check_output("nproc")) + + +def get_num_cpus(): + cpu_count = multiprocessing.cpu_count() + if "RAY_USE_MULTIPROCESSING_CPU_COUNT" in os.environ: + logger.info( + "Using multiprocessing.cpu_count() to detect the number of CPUs. " + "This method of CPU detection is buggy when used inside docker. " + "To correctly detect CPUs remove the enivronment variable: " + "`RAY_USE_MULTIPROCESSING_CPU_COUNT`.") + return cpu_count + try: + # Not easy to get cpu count in docker, see: + # https://bugs.python.org/issue36054 + docker_count = _get_docker_cpus() + if docker_count != cpu_count: + cpu_count = docker_count + if "RAY_DISABLE_DOCKER_CPU_WARNING" not in os.environ: + logger.warning( + "Detecting limited number of CPUs due to docker. In " + "previous versions of Ray, CPU detection in containers " + "was buggy. Please ensure that Ray has enough CPUs " + "allocated. This message will be removed in future " + "version of Ray. You can set the environment variable: " + "`RAY_DISABLE_DOCKER_CPU_WARNING` to remove it now.") + + except Exception: + # `nproc` and cgroup are linux-only. If docker only works on linux + # (will run in a linux VM on other platforms), so this is fine. + pass + + return cpu_count + + def get_used_memory(): """Return the currently used system memory in bytes