From 1ab98155ebf66e4d393b9002e3c720c972ddd64d Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 1 Apr 2020 10:05:07 -0700 Subject: [PATCH] [Release] Revert Enable GCS Server by Default (#7840) --- .travis.yml | 8 ++--- .../org/ray/runtime/config/RayConfig.java | 5 ---- .../org/ray/runtime/runner/RunManager.java | 2 +- python/ray/node.py | 2 +- python/ray/ray_constants.py | 8 +---- python/ray/tests/test_component_failures_3.py | 4 +-- python/ray/tests/test_multinode_failures_2.py | 4 +-- python/ray/tests/test_tempfile.py | 2 +- src/ray/common/ray_config_def.h | 9 ------ src/ray/core_worker/core_worker.cc | 2 +- src/ray/core_worker/test/core_worker_test.cc | 2 +- src/ray/gcs/gcs_server/gcs_server.cc | 29 +++++++------------ src/ray/raylet/main.cc | 3 +- 13 files changed, 26 insertions(+), 54 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7cf3d7ea3..27e1bb2f8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,11 +49,11 @@ matrix: - os: linux env: - - TESTSUITE=gcs_service_disabled + - TESTSUITE=gcs_service - JDK='Oracle JDK 8' + - RAY_GCS_SERVICE_ENABLED=true - PYTHON=3.6 PYTHONWARNINGS=ignore - RAY_INSTALL_JAVA=1 - - RAY_GCS_SERVICE_ENABLED=false install: - eval `python $TRAVIS_BUILD_DIR/ci/travis/determine_tests_to_run.py` - ./ci/travis/install-bazel.sh @@ -69,11 +69,11 @@ matrix: - os: linux env: - - TESTSUITE=gcs_service_disabled_python_testcase + - TESTSUITE=gcs_service_python_testcase - JDK='Oracle JDK 8' + - RAY_GCS_SERVICE_ENABLED=true - PYTHON=3.6 PYTHONWARNINGS=ignore - RAY_INSTALL_JAVA=1 - - RAY_GCS_SERVICE_ENABLED=false install: - eval `python $TRAVIS_BUILD_DIR/ci/travis/determine_tests_to_run.py` - ./ci/travis/install-bazel.sh diff --git a/java/runtime/src/main/java/org/ray/runtime/config/RayConfig.java b/java/runtime/src/main/java/org/ray/runtime/config/RayConfig.java index 8671ed73d..1ba6246c7 100644 --- a/java/runtime/src/main/java/org/ray/runtime/config/RayConfig.java +++ b/java/runtime/src/main/java/org/ray/runtime/config/RayConfig.java @@ -72,8 +72,6 @@ public class RayConfig { public final String jobResourcePath; public final String pythonWorkerCommand; - public final boolean gcsServiceEnabled; - private static volatile RayConfig instance = null; public static RayConfig getInstance() { @@ -225,9 +223,6 @@ public class RayConfig { numWorkersPerProcess = config.getInt("ray.raylet.config.num_workers_per_process_java"); - gcsServiceEnabled = System.getenv("RAY_GCS_SERVICE_ENABLED") == null || - System.getenv("RAY_GCS_SERVICE_ENABLED").toLowerCase().equals("true"); - // Validate config. validate(); LOGGER.debug("Created config: {}", this); diff --git a/java/runtime/src/main/java/org/ray/runtime/runner/RunManager.java b/java/runtime/src/main/java/org/ray/runtime/runner/RunManager.java index 61c970adc..13851f119 100644 --- a/java/runtime/src/main/java/org/ray/runtime/runner/RunManager.java +++ b/java/runtime/src/main/java/org/ray/runtime/runner/RunManager.java @@ -226,7 +226,7 @@ public class RunManager { } // start gcs server - if (rayConfig.gcsServiceEnabled) { + if (System.getenv("RAY_GCS_SERVICE_ENABLED") != null) { String redisPasswordOption = ""; if (!Strings.isNullOrEmpty(rayConfig.headRedisPassword)) { redisPasswordOption = rayConfig.headRedisPassword; diff --git a/python/ray/node.py b/python/ray/node.py index 10a129f9f..870c6203c 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -626,7 +626,7 @@ class Node: # If this is the head node, start the relevant head node processes. self.start_redis() - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): self.start_gcs_server() else: self.start_raylet_monitor() diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 953a568fd..d751d1baf 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -13,12 +13,6 @@ def env_integer(key, default): return default -def env_bool(key, default): - if key in os.environ: - return True if os.environ[key].lower() == "true" else False - return default - - ID_SIZE = 20 # The default maximum number of bytes to allocate to the object store unless @@ -203,4 +197,4 @@ MACH_PAGE_SIZE_BYTES = 4096 # RAY_GCS_SERVICE_ENABLED only set in ci job. # TODO(ffbin): Once we entirely migrate to service-based GCS, we should # remove it. -GCS_SERVICE_ENABLED = env_bool("RAY_GCS_SERVICE_ENABLED", True) +RAY_GCS_SERVICE_ENABLED = "RAY_GCS_SERVICE_ENABLED" diff --git a/python/ray/tests/test_component_failures_3.py b/python/ray/tests/test_component_failures_3.py index 5cc47e7f8..8f082312b 100644 --- a/python/ray/tests/test_component_failures_3.py +++ b/python/ray/tests/test_component_failures_3.py @@ -82,7 +82,7 @@ def test_driver_lives_sequential(ray_start_regular): ray.worker._global_node.kill_plasma_store() ray.worker._global_node.kill_log_monitor() ray.worker._global_node.kill_monitor() - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): ray.worker._global_node.kill_gcs_server() else: ray.worker._global_node.kill_raylet_monitor() @@ -96,7 +96,7 @@ def test_driver_lives_sequential(ray_start_regular): def test_driver_lives_parallel(ray_start_regular): all_processes = ray.worker._global_node.all_processes - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] + all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] + all_processes[ray_constants.PROCESS_TYPE_RAYLET] + diff --git a/python/ray/tests/test_multinode_failures_2.py b/python/ray/tests/test_multinode_failures_2.py index adf0e1416..b29b77847 100644 --- a/python/ray/tests/test_multinode_failures_2.py +++ b/python/ray/tests/test_multinode_failures_2.py @@ -132,7 +132,7 @@ def test_driver_lives_sequential(ray_start_regular): ray.worker._global_node.kill_plasma_store() ray.worker._global_node.kill_log_monitor() ray.worker._global_node.kill_monitor() - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): ray.worker._global_node.kill_gcs_server() else: ray.worker._global_node.kill_raylet_monitor() @@ -145,7 +145,7 @@ def test_driver_lives_sequential(ray_start_regular): reason="Hanging with new GCS API.") def test_driver_lives_parallel(ray_start_regular): all_processes = ray.worker._global_node.all_processes - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] + all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] + all_processes[ray_constants.PROCESS_TYPE_RAYLET] + diff --git a/python/ray/tests/test_tempfile.py b/python/ray/tests/test_tempfile.py index 0e174fd23..473d940a3 100644 --- a/python/ray/tests/test_tempfile.py +++ b/python/ray/tests/test_tempfile.py @@ -145,7 +145,7 @@ def test_raylet_tempfiles(shutdown_only): "raylet.err" } - if ray_constants.GCS_SERVICE_ENABLED: + if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None): log_files_expected.update({"gcs_server.out", "gcs_server.err"}) else: log_files_expected.update({"raylet_monitor.out", "raylet_monitor.err"}) diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 1e7982a60..fd035e4ec 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -273,12 +273,3 @@ RAY_CONFIG(uint32_t, object_store_full_initial_delay_ms, 1000) /// Duration to wait between retries for failed tasks. RAY_CONFIG(uint32_t, task_retry_delay_ms, 5000) - -/// Whether to enable gcs service. -/// RAY_GCS_SERVICE_ENABLED is an env variable which only set in ci job. -/// If the value of RAY_GCS_SERVICE_ENABLED is false, we will disable gcs service, -/// otherwise gcs service is enabled. -/// TODO(ffbin): Once we entirely migrate to service-based GCS, we should remove it. -RAY_CONFIG(bool, gcs_service_enabled, - getenv("RAY_GCS_SERVICE_ENABLED") == nullptr || - getenv("RAY_GCS_SERVICE_ENABLED") == std::string("true")) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index de13080ea..e693d8637 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -115,7 +115,7 @@ CoreWorker::CoreWorker(const WorkerType worker_type, const Language language, RayLog::InstallFailureSignalHandler(); } // Initialize gcs client. - if (RayConfig::instance().gcs_service_enabled()) { + if (getenv("RAY_GCS_SERVICE_ENABLED") != nullptr) { gcs_client_ = std::make_shared(gcs_options); } else { gcs_client_ = std::make_shared(gcs_options); diff --git a/src/ray/core_worker/test/core_worker_test.cc b/src/ray/core_worker/test/core_worker_test.cc index b5f79ef54..dce131c49 100644 --- a/src/ray/core_worker/test/core_worker_test.cc +++ b/src/ray/core_worker/test/core_worker_test.cc @@ -109,7 +109,7 @@ class CoreWorkerTest : public ::testing::Test { } // start gcs server - if (RayConfig::instance().gcs_service_enabled()) { + if (getenv("RAY_GCS_SERVICE_ENABLED") != nullptr) { gcs_server_pid_ = StartGcsServer("127.0.0.1"); } else { // core worker test relies on node resources. It's important that one raylet can diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server/gcs_server.cc index c455024f7..499bba6d2 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server/gcs_server.cc @@ -145,27 +145,18 @@ std::unique_ptr GcsServer::InitObjectInfoHandler() { void GcsServer::StoreGcsServerAddressInRedis() { boost::asio::ip::detail::endpoint primary_endpoint; boost::asio::ip::tcp::resolver resolver(main_service_); - boost::asio::ip::tcp::resolver::query query( - boost::asio::ip::host_name(), "", - boost::asio::ip::resolver_query_base::flags::v4_mapped); - boost::system::error_code error_code; - boost::asio::ip::tcp::resolver::iterator iter = resolver.resolve(query, error_code); + boost::asio::ip::tcp::resolver::query query(boost::asio::ip::host_name(), ""); + boost::asio::ip::tcp::resolver::iterator iter = resolver.resolve(query); boost::asio::ip::tcp::resolver::iterator end; // End marker. - if (!error_code) { - while (iter != end) { - boost::asio::ip::tcp::endpoint ep = *iter; - if (ep.address().is_v4() && !ep.address().is_loopback() && - !ep.address().is_multicast()) { - primary_endpoint.address(ep.address()); - primary_endpoint.port(ep.port()); - break; - } - iter++; + while (iter != end) { + boost::asio::ip::tcp::endpoint ep = *iter; + if (ep.address().is_v4() && !ep.address().is_loopback() && + !ep.address().is_multicast()) { + primary_endpoint.address(ep.address()); + primary_endpoint.port(ep.port()); + break; } - } else { - RAY_LOG(WARNING) << "Failed to resolve ip address, error = " - << strerror(error_code.value()); - iter = end; + iter++; } std::string address; diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index 42db174b2..e750bfc68 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -177,7 +177,8 @@ int main(int argc, char *argv[]) { ray::gcs::GcsClientOptions client_options(redis_address, redis_port, redis_password); std::shared_ptr gcs_client; - if (RayConfig::instance().gcs_service_enabled()) { + // RAY_GCS_SERVICE_ENABLED only set in ci job, so we just check if it is null. + if (getenv("RAY_GCS_SERVICE_ENABLED") != nullptr) { gcs_client = std::make_shared(client_options); } else { gcs_client = std::make_shared(client_options);