From 4a6c53da46c82d56bfda8bea31cf447928285560 Mon Sep 17 00:00:00 2001 From: fangfengbin <869218239a@zju.edu.cn> Date: Thu, 14 Jan 2021 21:50:32 +0800 Subject: [PATCH] [Core]Fix raylet scheduling bug (#13452) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Core]Fix raylet scheduling bug * fix lint error * fix lint error Co-authored-by: 灵洵 --- python/ray/tests/test_placement_group.py | 24 ++++++++++++++++++- .../scheduling/cluster_resource_scheduler.cc | 4 ++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 066ff0e75..7c5963f9e 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -14,7 +14,8 @@ from ray.test_utils import (generate_system_config_map, get_other_nodes, get_error_message) import ray.cluster_utils from ray._raylet import PlacementGroupID -from ray.util.placement_group import (PlacementGroup, +from ray.util.placement_group import (PlacementGroup, placement_group, + remove_placement_group, get_current_placement_group) @@ -1288,5 +1289,26 @@ def test_placement_group_wait_api(ray_start_cluster_head): placement_group1.wait(10) +def test_schedule_placement_groups_at_the_same_time(): + ray.init(num_cpus=4) + + pgs = [placement_group([{"CPU": 2}]) for _ in range(6)] + + wait_pgs = {pg.ready(): pg for pg in pgs} + + def is_all_placement_group_removed(): + ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5) + if ready: + ready_pg = wait_pgs[ready[0]] + remove_placement_group(ready_pg) + del wait_pgs[ready[0]] + + if len(wait_pgs) == 0: + return True + return False + + wait_for_condition(is_all_placement_group_removed) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc index 5e60c453a..90cf552cf 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc @@ -174,6 +174,10 @@ int64_t ClusterResourceScheduler::GetBestSchedulableNode(const TaskRequest &task bool actor_creation, int64_t *total_violations, bool *is_infeasible) { + // NOTE: We need to set `is_infeasible` to false in advance to avoid `is_infeasible` not + // being set. + *is_infeasible = false; + // Minimum number of soft violations across all nodes that can schedule the request. // We will pick the node with the smallest number of soft violations. int64_t min_violations = INT_MAX;