mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 07:44:45 +08:00
[autoscaler] RecoverUnhealthyWorker mitigation (#3699)
Increases the number of retries for RecoverUnhealthyWorkers. Closes #3435.
This commit is contained in:
@@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
|
||||
@@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
|
||||
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
# check that brew is installed
|
||||
which -s brew
|
||||
@@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
# check that brew is installed
|
||||
which -s brew
|
||||
@@ -68,7 +68,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
|
||||
elif [[ "$LINT" == "1" ]]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake build-essential autoconf curl libtool unzip
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from flaky import flaky
|
||||
import shutil
|
||||
import tempfile
|
||||
import threading
|
||||
@@ -182,8 +183,8 @@ class AutoscalingTest(unittest.TestCase):
|
||||
shutil.rmtree(self.tmpdir)
|
||||
ray.shutdown()
|
||||
|
||||
def waitFor(self, condition):
|
||||
for _ in range(50):
|
||||
def waitFor(self, condition, num_retries=50):
|
||||
for _ in range(num_retries):
|
||||
if condition():
|
||||
return
|
||||
time.sleep(.1)
|
||||
@@ -674,6 +675,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
autoscaler.update()
|
||||
assert len(self.provider.nodes({})) == 0
|
||||
|
||||
@flaky(max_runs=4)
|
||||
def testRecoverUnhealthyWorkers(self):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
@@ -698,7 +700,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
lm.last_heartbeat_time_by_ip["172.0.0.0"] = 0
|
||||
num_calls = len(runner.calls)
|
||||
autoscaler.update()
|
||||
self.waitFor(lambda: len(runner.calls) > num_calls)
|
||||
self.waitFor(lambda: len(runner.calls) > num_calls, num_retries=150)
|
||||
|
||||
def testExternalNodeScaler(self):
|
||||
config = SMALL_CLUSTER.copy()
|
||||
|
||||
Reference in New Issue
Block a user