[cli] New logging for the rest of the ray commands (#9984)

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
Maksim Smolin
2020-08-11 09:58:23 -07:00
committed by GitHub
parent 4f8fef134e
commit 40b8e35d61
8 changed files with 508 additions and 116 deletions
+69 -7
View File
@@ -9,6 +9,9 @@ as well as indentation and other structured output.
"""
import sys
import logging
import inspect
import os
import click
@@ -18,6 +21,60 @@ import colorful as cf
colorama.init()
def _patched_makeRecord(self,
name,
level,
fn,
lno,
msg,
args,
exc_info,
func=None,
extra=None,
sinfo=None):
"""Monkey-patched version of logging.Logger.makeRecord
We have to patch default loggers so they use the proper frame for
line numbers and function names (otherwise everything shows up as
e.g. cli_logger:info() instead of as where it was called from).
In Python 3.8 we could just use stacklevel=2, but we have to support
Python 3.6 and 3.7 as well.
The solution is this Python magic superhack.
The default makeRecord will deliberately check that we don't override
any existing property on the LogRecord using `extra`,
so we remove that check.
This patched version is otherwise identical to the one in the standard
library.
"""
rv = logging.LogRecord(name, level, fn, lno, msg, args, exc_info, func,
sinfo)
if extra is not None:
rv.__dict__.update(extra)
return rv
logging.Logger.makeRecord = _patched_makeRecord
def _parent_frame_info():
"""Get the info from the caller frame.
Used to override the logging function and line number with the correct
ones. See the comment on _patched_makeRecord for more info.
"""
frame = inspect.currentframe()
# we are also in a function, so must go 2 levels up
caller = frame.f_back.f_back
return {
"lineno": caller.f_lineno,
"filename": os.path.basename(caller.f_code.co_filename),
}
def _format_msg(msg,
*args,
_tags=None,
@@ -93,7 +150,7 @@ def _format_msg(msg,
if _no_format:
# todo: throw if given args/kwargs?
return numbering_str + msg + tags_str
return numbering_str + msg.format(*args, **kwargs) + tags_str
return numbering_str + cf.format(msg, *args, **kwargs) + tags_str
if kwargs:
raise ValueError("We do not support printing kwargs yet.")
@@ -180,7 +237,7 @@ class _CliLogger():
def newline(self):
"""Print a line feed.
"""
self._print("")
self.print("")
def _print(self, msg, linefeed=True):
"""Proxy for printing messages.
@@ -377,7 +434,8 @@ class _CliLogger():
For other arguments, see `_format_msg`.
"""
if self.old_style:
logger.debug(_format_msg(msg, *args, **kwargs))
logger.debug(
_format_msg(msg, *args, **kwargs), extra=_parent_frame_info())
return
def old_info(self, logger, msg, *args, **kwargs):
@@ -393,7 +451,8 @@ class _CliLogger():
For other arguments, see `_format_msg`.
"""
if self.old_style:
logger.info(_format_msg(msg, *args, **kwargs))
logger.info(
_format_msg(msg, *args, **kwargs), extra=_parent_frame_info())
return
def old_warning(self, logger, msg, *args, **kwargs):
@@ -409,7 +468,8 @@ class _CliLogger():
For other arguments, see `_format_msg`.
"""
if self.old_style:
logger.warning(_format_msg(msg, *args, **kwargs))
logger.warning(
_format_msg(msg, *args, **kwargs), extra=_parent_frame_info())
return
def old_error(self, logger, msg, *args, **kwargs):
@@ -425,7 +485,8 @@ class _CliLogger():
For other arguments, see `_format_msg`.
"""
if self.old_style:
logger.error(_format_msg(msg, *args, **kwargs))
logger.error(
_format_msg(msg, *args, **kwargs), extra=_parent_frame_info())
return
def old_exception(self, logger, msg, *args, **kwargs):
@@ -441,7 +502,8 @@ class _CliLogger():
For other arguments, see `_format_msg`.
"""
if self.old_style:
logger.exception(_format_msg(msg, *args, **kwargs))
logger.exception(
_format_msg(msg, *args, **kwargs), extra=_parent_frame_info())
return
def render_list(self, xs, separator=cf.reset(", ")):
+16 -3
View File
@@ -26,7 +26,20 @@ HASH_MAX_LENGTH = 10
KUBECTL_RSYNC = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "kubernetes/kubectl-rsync.sh")
_config = {"use_login_shells": True}
_config = {"use_login_shells": True, "silent_rsync": True}
def is_rsync_silent():
return _config["silent_rsync"]
def set_rsync_silent(val):
"""Choose whether to silence rsync output.
Most commands will want to list rsync'd files themselves rather than
print the default rsync spew.
"""
_config["silent_rsync"] = val
def is_using_login_shells():
@@ -460,7 +473,7 @@ class SSHCommandRunner(CommandRunnerInterface):
target)
]
cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
self._run_helper(command, silent=True)
self._run_helper(command, silent=is_rsync_silent())
def run_rsync_down(self, source, target):
self._set_ssh_ip_if_required()
@@ -473,7 +486,7 @@ class SSHCommandRunner(CommandRunnerInterface):
source), target
]
cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
self._run_helper(command, silent=True)
self._run_helper(command, silent=is_rsync_silent())
def remote_shell_command_str(self):
if self.ssh_private_key:
+23 -20
View File
@@ -29,7 +29,8 @@ from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
from ray.ray_constants import AUTOSCALER_RESOURCE_REQUEST_CHANNEL
from ray.autoscaler.updater import NodeUpdaterThread
from ray.autoscaler.command_runner import set_using_login_shells
from ray.autoscaler.command_runner import set_using_login_shells, \
set_rsync_silent
from ray.autoscaler.command_runner import DockerCommandRunner
from ray.autoscaler.log_timer import LogTimer
from ray.worker import global_worker
@@ -97,14 +98,9 @@ def create_or_update_cluster(
config_file: str, override_min_workers: Optional[int],
override_max_workers: Optional[int], no_restart: bool,
restart_only: bool, yes: bool, override_cluster_name: Optional[str],
no_config_cache: bool, log_old_style: bool, log_color: str,
dump_command_output: bool, use_login_shells: bool,
verbose: int) -> None:
no_config_cache: bool, dump_command_output: bool,
use_login_shells: bool) -> None:
"""Create or updates an autoscaling Ray cluster from a config json."""
cli_logger.old_style = log_old_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
set_using_login_shells(use_login_shells)
cmd_output_util.set_output_redirected(not dump_command_output)
@@ -184,6 +180,7 @@ def create_or_update_cluster(
# because it only supports aws
if config["provider"]["type"] != "aws":
cli_logger.old_style = True
cli_logger.newline()
config = _bootstrap_config(config, no_config_cache)
if config["provider"]["type"] != "aws":
cli_logger.old_style = False
@@ -217,7 +214,6 @@ def _bootstrap_config(config: Dict[str, Any],
try_reload_log_state(config_cache["config"]["provider"],
config_cache.get("provider_log_info"))
cli_logger.newline()
cli_logger.verbose_warning(
"Loaded cached provider configuration "
"from " + cf.bold("{}"), cache_key)
@@ -264,14 +260,8 @@ def _bootstrap_config(config: Dict[str, Any],
def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
override_cluster_name: Optional[str],
keep_min_workers: bool, log_old_style: bool,
log_color: str, verbose: int):
keep_min_workers: bool):
"""Destroys all nodes of a Ray cluster described by a config json."""
cli_logger.old_style = log_old_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
cli_logger.dump_command_output = verbose == 3 # todo: add a separate flag?
config = yaml.safe_load(open(config_file).read())
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
@@ -375,7 +365,8 @@ def kill_node(config_file, yes, hard, override_cluster_name):
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)
confirm("This will kill a node in your cluster", yes)
cli_logger.confirm(yes, "A random node will be killed.")
cli_logger.old_confirm("This will kill a node in your cluster", yes)
provider = get_node_provider(config["provider"], config["cluster_name"])
try:
@@ -383,7 +374,8 @@ def kill_node(config_file, yes, hard, override_cluster_name):
TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
})
node = random.choice(nodes)
logger.info("kill_node: Shutdown worker {}".format(node))
cli_logger.print("Shutdown " + cf.bold("{}"), node)
cli_logger.old_info(logger, "kill_node: Shutdown worker {}", node)
if hard:
provider.terminate_node(node)
else:
@@ -682,7 +674,7 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
cli_logger.newline()
with cli_logger.group("Useful commands"):
cli_logger.print("Monitor auto-scailng with")
cli_logger.print("Monitor autoscaling with")
cli_logger.print(
cf.bold(" ray exec {}{} {}"), raw_config_file, modifiers,
quote(monitor_str))
@@ -820,9 +812,12 @@ def exec_cluster(config_file: str,
attach_command_parts.append("--screen")
attach_command = " ".join(attach_command_parts)
cli_logger.print("Run `{}` to check command status.",
cf.bold(attach_command))
attach_info = "Use `{}` to check on command status.".format(
attach_command)
logger.info(attach_info)
cli_logger.old_info(logger, attach_info)
return result
finally:
provider.cleanup()
@@ -873,6 +868,10 @@ def rsync(config_file: str,
down: whether we're syncing remote -> local
all_nodes: whether to sync worker nodes in addition to the head node
"""
if bool(source) != bool(target):
cli_logger.abort(
"Expected either both a source and a target, or neither.")
assert bool(source) == bool(target), (
"Must either provide both or neither source and target.")
@@ -918,6 +917,10 @@ def rsync(config_file: str,
rsync = updater.rsync_up
if source and target:
# print rsync progress for single file rsync
cmd_output_util.set_output_redirected(False)
set_rsync_silent(False)
rsync(source, target)
else:
updater.sync_file_mounts(rsync)
@@ -14,7 +14,7 @@ _config = {"redirect_output": True}
def is_output_redirected():
return _config["_redirect_output"]
return _config["redirect_output"]
def set_output_redirected(val):
@@ -26,7 +26,7 @@ def set_output_redirected(val):
val (bool): If true, subprocess output will be redirected to
a temporary file.
"""
_config["_redirect_output"] = val
_config["redirect_output"] = val
class ProcessRunnerError(Exception):
+15 -6
View File
@@ -119,7 +119,10 @@ class NodeUpdater:
self.exitcode = 0
def sync_file_mounts(self, sync_cmd):
def sync_file_mounts(self, sync_cmd, step_numbers=(0, 2)):
# step_numbers is (# of previous steps, total steps)
previous_steps, total_steps = step_numbers
nolog_paths = []
if cli_logger.verbosity == 0:
nolog_paths = [
@@ -154,18 +157,21 @@ class NodeUpdater:
# Rsync file mounts
with cli_logger.group(
"Processing file mounts", _numbered=("[]", 2, 6)):
"Processing file mounts",
_numbered=("[]", previous_steps + 1, total_steps)):
for remote_path, local_path in self.file_mounts.items():
do_sync(remote_path, local_path)
if self.cluster_synced_files:
with cli_logger.group(
"Processing worker file mounts", _numbered=("[]", 3, 6)):
"Processing worker file mounts",
_numbered=("[]", previous_steps + 2, total_steps)):
for path in self.cluster_synced_files:
do_sync(path, path, allow_non_existing_paths=True)
else:
cli_logger.print(
"No worker file mounts to sync", _numbered=("[]", 3, 6))
"No worker file mounts to sync",
_numbered=("[]", previous_steps + 2, total_steps))
def wait_ready(self, deadline):
with cli_logger.group(
@@ -239,7 +245,8 @@ class NodeUpdater:
# full setup might be cancelled here
cli_logger.print(
"Configuration already up to date, "
"skipping file mounts, initalization and setup commands.")
"skipping file mounts, initalization and setup commands.",
_numbered=("[]", "2-5", 6))
cli_logger.old_info(logger,
"{}{} already up-to-date, skip to ray start",
self.log_prefix, self.node_id)
@@ -252,7 +259,7 @@ class NodeUpdater:
self.provider.set_node_tags(
self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
self.sync_file_mounts(self.rsync_up)
self.sync_file_mounts(self.rsync_up, step_numbers=(2, 6))
# Only run setup commands if runtime_hash has changed because
# we don't want to run setup_commands every time the head node
@@ -331,7 +338,9 @@ class NodeUpdater:
self.log_prefix + "Ray start commands", show_status=True):
for cmd in self.ray_start_commands:
try:
cmd_output_util.set_output_redirected(False)
self.cmd_runner.run(cmd)
cmd_output_util.set_output_redirected(True)
except ProcessRunnerError as e:
if e.msg_type == "ssh_command_failed":
cli_logger.error("Failed.")
+25 -6
View File
@@ -10,6 +10,9 @@ import sys
import ray
import ray.ray_constants as ray_constants
from ray.autoscaler.cli_logger import cli_logger
import colorful as cf
logger = logging.getLogger(__name__)
# Prefix for the node id resource that is automatically added to each node.
@@ -220,15 +223,31 @@ class ResourceSpec(
round(memory / 1e9, 2),
int(100 * (memory / system_memory))))
logger.info(
rounded_memory = ray_constants.round_to_memory_units(
memory, round_up=False)
worker_ram = round(rounded_memory / (1024**3), 2)
object_ram = round(object_store_memory / (1024**3), 2)
# TODO(maximsmol): this behavior is strange since we do not have a
# good grasp on when this will get called
# (you have to study node.py to make a guess)
with cli_logger.group("Available RAM"):
cli_logger.labeled_value("Workers", "{} GiB", str(worker_ram))
cli_logger.labeled_value("Objects", "{} GiB", str(object_ram))
cli_logger.newline()
cli_logger.print("To adjust these values, use")
with cf.with_style("monokai") as c:
cli_logger.print(
" ray{0}init(memory{1}{2}, "
"object_store_memory{1}{2})", c.magenta("."),
c.magenta("="), c.purple("<bytes>"))
cli_logger.old_info(
logger,
"Starting Ray with {} GiB memory available for workers and up to "
"{} GiB for objects. You can adjust these settings "
"with ray.init(memory=<bytes>, "
"object_store_memory=<bytes>).".format(
round(
ray_constants.round_to_memory_units(
memory, round_up=False) / (1024**3), 2),
round(object_store_memory / (1024**3), 2)))
"object_store_memory=<bytes>).", worker_ram, object_ram)
spec = ResourceSpec(num_cpus, num_gpus, memory, object_store_memory,
resources, redis_max_memory)
+342 -69
View File
@@ -21,6 +21,10 @@ import ray.ray_constants as ray_constants
import ray.utils
from ray.projects.scripts import project_cli, session_cli
from ray.autoscaler.subprocess_output_util import set_output_redirected
from ray.autoscaler.cli_logger import cli_logger
import colorful as cf
logger = logging.getLogger(__name__)
@@ -48,6 +52,38 @@ def check_no_existing_redis_clients(node_ip_address, redis_client):
"clients with this IP address.")
logging_options = [
click.option(
"--log-new-style/--log-old-style",
is_flag=True,
default=False,
envvar="RAY_LOG_NEWSTYLE",
help=("Whether to use the old or the new CLI UX. "
"The new UX supports colored, formatted output and was "
"designed to display only the most important information for "
"human users. The old UX uses the standard `logging` module. "
"It is most suitable for writing to a file and will include "
"timestamps and message level (ERROR/WARNING/INFO).")),
click.option(
"--log-color",
required=False,
type=click.Choice(["auto", "false", "true"], case_sensitive=False),
default="auto",
help=("Use color logging. "
"Valid values are: auto (if stdout is a tty), true, false.")),
click.option("-v", "--verbose", count=True)
]
def add_click_options(options):
def wrapper(f):
for option in reversed(logging_options):
f = option(f)
return f
return wrapper
@click.group()
@click.option(
"--logging-level",
@@ -353,6 +389,7 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
default=None,
help="the port to use to expose Ray metrics through a "
"Prometheus endpoint.")
@add_click_options(logging_options)
def start(node_ip_address, redis_address, address, redis_port, port,
num_redis_shards, redis_max_clients, redis_password,
redis_shard_ports, object_manager_port, node_manager_port,
@@ -363,32 +400,60 @@ def start(node_ip_address, redis_address, address, redis_port, port,
autoscaling_config, no_redirect_worker_output, no_redirect_output,
plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
java_worker_options, load_code_from_local, internal_config,
lru_evict, enable_object_reconstruction, metrics_export_port):
lru_evict, enable_object_reconstruction, metrics_export_port,
log_new_style, log_color, verbose):
"""Start Ray processes manually on the local machine."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
if gcs_server_port and not head:
raise ValueError(
"gcs_server_port can be only assigned when you specify --head.")
if redis_address is not None:
cli_logger.abort("{} is deprecated. Use {} instead.",
cf.bold("--redis-address"), cf.bold("--address"))
raise DeprecationWarning("The --redis-address argument is "
"deprecated. Please use --address instead.")
if redis_port is not None:
logger.warn("The --redis-port argument will be deprecated soon. "
"Please use --port instead.")
cli_logger.warning("{} is being deprecated. Use {} instead.",
cf.bold("--redis-port"), cf.bold("--port"))
cli_logger.old_warning(
logger, "The --redis-port argument will be deprecated soon. "
"Please use --port instead.")
if port is not None and port != redis_port:
cli_logger.abort(
"Incompatible values for {} and {}. Use only {} instead.",
cf.bold("--port"), cf.bold("--redis-port"), cf.bold("--port"))
raise ValueError("Cannot specify both --port and --redis-port "
"as port is a rename of deprecated redis-port")
if include_webui is not None:
logger.warn("The --include-webui argument will be deprecated soon"
"Please use --include-dashboard instead.")
cli_logger.warning("{} is being deprecated. Use {} instead.",
cf.bold("--include-webui"),
cf.bold("--include-dashboard"))
cli_logger.old_warning(
logger, "The --include-webui argument will be deprecated soon"
"Please use --include-dashboard instead.")
if include_dashboard is not None:
include_dashboard = include_webui
dashboard_host_default = "localhost"
if webui_host != dashboard_host_default:
logger.warn("The --webui-host argument will be deprecated"
" soon. Please use --dashboard-host instead.")
cli_logger.warning("{} is being deprecated. Use {} instead.",
cf.bold("--webui-host"),
cf.bold("--dashboard-host"))
cli_logger.old_warning(
logger, "The --webui-host argument will be deprecated"
" soon. Please use --dashboard-host instead.")
if webui_host != dashboard_host and dashboard_host != "localhost":
cli_logger.abort(
"Incompatible values for {} and {}. Use only {} instead.",
cf.bold("--dashboard-host"), cf.bold("--webui-host"),
cf.bold("--dashboard-host"))
raise ValueError(
"Cannot specify both --webui-host and --dashboard-host,"
" please specify only the latter")
@@ -407,6 +472,13 @@ def start(node_ip_address, redis_address, address, redis_port, port,
try:
resources = json.loads(resources)
except Exception:
cli_logger.error("`{}` is not a valid JSON string.",
cf.bold("--resources"))
cli_logger.abort(
"Valid values look like this: `{}`",
cf.bold("--resources='\"CustomResource3\": 1, "
"\"CustomResource2\": 2}'"))
raise Exception("Unable to parse the --resources argument using "
"json.loads. Try using a format like\n\n"
" --resources='{\"CustomResource1\": 3, "
@@ -454,6 +526,16 @@ def start(node_ip_address, redis_address, address, redis_port, port,
num_redis_shards = len(redis_shard_ports)
# Check that the arguments match.
if len(redis_shard_ports) != num_redis_shards:
cli_logger.error(
"`{}` must be a comma-separated list of ports, "
"with length equal to `{}` (which defaults to {})",
cf.bold("--redis-shard-ports"),
cf.bold("--num-redis-shards"), cf.bold("1"))
cli_logger.abort(
"Example: `{}`",
cf.bold("--num-redis-shards 3 "
"--redis_shard_ports 6380,6381,6382"))
raise Exception("If --redis-shard-ports is provided, it must "
"have the form '6380,6381,6382', and the "
"number of ports provided must equal "
@@ -461,6 +543,10 @@ def start(node_ip_address, redis_address, address, redis_port, port,
"provided)")
if redis_address is not None:
cli_logger.abort(
"`{}` starts a new Redis server, `{}` should not be set.",
cf.bold("--head"), cf.bold("--address"))
raise Exception("If --head is passed in, a Redis server will be "
"started, so a Redis address should not be "
"provided.")
@@ -468,8 +554,9 @@ def start(node_ip_address, redis_address, address, redis_port, port,
# Get the node IP address if one is not provided.
ray_params.update_if_absent(
node_ip_address=services.get_node_ip_address())
logger.info("Using IP address {} for this node.".format(
ray_params.node_ip_address))
cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
cli_logger.old_info(logger, "Using IP address {} for this node.",
ray_params.node_ip_address)
ray_params.update_if_absent(
redis_port=port or redis_port,
redis_shard_ports=redis_shard_ports,
@@ -484,7 +571,45 @@ def start(node_ip_address, redis_address, address, redis_port, port,
ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
redis_address = node.redis_address
logger.info(
# new-style CLI UX (--log-new-style)
# this is a noop if that flag is not set, so the old logger calls
# are still in place
cli_logger.newline()
startup_msg = "Ray runtime started."
cli_logger.success("-" * len(startup_msg))
cli_logger.success(startup_msg)
cli_logger.success("-" * len(startup_msg))
cli_logger.newline()
with cli_logger.group("Next steps"):
cli_logger.print(
"To connect to this Ray runtime from another node, run")
cli_logger.print(
cf.bold(" ray start --address='{}'{}"), redis_address,
" --redis-password='{}'".format(redis_password)
if redis_password else "")
cli_logger.newline()
cli_logger.print("Alternatively, use the following Python code:")
with cli_logger.indented():
with cf.with_style("monokai") as c:
cli_logger.print("{} ray", c.magenta("import"))
cli_logger.print(
"ray{}init(address{}{}{})", c.magenta("."),
c.magenta("="), c.yellow("'auto'"),
", redis_password{}{}".format(
c.magenta("="),
c.yellow("'" + redis_password + "'"))
if redis_password else "")
cli_logger.newline()
cli_logger.print(
cf.underlined("If connection fails, check your "
"firewall settings other "
"network configuration."))
cli_logger.newline()
cli_logger.print("To terminate the Ray runtime, run")
cli_logger.print(cf.bold(" ray stop"))
cli_logger.old_info(
logger,
"\nStarted Ray on this node. You can add additional nodes to "
"the cluster by calling\n\n"
" ray start --address='{}'{}\n\n"
@@ -503,29 +628,54 @@ def start(node_ip_address, redis_address, address, redis_port, port,
else:
# Start Ray on a non-head node.
if not (redis_port is None and port is None):
cli_logger.abort("`{}/{}` should not be specified without `{}`.",
cf.bold("--port"), cf.bold("--redis-port"),
cf.bold("--head"))
raise Exception(
"If --head is not passed in, --port and --redis-port are not "
"allowed.")
if redis_shard_ports is not None:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--redis-shard-ports"), cf.bold("--head"))
raise Exception("If --head is not passed in, --redis-shard-ports "
"is not allowed.")
if redis_address is None:
cli_logger.abort("`{}` is required unless starting with `{}`.",
cf.bold("--address"), cf.bold("--head"))
raise Exception("If --head is not passed in, --address must "
"be provided.")
if num_redis_shards is not None:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--num-redis-shards"), cf.bold("--head"))
raise Exception("If --head is not passed in, --num-redis-shards "
"must not be provided.")
if redis_max_clients is not None:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--redis-max-clients"), cf.bold("--head"))
raise Exception("If --head is not passed in, --redis-max-clients "
"must not be provided.")
if include_webui:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--include-web-ui"), cf.bold("--head"))
raise Exception("If --head is not passed in, the --include-webui"
"flag is not relevant.")
if include_dashboard:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--include-dashboard"), cf.bold("--head"))
raise ValueError(
"If --head is not passed in, the --include-dashboard"
"flag is not relevant.")
if include_java is not None:
cli_logger.abort("`{}` should not be specified without `{}`.",
cf.bold("--include-java"), cf.bold("--head"))
raise ValueError("--include-java should only be set for the head "
"node.")
@@ -545,8 +695,11 @@ def start(node_ip_address, redis_address, address, redis_port, port,
# Get the node IP address if one is not provided.
ray_params.update_if_absent(
node_ip_address=services.get_node_ip_address(redis_address))
logger.info("Using IP address {} for this node.".format(
ray_params.node_ip_address))
cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
cli_logger.old_info(logger, "Using IP address {} for this node.",
ray_params.node_ip_address)
# Check that there aren't already Redis clients with the same IP
# address connected with this Redis instance. This raises an exception
# if the Redis server already has clients on this node.
@@ -555,21 +708,54 @@ def start(node_ip_address, redis_address, address, redis_port, port,
ray_params.update(redis_address=redis_address)
node = ray.node.Node(
ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)
logger.info("\nStarted Ray on this node. If you wish to terminate the "
"processes that have been started, run\n\n"
" ray stop")
cli_logger.newline()
startup_msg = "Ray runtime started."
cli_logger.success("-" * len(startup_msg))
cli_logger.success(startup_msg)
cli_logger.success("-" * len(startup_msg))
cli_logger.newline()
cli_logger.print("To terminate the Ray runtime, run")
cli_logger.print(cf.bold(" ray stop"))
cli_logger.old_info(
logger, "\nStarted Ray on this node. If you wish to terminate the "
"processes that have been started, run\n\n"
" ray stop")
if block:
cli_logger.newline()
with cli_logger.group(cf.bold("--block")):
cli_logger.print(
"This command will now block until terminated by a signal.")
cli_logger.print(
"Runing subprocesses are monitored and a message will be "
"printed if any of them terminate unexpectedly.")
while True:
time.sleep(1)
deceased = node.dead_processes()
if len(deceased) > 0:
logger.error("Ray processes died unexpectedly:")
for process_type, process in deceased:
logger.error("\t{} died with exit code {}".format(
process_type, process.returncode))
cli_logger.newline()
cli_logger.error("Some Ray subprcesses exited unexpectedly:")
cli_logger.old_error(logger,
"Ray processes died unexpectedly:")
with cli_logger.indented():
for process_type, process in deceased:
cli_logger.error(
"{}",
cf.bold(str(process_type)),
_tags={"exit code": str(process.returncode)})
cli_logger.old_error(
logger, "\t{} died with exit code {}".format(
process_type, process.returncode))
# shutdown_at_exit will handle cleanup.
logger.error("Killing remaining processes and exiting...")
cli_logger.newline()
cli_logger.error("Remaining processes will be killed.")
cli_logger.old_error(
logger, "Killing remaining processes and exiting...")
sys.exit(1)
@@ -579,13 +765,14 @@ def start(node_ip_address, redis_address, address, redis_port, port,
"--force",
is_flag=True,
help="If set, ray will send SIGKILL instead of SIGTERM.")
@click.option(
"-v",
"--verbose",
is_flag=True,
help="If set, ray prints out more information about processes to kill.")
def stop(force, verbose):
@add_click_options(logging_options)
def stop(force, verbose, log_new_style, log_color):
"""Stop Ray processes manually on the local machine."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
# Note that raylet needs to exit before object store, otherwise
# it cannot exit gracefully.
is_linux = sys.platform.startswith("linux")
@@ -619,12 +806,17 @@ def stop(force, verbose):
process_infos.append((proc, proc.name(), proc.cmdline()))
except psutil.Error:
pass
total_found = 0
total_stopped = 0
for keyword, filter_by_cmd in processes_to_kill:
if filter_by_cmd and is_linux and len(keyword) > 15:
# getting here is an internal bug, so we do not use cli_logger
msg = ("The filter string should not be more than {} "
"characters. Actual length: {}. Filter: {}").format(
15, len(keyword), keyword)
raise ValueError(msg)
found = []
for candidate in process_infos:
proc, proc_cmd, proc_args = candidate
@@ -632,11 +824,15 @@ def stop(force, verbose):
if filter_by_cmd else subprocess.list2cmdline(proc_args))
if keyword in corpus:
found.append(candidate)
for proc, proc_cmd, proc_args in found:
total_found += 1
proc_string = str(subprocess.list2cmdline(proc_args))
if verbose:
operation = "Terminating" if force else "Killing"
logger.info("%s process %s: %s", operation, proc.pid,
subprocess.list2cmdline(proc_args))
cli_logger.old_info(logger, "%s process %s: %s", operation,
proc.pid, proc_string)
try:
if force:
proc.kill()
@@ -645,10 +841,41 @@ def stop(force, verbose):
# We don't want CTRL_BREAK_EVENT, because that would
# terminate the entire process group. What to do?
proc.terminate()
if force:
cli_logger.verbose("Killed `{}` {} ", cf.bold(proc_string),
cf.gray("(via SIGKILL)"))
else:
cli_logger.verbose("Send termination request to `{}` {}",
cf.bold(proc_string),
cf.gray("(via SIGTERM)"))
total_stopped += 1
except psutil.NoSuchProcess:
cli_logger.verbose(
"Attempted to stop `{}`, but process was already dead.",
cf.bold(proc_string))
pass
except (psutil.Error, OSError) as ex:
logger.error("Error: %s", ex)
cli_logger.error("Could not terminate `{}` due to {}",
cf.bold(proc_string), str(ex))
cli_logger.old_error(logger, "Error: %s", ex)
if total_found == 0:
cli_logger.print("Did not find any active Ray processes.")
else:
if total_stopped == total_found:
cli_logger.success("Stopped all {} Ray processes.", total_stopped)
else:
cli_logger.warning(
"Stopped only {} out of {} Ray processes. "
"Set `{}` to see more details.", total_stopped, total_found,
cf.bold("-v"))
cli_logger.warning("Try running the command again, or use `{}`.",
cf.bold("--force"))
# TODO(maximsmol): we should probably block until the processes actually
# all died somehow
@cli.command()
@@ -692,18 +919,6 @@ def stop(force, verbose):
is_flag=True,
default=False,
help="Disable the local cluster config cache.")
@click.option(
"--log-old-style/--log-new-style",
is_flag=True,
default=True,
help=("Use old logging."))
@click.option(
"--log-color",
required=False,
type=str,
default="auto",
help=("Use color logging. "
"Valid values are: auto (if stdout is a tty), true, false."))
@click.option(
"--dump-command-output",
is_flag=True,
@@ -717,14 +932,22 @@ def stop(force, verbose):
help=("Ray uses login shells (bash --login -i) to run cluster commands "
"by default. If your workflow is compatible with normal shells, "
"this can be disabled for a better user experience."))
@click.option("-v", "--verbose", count=True)
@add_click_options(logging_options)
def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
yes, cluster_name, no_config_cache, log_old_style, log_color,
dump_command_output, use_login_shells, verbose):
yes, cluster_name, no_config_cache, dump_command_output,
use_login_shells, log_new_style, log_color, verbose):
"""Create or update a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
if restart_only or no_restart:
cli_logger.doassert(restart_only != no_restart,
"`{}` is incompatible with `{}`.",
cf.bold("--restart-only"), cf.bold("--no-restart"))
assert restart_only != no_restart, "Cannot set both 'restart_only' " \
"and 'no_restart' at the same time!"
if urllib.parse.urlparse(cluster_config_file).scheme in ("http", "https"):
try:
response = urllib.request.urlopen(cluster_config_file, timeout=5)
@@ -734,11 +957,14 @@ def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
f.write(content)
cluster_config_file = file_name
except urllib.error.HTTPError as e:
logger.info("Error downloading file: ", e)
cli_logger.warning("{}", str(e))
cli_logger.warning(
"Could not download remote cluster configuration file.")
cli_logger.old_info(logger, "Error downloading file: ", e)
create_or_update_cluster(cluster_config_file, min_workers, max_workers,
no_restart, restart_only, yes, cluster_name,
no_config_cache, log_old_style, log_color,
dump_command_output, use_login_shells, verbose)
no_config_cache, dump_command_output,
use_login_shells)
@cli.command()
@@ -765,24 +991,16 @@ def up(cluster_config_file, min_workers, max_workers, no_restart, restart_only,
is_flag=True,
default=False,
help="Retain the minimal amount of workers specified in the config.")
@click.option(
"--log-old-style/--log-new-style",
is_flag=True,
default=True,
help=("Use old logging."))
@click.option(
"--log-color",
required=False,
type=str,
default="auto",
help=("Use color logging. "
"Valid values are: auto (if stdout is a tty), true, false."))
@click.option("-v", "--verbose", count=True)
@add_click_options(logging_options)
def down(cluster_config_file, yes, workers_only, cluster_name,
keep_min_workers, log_old_style, log_color, verbose):
keep_min_workers, log_new_style, log_color, verbose):
"""Tear down a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
teardown_cluster(cluster_config_file, yes, workers_only, cluster_name,
keep_min_workers, log_old_style, log_color, verbose)
keep_min_workers)
@cli.command()
@@ -825,8 +1043,14 @@ def kill_random_node(cluster_config_file, yes, hard, cluster_name):
required=False,
type=str,
help="Override the configured cluster name.")
def monitor(cluster_config_file, lines, cluster_name):
@add_click_options(logging_options)
def monitor(cluster_config_file, lines, cluster_name, log_new_style, log_color,
verbose):
"""Tails the autoscaler logs of a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
monitor_cluster(cluster_config_file, lines, cluster_name)
@@ -856,9 +1080,14 @@ def monitor(cluster_config_file, lines, cluster_name):
multiple=True,
type=int,
help="Port to forward. Use this multiple times to forward multiple ports.")
@add_click_options(logging_options)
def attach(cluster_config_file, start, screen, tmux, cluster_name, new,
port_forward):
port_forward, log_new_style, log_color, verbose):
"""Create or attach to a SSH session to a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
port_forward = [(port, port) for port in list(port_forward)]
attach_cluster(cluster_config_file, start, screen, tmux, cluster_name, new,
port_forward)
@@ -874,8 +1103,14 @@ def attach(cluster_config_file, start, screen, tmux, cluster_name, new,
required=False,
type=str,
help="Override the configured cluster name.")
def rsync_down(cluster_config_file, source, target, cluster_name):
@add_click_options(logging_options)
def rsync_down(cluster_config_file, source, target, cluster_name,
log_new_style, log_color, verbose):
"""Download specific files from a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
rsync(cluster_config_file, source, target, cluster_name, down=True)
@@ -895,8 +1130,14 @@ def rsync_down(cluster_config_file, source, target, cluster_name):
is_flag=True,
required=False,
help="Upload to all nodes (workers and head).")
def rsync_up(cluster_config_file, source, target, cluster_name, all_nodes):
@add_click_options(logging_options)
def rsync_up(cluster_config_file, source, target, cluster_name, all_nodes,
log_new_style, log_color, verbose):
"""Upload specific files to a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
rsync(
cluster_config_file,
source,
@@ -945,8 +1186,10 @@ def rsync_up(cluster_config_file, source, target, cluster_name, all_nodes):
type=str,
help="(deprecated) Use '-- --arg1 --arg2' for script args.")
@click.argument("script_args", nargs=-1)
@add_click_options(logging_options)
def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
port_forward, script, args, script_args):
port_forward, script, args, script_args, log_new_style, log_color,
verbose):
"""Uploads and runs a script on the specified cluster.
The script is automatically synced to the following location:
@@ -956,11 +1199,34 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
Example:
>>> ray submit [CLUSTER.YAML] experiment.py -- --smoke-test
"""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
set_output_redirected(False)
cli_logger.doassert(not (screen and tmux),
"`{}` and `{}` are incompatible.", cf.bold("--screen"),
cf.bold("--tmux"))
cli_logger.doassert(
not (script_args and args),
"`{0}` and `{1}` are incompatible. Use only `{1}`.\n"
"Example: `{2}`", cf.bold("--args"), cf.bold("-- <args ...>"),
cf.bold("ray submit script.py -- --arg=123 --flag"))
assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
assert not (script_args and args), "Use -- --arg1 --arg2 for script args."
if args:
logger.warning(
cli_logger.warning(
"`{}` is deprecated and will be removed in the future.",
cf.bold("--args"))
cli_logger.warning("Use `{}` instead. Example: `{}`.",
cf.bold("-- <args ...>"),
cf.bold("ray submit script.py -- --arg=123 --flag"))
cli_logger.newline()
cli_logger.old_warning(
logger,
"ray submit [yaml] [script.py] --args=... is deprecated and "
"will be removed in a future version of Ray. Use "
"`ray submit [yaml] script.py -- --arg1 --arg2` instead.")
@@ -1032,9 +1298,16 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
multiple=True,
type=int,
help="Port to forward. Use this multiple times to forward multiple ports.")
@add_click_options(logging_options)
def exec(cluster_config_file, cmd, run_env, screen, tmux, stop, start,
cluster_name, port_forward):
cluster_name, port_forward, log_new_style, log_color, verbose):
"""Execute a command via SSH on a Ray cluster."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
set_output_redirected(False)
port_forward = [(port, port) for port in list(port_forward)]
exec_cluster(
+16 -3
View File
@@ -19,6 +19,9 @@ import ray
import ray.ray_constants as ray_constants
import psutil
from ray.autoscaler.cli_logger import cli_logger
import colorful as cf
resource = None
if sys.platform != "win32":
import resource
@@ -579,6 +582,12 @@ def wait_for_redis_to_start(redis_ip_address, redis_port, password=None):
else:
break
else:
cli_logger.error(
"Unable to connect to Redis at "
"`{c.underlined}{}:{}{c.no_underlined}` after {} retries.",
redis_ip_address, redis_port, num_retries)
cli_logger.abort("Check your firewall and network settings.")
raise RuntimeError("Unable to connect to Redis. If the Redis instance "
"is on a different machine, check that your "
"firewall is configured properly.")
@@ -1191,9 +1200,13 @@ def start_dashboard(require_dashboard,
dashboard_url = "{}:{}".format(
host if host != "0.0.0.0" else get_node_ip_address(), port)
logger.info("View the Ray dashboard at {}{}{}{}{}".format(
colorama.Style.BRIGHT, colorama.Fore.GREEN, dashboard_url,
colorama.Fore.RESET, colorama.Style.NORMAL))
cli_logger.labeled_value("Dashboard URL", cf.underlined("http://{}"),
dashboard_url)
cli_logger.old_info(logger, "View the Ray dashboard at {}{}{}{}{}",
colorama.Style.BRIGHT, colorama.Fore.GREEN,
dashboard_url, colorama.Fore.RESET,
colorama.Style.NORMAL)
return dashboard_url, process_info
else: