[api] Initial API deprecations for Ray 1.0 (#10325)

This commit is contained in:
Eric Liang
2020-08-28 15:03:50 -07:00
committed by GitHub
parent 9c25ca6f5e
commit 519354a39a
75 changed files with 223 additions and 2414 deletions
+5 -8
View File
@@ -80,12 +80,11 @@ from ray.state import (jobs, nodes, actors, objects, timeline,
available_resources) # noqa: E402
from ray.worker import ( # noqa: F401
LOCAL_MODE, SCRIPT_MODE, WORKER_MODE, IO_WORKER_MODE, cancel, connect,
disconnect, get, get_actor, get_gpu_ids, get_resource_ids, get_webui_url,
init, is_initialized, put, kill, register_custom_serializer, remote,
shutdown, show_in_webui, wait,
disconnect, get, get_actor, get_gpu_ids, get_resource_ids,
get_dashboard_url, init, is_initialized, put, kill, remote, shutdown,
show_in_dashboard, wait,
) # noqa: E402
import ray.internal # noqa: E402
import ray.projects # noqa: E402
# We import ray.actor because some code is run in actor.py which initializes
# some functions in the worker.
import ray.actor # noqa: F401
@@ -113,7 +112,7 @@ __all__ = [
"get_actor",
"get_gpu_ids",
"get_resource_ids",
"get_webui_url",
"get_dashboard_url",
"init",
"internal",
"is_initialized",
@@ -127,12 +126,10 @@ __all__ = [
"objects",
"object_transfer_timeline",
"profile",
"projects",
"put",
"register_custom_serializer",
"remote",
"shutdown",
"show_in_webui",
"show_in_dashboard",
"timeline",
"util",
"wait",
+1 -1
View File
@@ -360,7 +360,7 @@ cdef execute_task(
CFiberEvent task_done_event
# Automatically restrict the GPUs available to this task.
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids(as_str=True))
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
function_descriptor = CFunctionDescriptorToPython(
ray_function.GetFunctionDescriptor())
-4
View File
@@ -406,7 +406,6 @@ class ActorClass:
memory=None,
object_store_memory=None,
resources=None,
is_direct_call=None,
max_concurrency=None,
max_restarts=None,
max_task_retries=None,
@@ -430,7 +429,6 @@ class ActorClass:
this actor when creating objects.
resources: The custom resources required by the actor creation
task.
is_direct_call: Use direct actor calls.
max_concurrency: The max number of concurrent calls to allow for
this actor. This only works with direct actor calls. The max
concurrency defaults to 1 for threaded execution, and 1000 for
@@ -456,8 +454,6 @@ class ActorClass:
args = []
if kwargs is None:
kwargs = {}
if is_direct_call is not None and not is_direct_call:
raise ValueError("Non-direct call actors are no longer supported.")
meta = self.__ray_metadata__
actor_has_async_methods = len(
inspect.getmembers(
+1 -1
View File
@@ -53,7 +53,7 @@ class Cluster:
output_info = ray.init(
ignore_reinit_error=True,
address=self.redis_address,
redis_password=self.redis_password)
_redis_password=self.redis_password)
logger.info(output_info)
self.connected = True
@@ -202,7 +202,7 @@ class Actor extends React.Component<Props & WithStyles<typeof styles>, State> {
.sort()
.map((key, _, __) => {
// Construct the value from actor.
// Please refer to worker.py::show_in_webui for schema.
// Please refer to worker.py::show_in_dashboard for schema.
const valueEncoded = actor.webuiDisplay![key];
const valueParsed = JSON.parse(valueEncoded);
let valueRendered = valueParsed["message"];
+2 -4
View File
@@ -1,10 +1,8 @@
from .api import get, wait
from .dynamic_resources import set_resource
from .object_spilling import force_spill_objects, force_restore_spilled_objects
from .placement_group import (placement_group, placement_group_table,
remove_placement_group)
__all__ = [
"get", "wait", "set_resource", "force_spill_objects",
"force_restore_spilled_objects", "placement_group",
"placement_group_table", "remove_placement_group"
"set_resource", "force_spill_objects", "force_restore_spilled_objects",
"placement_group", "placement_group_table", "remove_placement_group"
]
-59
View File
@@ -1,59 +0,0 @@
import ray
import numpy as np
def get(object_refs):
"""Get a single or a collection of remote objects from the object store.
This method is identical to `ray.get` except it adds support for tuples,
ndarrays and dictionaries.
Args:
object_refs: Object ref of the object to get, a list, tuple, ndarray of
object refs to get or a dict of {key: object ref}.
Returns:
A Python object, a list of Python objects or a dict of {key: object}.
"""
if isinstance(object_refs, (tuple, np.ndarray)):
return ray.get(list(object_refs))
elif isinstance(object_refs, dict):
keys_to_get = [
k for k, v in object_refs.items() if isinstance(v, ray.ObjectRef)
]
ids_to_get = [
v for k, v in object_refs.items() if isinstance(v, ray.ObjectRef)
]
values = ray.get(ids_to_get)
result = object_refs.copy()
for key, value in zip(keys_to_get, values):
result[key] = value
return result
else:
return ray.get(object_refs)
def wait(object_refs, num_returns=1, timeout=None):
"""Return a list of IDs that are ready and a list of IDs that are not.
This method is identical to `ray.wait` except it adds support for tuples
and ndarrays.
Args:
object_refs (List[ObjectRef], Tuple(ObjectRef), np.array(ObjectRef)):
List like of object refs for objects that may or may not be ready.
Note that these IDs must be unique.
num_returns (int): The number of object refs that should be returned.
timeout (float): The maximum amount of time in seconds to wait before
returning.
Returns:
A list of object refs that are ready and a list of the remaining object
IDs.
"""
if isinstance(object_refs, (tuple, np.ndarray)):
return ray.wait(
list(object_refs), num_returns=num_returns, timeout=timeout)
return ray.wait(object_refs, num_returns=num_returns, timeout=timeout)
-4
View File
@@ -18,10 +18,6 @@ logger = logging.getLogger(__name__)
class ImportThread:
"""A thread used to import exports from the driver or other workers.
Note: The driver also has an import thread, which is used only to import
custom class definitions from calls to _register_custom_serializer that
happen under the hood on workers.
Attributes:
worker: the worker object in this process.
mode: worker mode
-5
View File
@@ -1,5 +0,0 @@
from ray.projects.projects import ProjectDefinition
__all__ = [
"ProjectDefinition",
]
-41
View File
@@ -1,41 +0,0 @@
Ray Projects
============
To run these example projects, we first have to make sure the full
repository is checked out into the project directory.
Open Tacotron
-------------
```shell
cd open-tacotron
# Check out the original repository
git init
git remote add origin https://github.com/keithito/tacotron.git
git fetch
git checkout -t origin/master
# Serve the model
ray session start serve
# Terminate the session
ray session stop
```
PyTorch Transformers
--------------------
```shell
cd python-transformers
# Check out the original repository
git init
git remote add origin https://github.com/huggingface/pytorch-transformers.git
git fetch
git checkout -t origin/master
# Now we can start the training
ray session start train --dataset SST-2
# Terminate the session
ray session stop
```
@@ -1,18 +0,0 @@
# This file is generated by `ray project create`
# A unique identifier for the head node and workers of this cluster.
cluster_name: open-tacotron
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
max_workers: 1
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
@@ -1,18 +0,0 @@
# This file is generated by `ray project create`
name: open-tacotron
description: "A TensorFlow implementation of Google's Tacotron speech synthesis with pre-trained model (unofficial)"
repo: https://github.com/keithito/tacotron
cluster:
config: ray-project/cluster.yaml
environment:
requirements: ray-project/requirements.txt
shell:
- curl http://data.keithito.com/data/speech/tacotron-20180906.tar.gz | tar xzC /tmp
commands:
- name: serve
command: python demo_server.py --checkpoint /tmp/tacotron-20180906/model.ckpt
@@ -1,11 +0,0 @@
# Adapted from https://github.com/keithito/tacotron/blob/master/requirements.txt
# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
# depends on your platform. It is assumed you have already installed tensorflow.
falcon==1.2.0
inflect==0.2.5
librosa==0.5.1
matplotlib==2.0.2
numpy==1.14.3
scipy==0.19.0
tqdm==4.11.2
Unidecode==0.4.20
@@ -1,18 +0,0 @@
# This file is generated by `ray project create`
# An unique identifier for the head node and workers of this cluster.
cluster_name: pytorch-transformers
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers default to 0.
max_workers: 1
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
@@ -1,35 +0,0 @@
# This file is generated by `ray project create`
name: pytorch-transformers
description: "A library of state-of-the-art pretrained models for Natural Language Processing (NLP)"
repo: https://github.com/huggingface/pytorch-transformers
cluster:
config: ray-project/cluster.yaml
environment:
requirements: ray-project/requirements.txt
commands:
- name: train
command: |
wget https://raw.githubusercontent.com/ray-project/project-data/master/download_glue_data.py && \
python download_glue_data.py -d /tmp -t {{dataset}} && \
python ./examples/run_glue.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--task_name {{dataset}} \
--do_train \
--do_eval \
--do_lower_case \
--data_dir /tmp/{{dataset}} \
--max_seq_length 128 \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/output/
params:
- name: "dataset"
help: "The GLUE dataset to fine-tune on"
choices: ["CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI", "QNLI", "RTE", "WNLI"]
@@ -1,17 +0,0 @@
# Adapted from https://github.com/huggingface/pytorch-transformers/blob/master/requirements.txt
# PyTorch
torch>=1.0.0
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
# For OpenAI GPT
regex
# For XLNet
sentencepiece
# TensorBoard visualization
tensorboardX
# Pytorch transformers
pytorch_transformers
-213
View File
@@ -1,213 +0,0 @@
import argparse
import copy
import json
import jsonschema
import os
import yaml
def make_argument_parser(name, params, wildcards):
"""Build argument parser dynamically to parse parameter arguments.
Args:
name (str): Name of the command to parse.
params (dict): Parameter specification used to construct
the argparse parser.
wildcards (bool): Whether wildcards are allowed as arguments.
Returns:
The argparse parser.
A dictionary from argument name to list of valid choices.
"""
parser = argparse.ArgumentParser(prog=name)
# For argparse arguments that have a 'choices' list associated
# with them, save it in the following dictionary.
choices = {}
for param in params:
# Construct arguments to pass into argparse's parser.add_argument.
argparse_kwargs = copy.deepcopy(param)
name = argparse_kwargs.pop("name")
if wildcards and "choices" in param:
choices[name] = param["choices"]
argparse_kwargs["choices"] = param["choices"] + ["*"]
if "type" in param:
types = {"int": int, "str": str, "float": float}
if param["type"] in types:
argparse_kwargs["type"] = types[param["type"]]
else:
raise ValueError(
"Parameter {} has type {} which is not supported. "
"Type must be one of {}".format(name, param["type"],
list(types.keys())))
parser.add_argument("--" + name, dest=name, **argparse_kwargs)
return parser, choices
class ProjectDefinition:
def __init__(self, current_dir):
"""Finds ray-project folder for current project, parse and validates it.
Args:
current_dir (str): Path from which to search for ray-project.
Raises:
jsonschema.exceptions.ValidationError: This exception is raised
if the project file is not valid.
ValueError: This exception is raised if there are other errors in
the project definition (e.g. files not existing).
"""
root = find_root(current_dir)
if root is None:
raise ValueError("No project root found")
# Add an empty pathname to the end so that rsync will copy the project
# directory to the correct target.
self.root = os.path.join(root, "")
# Parse the project YAML.
project_file = os.path.join(self.root, "ray-project", "project.yaml")
if not os.path.exists(project_file):
raise ValueError("Project file {} not found".format(project_file))
with open(project_file) as f:
self.config = yaml.safe_load(f)
check_project_config(self.root, self.config)
def cluster_yaml(self):
"""Return the project's cluster configuration filename."""
return self.config["cluster"]["config"]
def working_directory(self):
"""Return the project's working directory on a cluster session."""
# Add an empty pathname to the end so that rsync will copy the project
# directory to the correct target.
directory = os.path.join("~", self.config["name"], "")
return directory
def get_command_info(self, command_name, args, shell, wildcards=False):
"""Get the shell command, parsed arguments and config for a command.
Args:
command_name (str): Name of the command to run. The command
definition should be available in project.yaml.
args (tuple): Tuple containing arguments to format the command
with.
wildcards (bool): If True, enable wildcards as arguments.
Returns:
The raw shell command to run with placeholders for the arguments.
The parsed argument dictonary, parsed with argparse.
The config dictionary of the command.
Raises:
ValueError: This exception is raised if the given command is not
found in project.yaml.
"""
if shell or not command_name:
return command_name, {}, {}
command_to_run = None
params = None
config = None
for command_definition in self.config["commands"]:
if command_definition["name"] == command_name:
command_to_run = command_definition["command"]
params = command_definition.get("params", [])
config = command_definition.get("config", {})
if not command_to_run:
raise ValueError(
"Cannot find the command named '{}' in commmands section "
"of the project file.".format(command_name))
parser, choices = make_argument_parser(command_name, params, wildcards)
parsed_args = vars(parser.parse_args(list(args)))
if wildcards:
for key, val in parsed_args.items():
if val == "*":
parsed_args[key] = choices[key]
return command_to_run, parsed_args, config
def git_repo(self):
return self.config.get("repo", None)
def find_root(directory):
"""Find root directory of the ray project.
Args:
directory (str): Directory to start the search in.
Returns:
Path of the parent directory containing the ray-project or
None if no such project is found.
"""
prev, directory = None, os.path.abspath(directory)
while prev != directory:
if os.path.isdir(os.path.join(directory, "ray-project")):
return directory
prev, directory = directory, os.path.abspath(
os.path.join(directory, os.pardir))
return None
def validate_project_schema(project_config):
"""Validate a project config against the official ray project schema.
Args:
project_config (dict): Parsed project yaml.
Raises:
jsonschema.exceptions.ValidationError: This exception is raised
if the project file is not valid.
"""
dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(dir, "schema.json")) as f:
schema = json.load(f)
jsonschema.validate(instance=project_config, schema=schema)
def check_project_config(project_root, project_config):
"""Checks if the project definition is valid.
Args:
project_root (str): Path containing the ray-project
project_config (dict): Project config definition
Raises:
jsonschema.exceptions.ValidationError: This exception is raised
if the project file is not valid.
ValueError: This exception is raised if there are other errors in
the project definition (e.g. files not existing).
"""
validate_project_schema(project_config)
# Make sure the cluster yaml file exists
cluster_file = os.path.join(project_root,
project_config["cluster"]["config"])
if not os.path.exists(cluster_file):
raise ValueError("'cluster' file does not exist "
"in {}".format(project_root))
if "environment" in project_config:
env = project_config["environment"]
if sum(["dockerfile" in env, "dockerimage" in env]) > 1:
raise ValueError("Cannot specify both 'dockerfile' and "
"'dockerimage' in environment.")
if "requirements" in env:
requirements_file = os.path.join(project_root, env["requirements"])
if not os.path.exists(requirements_file):
raise ValueError("'requirements' file in 'environment' does "
"not exist in {}".format(project_root))
if "dockerfile" in env:
docker_file = os.path.join(project_root, env["dockerfile"])
if not os.path.exists(docker_file):
raise ValueError("'dockerfile' file in 'environment' does "
"not exist in {}".format(project_root))
-183
View File
@@ -1,183 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"name": {
"description": "The name of the project",
"type": "string"
},
"description": {
"description": "A short description of the project",
"type": "string"
},
"repo": {
"description": "The URL of the repo this project is part of",
"type": "string"
},
"documentation": {
"description": "Link to the documentation of this project",
"type": "string"
},
"tags": {
"description": "Relevant tags for this project",
"type": "array",
"items": {
"type": "string"
}
},
"cluster": {
"type": "object",
"properties": {
"config": {
"type": "string",
"description": "Path to a .yaml cluster configuration file (relative to the project root)"
},
"params": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"help": {
"type": "string"
},
"choices": {
"type": "array"
},
"default": {
},
"type": {
"type": "string",
"enum": [
"int",
"float",
"str"
]
}
},
"required": [
"name"
],
"additionalProperties": false
}
}
},
"required": [
"config"
],
"additionalProperties": false
},
"environment": {
"description": "The environment that needs to be set up to run the project",
"type": "object",
"properties": {
"dockerimage": {
"description": "URL to a docker image that can be pulled to run the project in",
"type": "string"
},
"dockerfile": {
"description": "Path to a Dockerfile to set up an image the project can run in (relative to the project root)",
"type": "string"
},
"requirements": {
"description": "Path to a Python requirements.txt file to set up project dependencies (relative to the project root)",
"type": "string"
},
"shell": {
"description": "A sequence of shell commands to run to set up the project environment",
"type": "array",
"items": {
"type": "string"
}
}
},
"additionalProperties": false
},
"commands": {
"type": "array",
"items": {
"description": "Possible commands to run to start a session",
"type": "object",
"properties": {
"name": {
"description": "Name of the command",
"type": "string"
},
"help": {
"description": "Help string for the command",
"type": "string"
},
"command": {
"description": "Shell command to run on the cluster",
"type": "string"
},
"params": {
"type": "array",
"items": {
"description": "Possible parameters in the command",
"type": "object",
"properties": {
"name": {
"description": "Name of the parameter",
"type": "string"
},
"help": {
"description": "Help string for the parameter",
"type": "string"
},
"choices": {
"description": "Possible values the parameter can take",
"type": "array"
},
"default": {
},
"type": {
"description": "Required type for the parameter",
"type": "string",
"enum": [
"int",
"float",
"str"
]
}
},
"required": [
"name"
],
"additionalProperties": false
}
},
"config": {
"description": "Configuration options for the command",
"type": "object",
"properties": {
"tmux": {
"description": "If true, the command will be run inside of tmux",
"type": "boolean"
}
},
"additionalProperties": false
}
},
"required": [
"name",
"command"
],
"additionalProperties": false
}
},
"output_files": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"name",
"cluster"
],
"additionalProperties": false
}
-445
View File
@@ -1,445 +0,0 @@
import argparse
import click
import copy
import jsonschema
import logging
import os
from shutil import copyfile
import subprocess
import sys
import time
import ray
from ray.autoscaler.commands import (
attach_cluster,
exec_cluster,
create_or_update_cluster,
rsync,
teardown_cluster,
)
logging.basicConfig(format=ray.ray_constants.LOGGER_FORMAT, level=logging.INFO)
logger = logging.getLogger(__file__)
# File layout for generated project files
# user-dir/
# ray-project/
# project.yaml
# cluster.yaml
# requirements.txt
PROJECT_DIR = "ray-project"
PROJECT_YAML = os.path.join(PROJECT_DIR, "project.yaml")
CLUSTER_YAML = os.path.join(PROJECT_DIR, "cluster.yaml")
REQUIREMENTS_TXT = os.path.join(PROJECT_DIR, "requirements.txt")
# File layout for templates file
# RAY/.../projects/
# templates/
# cluster_template.yaml
# project_template.yaml
# requirements.txt
_THIS_FILE_DIR = os.path.split(os.path.abspath(__file__))[0]
_TEMPLATE_DIR = os.path.join(_THIS_FILE_DIR, "templates")
PROJECT_TEMPLATE = os.path.join(_TEMPLATE_DIR, "project_template.yaml")
CLUSTER_TEMPLATE = os.path.join(_TEMPLATE_DIR, "cluster_template.yaml")
REQUIREMENTS_TXT_TEMPLATE = os.path.join(_TEMPLATE_DIR, "requirements.txt")
@click.group(
"project", help="[Experimental] Commands working with ray project")
def project_cli():
pass
@project_cli.command(help="Validate current project spec")
@click.option(
"--verbose", help="If set, print the validated file", is_flag=True)
def validate(verbose):
try:
project = ray.projects.ProjectDefinition(os.getcwd())
print("Project files validated!", file=sys.stderr)
if verbose:
print(project.config)
except (jsonschema.exceptions.ValidationError, ValueError) as e:
print("Validation failed for the following reason", file=sys.stderr)
raise click.ClickException(e)
@project_cli.command(help="Create a new project within current directory")
@click.argument("project_name")
@click.option(
"--cluster-yaml",
help="Path to autoscaler yaml. Created by default",
default=None)
@click.option(
"--requirements",
help="Path to requirements.txt. Created by default",
default=None)
def create(project_name, cluster_yaml, requirements):
if os.path.exists(PROJECT_DIR):
raise click.ClickException(
"Project directory {} already exists.".format(PROJECT_DIR))
os.makedirs(PROJECT_DIR)
if cluster_yaml is None:
logger.warning("Using default autoscaler yaml")
with open(CLUSTER_TEMPLATE) as f:
template = f.read().replace(r"{{name}}", project_name)
with open(CLUSTER_YAML, "w") as f:
f.write(template)
cluster_yaml = CLUSTER_YAML
if requirements is None:
logger.warning("Using default requirements.txt")
# no templating required, just copy the file
copyfile(REQUIREMENTS_TXT_TEMPLATE, REQUIREMENTS_TXT)
requirements = REQUIREMENTS_TXT
repo = None
if os.path.exists(".git"):
try:
repo = subprocess.check_output(
"git remote get-url origin".split(" ")).strip()
logger.info("Setting repo URL to %s", repo)
except subprocess.CalledProcessError:
pass
with open(PROJECT_TEMPLATE) as f:
project_template = f.read()
# NOTE(simon):
# We could use jinja2, which will make the templating part easier.
project_template = project_template.replace(r"{{name}}", project_name)
project_template = project_template.replace(r"{{cluster}}",
cluster_yaml)
project_template = project_template.replace(r"{{requirements}}",
requirements)
if repo is None:
project_template = project_template.replace(
r"{{repo_string}}", "# repo: {}".format("..."))
else:
project_template = project_template.replace(
r"{{repo_string}}", "repo: {}".format(repo))
with open(PROJECT_YAML, "w") as f:
f.write(project_template)
@click.group(
"session",
help="[Experimental] Commands working with sessions, which are "
"running instances of a project.")
def session_cli():
pass
def load_project_or_throw():
# Validate the project file
try:
return ray.projects.ProjectDefinition(os.getcwd())
except (jsonschema.exceptions.ValidationError, ValueError):
raise click.ClickException(
"Project file validation failed. Please run "
"`ray project validate` to inspect the error.")
class SessionRunner:
"""Class for setting up a session and executing commands in it."""
def __init__(self, session_name=None):
"""Initialize session runner and try to parse the command arguments.
Args:
session_name (str): Name of the session.
Raises:
click.ClickException: This exception is raised if any error occurs.
"""
self.project_definition = load_project_or_throw()
self.session_name = session_name
# Check for features we don't support right now
project_environment = self.project_definition.config.get(
"environment", {})
need_docker = ("dockerfile" in project_environment
or "dockerimage" in project_environment)
if need_docker:
raise click.ClickException(
"Docker support in session is currently not implemented.")
def create_cluster(self, no_config_cache):
"""Create a cluster that will run the session."""
create_or_update_cluster(
config_file=self.project_definition.cluster_yaml(),
override_min_workers=None,
override_max_workers=None,
no_restart=False,
restart_only=False,
yes=True,
override_cluster_name=self.session_name,
no_config_cache=no_config_cache,
)
def sync_files(self):
"""Synchronize files with the session."""
rsync(
self.project_definition.cluster_yaml(),
source=self.project_definition.root,
target=self.project_definition.working_directory(),
override_cluster_name=self.session_name,
down=False,
)
def setup_environment(self):
"""Set up the environment of the session."""
project_environment = self.project_definition.config.get(
"environment", {})
if "requirements" in project_environment:
requirements_txt = project_environment["requirements"]
# Create a temporary requirements_txt in the head node.
remote_requirements_txt = os.path.join(
ray.utils.get_user_temp_dir(),
"ray_project_requirements_txt_{}".format(time.time()))
rsync(
self.project_definition.cluster_yaml(),
source=requirements_txt,
target=remote_requirements_txt,
override_cluster_name=self.session_name,
down=False,
)
self.execute_command(
"pip install -r {}".format(remote_requirements_txt))
if "shell" in project_environment:
for cmd in project_environment["shell"]:
self.execute_command(cmd)
def execute_command(self, cmd, config={}):
"""Execute a shell command in the session.
Args:
cmd (str): Shell command to run in the session. It will be
run in the working directory of the project.
"""
cwd = self.project_definition.working_directory()
cmd = "cd {cwd}; {cmd}".format(cwd=cwd, cmd=cmd)
exec_cluster(
config_file=self.project_definition.cluster_yaml(),
cmd=cmd,
run_env=config.get("run_env", "auto"),
screen=False,
tmux=config.get("tmux", False),
stop=False,
start=False,
override_cluster_name=self.session_name,
port_forward=config.get("port_forward", None),
)
def format_command(command, parsed_args):
"""Substitute arguments into command.
Args:
command (str): Shell comand with argument placeholders.
parsed_args (dict): Dictionary that maps from argument names
to their value.
Returns:
Shell command with parameters from parsed_args substituted.
"""
for key, val in parsed_args.items():
command = command.replace("{{" + key + "}}", str(val))
return command
def get_session_runs(name, command, parsed_args):
"""Get a list of sessions to start.
Args:
command (str): Shell command with argument placeholders.
parsed_args (dict): Dictionary that maps from argument names
to their values.
Returns:
List of sessions to start, which are dictionaries with keys:
"name": Name of the session to start,
"command": Command to run after starting the session,
"params": Parameters for this run,
"num_steps": 4 if a command should be run, 3 if not.
"""
if not command:
return [{"name": name, "command": None, "params": {}, "num_steps": 3}]
# Try to find a wildcard argument (i.e. one that has a list of values)
# and give an error if there is more than one (currently unsupported).
wildcard_arg = None
for key, val in parsed_args.items():
if isinstance(val, list):
if not wildcard_arg:
wildcard_arg = key
else:
raise click.ClickException(
"More than one wildcard is not supported at the moment")
if not wildcard_arg:
session_run = {
"name": name,
"command": format_command(command, parsed_args),
"params": parsed_args,
"num_steps": 4
}
return [session_run]
else:
session_runs = []
for val in parsed_args[wildcard_arg]:
parsed_args = copy.deepcopy(parsed_args)
parsed_args[wildcard_arg] = val
session_run = {
"name": "{}-{}-{}".format(name, wildcard_arg, val),
"command": format_command(command, parsed_args),
"params": parsed_args,
"num_steps": 4
}
session_runs.append(session_run)
return session_runs
@session_cli.command(help="Attach to an existing cluster")
@click.option(
"--screen", is_flag=True, default=False, help="Run the command in screen.")
@click.option("--tmux", help="Attach to tmux session", is_flag=True)
def attach(screen, tmux):
project_definition = load_project_or_throw()
attach_cluster(
project_definition.cluster_yaml(),
start=False,
use_screen=screen,
use_tmux=tmux,
override_cluster_name=None,
new=False,
)
@session_cli.command(help="Stop a session based on current project config")
@click.option("--name", help="Name of the session to stop", default=None)
def stop(name):
project_definition = load_project_or_throw()
if not name:
name = project_definition.config["name"]
teardown_cluster(
project_definition.cluster_yaml(),
yes=True,
workers_only=False,
override_cluster_name=name)
@session_cli.command(
name="start",
context_settings=dict(ignore_unknown_options=True, ),
help="Start a session based on current project config")
@click.argument("command", required=False)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
@click.option(
"--shell",
help=(
"If set, run the command as a raw shell command instead of looking up "
"the command in the project config"),
is_flag=True)
@click.option("--name", help="A name to tag the session with.", default=None)
@click.option(
"--no-config-cache",
is_flag=True,
default=False,
help="Disable the local cluster config cache.")
def session_start(command, args, shell, name, no_config_cache):
project_definition = load_project_or_throw()
if not name:
name = project_definition.config["name"]
# Get the actual command to run. This also validates the command,
# which should be done before the cluster is started.
try:
command, parsed_args, config = project_definition.get_command_info(
command, args, shell, wildcards=True)
except ValueError as e:
raise click.ClickException(e)
session_runs = get_session_runs(name, command, parsed_args)
if len(session_runs) > 1 and not config.get("tmux", False):
logging.info("Using wildcards with tmux = False would not create "
"sessions in parallel, so we are overriding it with "
"tmux = True.")
config["tmux"] = True
for run in session_runs:
runner = SessionRunner(session_name=run["name"])
logger.info("[1/{}] Creating cluster".format(run["num_steps"]))
runner.create_cluster(no_config_cache)
logger.info("[2/{}] Syncing the project".format(run["num_steps"]))
runner.sync_files()
logger.info("[3/{}] Setting up environment".format(run["num_steps"]))
runner.setup_environment()
if run["command"]:
# Run the actual command.
logger.info("[4/4] Running command")
runner.execute_command(run["command"], config)
@session_cli.command(
name="commands",
help="Print available commands for sessions of this project.")
def session_commands():
project_definition = load_project_or_throw()
print("Active project: " + project_definition.config["name"])
print()
commands = project_definition.config["commands"]
for command in commands:
print("Command \"{}\":".format(command["name"]))
parser = argparse.ArgumentParser(
command["name"], description=command.get("help"), add_help=False)
params = command.get("params", [])
for param in params:
name = param.pop("name")
if "type" in param:
param.pop("type")
parser.add_argument("--" + name, **param)
help_string = parser.format_help()
# Indent the help message by two spaces and print it.
print("\n".join([" " + line for line in help_string.split("\n")]))
@session_cli.command(
name="execute",
context_settings=dict(ignore_unknown_options=True, ),
help="Execute a command in a session")
@click.argument("command", required=False)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
@click.option(
"--shell",
help=(
"If set, run the command as a raw shell command instead of looking up "
"the command in the project config"),
is_flag=True)
@click.option(
"--name", help="Name of the session to run this command on", default=None)
def session_execute(command, args, shell, name):
project_definition = load_project_or_throw()
try:
command, parsed_args, config = project_definition.get_command_info(
command, args, shell, wildcards=False)
except ValueError as e:
raise click.ClickException(e)
runner = SessionRunner(session_name=name)
command = format_command(command, parsed_args)
runner.execute_command(command)
@@ -1,18 +0,0 @@
# This file is generated by `ray project create`.
# A unique identifier for the head node and workers of this cluster.
cluster_name: {{name}}
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
max_workers: 1
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
@@ -1,22 +0,0 @@
# This file is generated by `ray project create`.
name: {{name}}
# description: A short description of the project.
# The URL of the repo this project is part of.
{{repo_string}}
cluster:
config: {{cluster}}
environment:
# dockerfile: The dockerfile to be built and ran the commands with.
# dockerimage: The docker image to be used to run the project in, e.g. ubuntu:18.04.
requirements: {{requirements}}
shell: # Shell commands to be ran for environment setup.
- echo "Setting up the environment"
commands:
- name: default
command: echo "Starting ray job"
@@ -1 +0,0 @@
ray[debug]
-3
View File
@@ -145,7 +145,6 @@ class RemoteFunction:
args=None,
kwargs=None,
num_return_vals=None,
is_direct_call=None,
num_cpus=None,
num_gpus=None,
memory=None,
@@ -185,8 +184,6 @@ class RemoteFunction:
if num_return_vals is None:
num_return_vals = self._num_return_vals
if is_direct_call is not None and not is_direct_call:
raise ValueError("Non-direct call tasks are no longer supported.")
if max_retries is None:
max_retries = self._max_retries
+4 -38
View File
@@ -20,7 +20,6 @@ from ray.autoscaler.commands import (
debug_status, RUN_ENV_TYPES)
import ray.ray_constants as ray_constants
import ray.utils
from ray.projects.scripts import project_cli, session_cli
from ray.autoscaler.cli_logger import cli_logger
import colorful as cf
@@ -1009,7 +1008,7 @@ def down(cluster_config_file, yes, workers_only, cluster_name,
keep_min_workers)
@cli.command()
@cli.command(hidden=True)
@click.argument("cluster_config_file", required=True, type=str)
@click.option(
"--yes",
@@ -1454,36 +1453,6 @@ def timeline(address):
"You can open this with chrome://tracing in the Chrome browser.")
@cli.command()
@click.option(
"--address",
required=False,
type=str,
help="Override the address to connect to.")
def statistics(address):
"""Get the current metrics protobuf from a Ray cluster (developer tool)."""
if not address:
address = services.find_redis_address_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
import grpc
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
for raylet in ray.nodes():
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
ray.nodes()[0]["NodeManagerPort"])
logger.info(f"Querying raylet {raylet_address}")
channel = grpc.insecure_channel(raylet_address)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
reply = stub.GetNodeStats(
node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
timeout=2.0)
print(reply)
@cli.command()
@click.option(
"--address",
@@ -1520,13 +1489,13 @@ def status(address):
print(debug_status())
@cli.command()
@cli.command(hidden=True)
@click.option(
"--address",
required=False,
type=str,
help="Override the address to connect to.")
def globalgc(address):
def global_gc(address):
"""Trigger Python garbage collection on all cluster workers."""
if not address:
address = services.find_redis_address_or_die()
@@ -1614,13 +1583,10 @@ add_command_alias(get_head_ip, name="get_head_ip", hidden=True)
cli.add_command(get_worker_ips)
cli.add_command(microbenchmark)
cli.add_command(stack)
cli.add_command(statistics)
cli.add_command(status)
cli.add_command(memory)
cli.add_command(globalgc)
cli.add_command(global_gc)
cli.add_command(timeline)
cli.add_command(project_cli)
cli.add_command(session_cli)
cli.add_command(install_nightly)
try:
@@ -77,7 +77,7 @@ async def trial(actors, session, data_size):
async def main():
ray.init(log_to_driver=False)
ray.init(_log_to_driver=False)
serve.init()
serve.create_backend("backend", backend)
+5 -22
View File
@@ -256,35 +256,18 @@ def remaining_processes_alive():
return ray.worker._global_node.remaining_processes_alive()
def validate_redis_address(address, redis_address):
"""Validates redis address parameter and splits it into host/ip components.
We temporarily support both 'address' and 'redis_address', so both are
handled here.
def validate_redis_address(address):
"""Validates address parameter.
Returns:
redis_address: string containing the full <host:port> address.
redis_ip: string representing the host portion of the address.
redis_port: integer representing the port portion of the address.
Raises:
ValueError: if both address and redis_address were specified or the
address was malformed.
"""
if redis_address == "auto":
raise ValueError("auto address resolution not supported for "
"redis_address parameter. Please use address.")
if address:
if redis_address:
raise ValueError(
"Both address and redis_address specified. Use only address.")
if address == "auto":
address = find_redis_address_or_die()
redis_address = address
redis_address = address_to_ip(redis_address)
if address == "auto":
address = find_redis_address_or_die()
redis_address = address_to_ip(address)
redis_address_parts = redis_address.split(":")
if len(redis_address_parts) != 2:
+3 -3
View File
@@ -844,7 +844,7 @@ state = GlobalState()
def jobs():
"""Get a list of the jobs in the cluster.
"""Get a list of the jobs in the cluster (for debugging only).
Returns:
Information from the job table, namely a list of dicts with keys:
@@ -858,7 +858,7 @@ def jobs():
def nodes():
"""Get a list of the nodes in the cluster.
"""Get a list of the nodes in the cluster (for debugging only).
Returns:
Information about the Ray clients in the cluster.
@@ -899,7 +899,7 @@ def node_ids():
def actors(actor_id=None):
"""Fetch and parse the actor info for one or more actor IDs.
"""Fetch actor info for one or more actor IDs (for debugging only).
Args:
actor_id: A hex string of the actor ID to fetch information about. If
-1
View File
@@ -88,7 +88,6 @@ py_test_module_list(
"test_multi_tenancy.py",
"test_node_manager.py",
"test_numba.py",
"test_projects.py",
"test_ray_init.py",
"test_serialization.py",
"test_tempfile.py",
+1 -1
View File
@@ -30,7 +30,7 @@ def ray_init_with_task_retry_delay():
@pytest.mark.parametrize(
"ray_start_regular", [{
"object_store_memory": 150 * 1024 * 1024,
"lru_evict": True,
"_lru_evict": True,
}],
indirect=True)
def test_actor_eviction(ray_start_regular):
+2 -2
View File
@@ -95,10 +95,10 @@ def test_actor_gpus(ray_start_cluster):
@ray.remote(num_gpus=1)
class Actor1:
def __init__(self):
self.gpu_ids = ray.get_gpu_ids(as_str=True)
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
assert ray.get_gpu_ids(as_str=True) == self.gpu_ids
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
-21
View File
@@ -49,27 +49,6 @@ def test_internal_free(shutdown_only):
ray.get(big_id)
def test_wait_iterables(ray_start_regular):
@ray.remote
def f(delay):
time.sleep(delay)
return 1
object_refs = (f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5))
ready_ids, remaining_ids = ray.experimental.wait(object_refs)
assert len(ready_ids) == 1
assert len(remaining_ids) == 3
object_refs = np.array(
[f.remote(1.0),
f.remote(0.5),
f.remote(0.5),
f.remote(0.5)])
ready_ids, remaining_ids = ray.experimental.wait(object_refs)
assert len(ready_ids) == 1
assert len(remaining_ids) == 3
def test_multiple_waits_and_gets(shutdown_only):
# It is important to use three workers here, so that the three tasks
# launched in this experiment can run at the same time.
-19
View File
@@ -633,25 +633,6 @@ def save_gpu_ids_shutdown_only():
del os.environ["CUDA_VISIBLE_DEVICES"]
@pytest.mark.parametrize("as_str", [False, True])
def test_gpu_ids_as_str(save_gpu_ids_shutdown_only, as_str):
allowed_gpu_ids = [4, 5, 6]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
str(i) for i in allowed_gpu_ids)
ray.init()
@ray.remote
def get_gpu_ids(as_str):
gpu_ids = ray.get_gpu_ids(as_str)
for gpu_id in gpu_ids:
if as_str:
assert isinstance(gpu_id, str)
else:
assert isinstance(gpu_id, int)
ray.get([get_gpu_ids.remote(as_str) for _ in range(10)])
def test_specific_gpus(save_gpu_ids_shutdown_only):
allowed_gpu_ids = [4, 5, 6]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
+2 -43
View File
@@ -208,7 +208,7 @@ class CaptureOutputAndError:
def test_logging_to_driver(shutdown_only):
ray.init(num_cpus=1, log_to_driver=True)
ray.init(num_cpus=1, _log_to_driver=True)
@ray.remote
def f():
@@ -233,7 +233,7 @@ def test_logging_to_driver(shutdown_only):
def test_not_logging_to_driver(shutdown_only):
ray.init(num_cpus=1, log_to_driver=False)
ray.init(num_cpus=1, _log_to_driver=False)
@ray.remote
def f():
@@ -272,23 +272,6 @@ def test_workers(shutdown_only):
worker_ids = set(ray.get([f.remote() for _ in range(10)]))
def test_specific_job_id():
dummy_driver_id = ray.JobID.from_int(1)
ray.init(num_cpus=1, job_id=dummy_driver_id)
# in driver
assert dummy_driver_id == ray.worker.global_worker.current_job_id
# in worker
@ray.remote
def f():
return ray.worker.global_worker.current_job_id
assert dummy_driver_id == ray.get(f.remote())
ray.shutdown()
def test_object_ref_properties():
id_bytes = b"00112233445566778899"
object_ref = ray.ObjectRef(id_bytes)
@@ -397,23 +380,6 @@ def test_ray_stack(ray_start_2_cpus):
"'ray stack'")
def test_socket_dir_not_existing(shutdown_only):
if sys.platform != "win32":
random_name = ray.ObjectRef.from_random().hex()
temp_raylet_socket_dir = os.path.join(ray.utils.get_ray_temp_dir(),
"tests", random_name)
temp_raylet_socket_name = os.path.join(temp_raylet_socket_dir,
"raylet_socket")
ray.init(num_cpus=2, raylet_socket_name=temp_raylet_socket_name)
@ray.remote
def foo(x):
time.sleep(1)
return 2 * x
ray.get([foo.remote(i) for i in range(2)])
def test_raylet_is_robust_to_random_messages(ray_start_regular):
node_manager_address = None
node_manager_port = None
@@ -465,13 +431,6 @@ def test_put_pins_object(ray_start_object_store_memory):
assert not ray.worker.global_worker.core_worker.object_exists(
ray.ObjectRef(x_binary))
# weakref put
y_id = ray.put(obj, weakref=True)
for _ in range(10):
ray.put(np.zeros(10 * 1024 * 1024))
with pytest.raises(ray.exceptions.UnreconstructableError):
ray.get(y_id)
def test_decorated_function(ray_start_regular):
def function_invocation_decorator(f):
-30
View File
@@ -456,36 +456,6 @@ def test_putting_object_that_closes_over_object_ref(
ray.put(f)
def test_custom_serializers(ray_start_shared_local_modes):
class Foo:
def __init__(self):
self.x = 3
def custom_serializer(obj):
return 3, "string1", type(obj).__name__
def custom_deserializer(serialized_obj):
return serialized_obj, "string2"
ray.register_custom_serializer(
Foo, serializer=custom_serializer, deserializer=custom_deserializer)
assert ray.get(ray.put(Foo())) == ((3, "string1", Foo.__name__), "string2")
class Bar:
def __init__(self):
self.x = 3
ray.register_custom_serializer(
Bar, serializer=custom_serializer, deserializer=custom_deserializer)
@ray.remote
def f():
return Bar()
assert ray.get(f.remote()) == ((3, "string1", Bar.__name__), "string2")
def test_keyword_args(ray_start_shared_local_modes):
@ray.remote
def keyword_fct1(a, b="hello"):
-19
View File
@@ -366,25 +366,6 @@ def test_get_multiple(ray_start_regular_shared):
assert results == indices
def test_get_multiple_experimental(ray_start_regular_shared):
object_refs = [ray.put(i) for i in range(10)]
object_refs_tuple = tuple(object_refs)
assert ray.experimental.get(object_refs_tuple) == list(range(10))
object_refs_nparray = np.array(object_refs)
assert ray.experimental.get(object_refs_nparray) == list(range(10))
def test_get_dict(ray_start_regular_shared):
d = {str(i): ray.put(i) for i in range(5)}
for i in range(5, 10):
d[str(i)] = i
result = ray.experimental.get(d)
expected = {str(i): i for i in range(10)}
assert result == expected
def test_get_with_timeout(ray_start_regular_shared):
signal = ray.test_utils.SignalActor.remote()
+17 -17
View File
@@ -33,7 +33,7 @@ def test_cancel_chain(ray_start_regular, use_force):
obj4 = wait_for.remote([obj3])
assert len(ray.wait([obj1], timeout=.1)[0]) == 0
ray.cancel(obj1, use_force)
ray.cancel(obj1, force=use_force)
for ob in [obj1, obj2, obj3, obj4]:
with pytest.raises(valid_exceptions(use_force)):
ray.get(ob)
@@ -45,7 +45,7 @@ def test_cancel_chain(ray_start_regular, use_force):
obj4 = wait_for.remote([obj3])
assert len(ray.wait([obj3], timeout=.1)[0]) == 0
ray.cancel(obj3, use_force)
ray.cancel(obj3, force=use_force)
for ob in [obj3, obj4]:
with pytest.raises(valid_exceptions(use_force)):
ray.get(ob)
@@ -74,7 +74,7 @@ def test_cancel_multiple_dependents(ray_start_regular, use_force):
deps.append(wait_for.remote([head]))
assert len(ray.wait([head], timeout=.1)[0]) == 0
ray.cancel(head, use_force)
ray.cancel(head, force=use_force)
for d in deps:
with pytest.raises(valid_exceptions(use_force)):
ray.get(d)
@@ -86,7 +86,7 @@ def test_cancel_multiple_dependents(ray_start_regular, use_force):
deps2.append(wait_for.remote([head]))
for d in deps2:
ray.cancel(d, use_force)
ray.cancel(d, force=use_force)
for d in deps2:
with pytest.raises(valid_exceptions(use_force)):
@@ -111,11 +111,11 @@ def test_single_cpu_cancel(shutdown_only, use_force):
indep = wait_for.remote([signaler.wait.remote()])
assert len(ray.wait([obj3], timeout=.1)[0]) == 0
ray.cancel(obj3, use_force)
ray.cancel(obj3, force=use_force)
with pytest.raises(valid_exceptions(use_force)):
ray.get(obj3)
ray.cancel(obj1, use_force)
ray.cancel(obj1, force=use_force)
for d in [obj1, obj2]:
with pytest.raises(valid_exceptions(use_force)):
@@ -145,12 +145,12 @@ def test_comprehensive(ray_start_regular, use_force):
assert len(ray.wait([a, b, a2, combo], timeout=1)[0]) == 0
ray.cancel(a, use_force)
ray.cancel(a, force=use_force)
with pytest.raises(valid_exceptions(use_force)):
ray.get(a, 10)
ray.get(a, timeout=10)
with pytest.raises(valid_exceptions(use_force)):
ray.get(a2, 10)
ray.get(a2, timeout=10)
signaler.send.remote()
@@ -177,10 +177,10 @@ def test_stress(shutdown_only, use_force):
cancelled = set()
for t in tasks:
if random.random() > 0.5:
ray.cancel(t, use_force)
ray.cancel(t, force=use_force)
cancelled.add(t)
ray.cancel(first, use_force)
ray.cancel(first, force=use_force)
cancelled.add(first)
for done in cancelled:
@@ -188,7 +188,7 @@ def test_stress(shutdown_only, use_force):
ray.get(done)
for indx, t in enumerate(tasks):
if sleep_or_no[indx]:
ray.cancel(t, use_force)
ray.cancel(t, force=use_force)
cancelled.add(t)
if t in cancelled:
with pytest.raises(valid_exceptions(use_force)):
@@ -209,7 +209,7 @@ def test_fast(shutdown_only, use_force):
ids = list()
for _ in range(100):
x = fast.remote("a")
ray.cancel(x, use_force)
ray.cancel(x, force=use_force)
ids.append(x)
@ray.remote
@@ -223,7 +223,7 @@ def test_fast(shutdown_only, use_force):
for idx in range(100, 5100):
if random.random() > 0.95:
ray.cancel(ids[idx], use_force)
ray.cancel(ids[idx], force=use_force)
signaler.send.remote()
for obj_ref in ids:
try:
@@ -250,12 +250,12 @@ def test_remote_cancel(ray_start_regular, use_force):
inner = ray.get(outer)[0]
with pytest.raises(RayTimeoutError):
ray.get(inner, 1)
ray.get(inner, timeout=1)
ray.cancel(inner, use_force)
ray.cancel(inner, force=use_force)
with pytest.raises(valid_exceptions(use_force)):
ray.get(inner, 10)
ray.get(inner, timeout=10)
if __name__ == "__main__":
+2 -2
View File
@@ -6,7 +6,7 @@ import ray.test_utils
def test_cross_language_raise_kwargs(shutdown_only):
ray.init(load_code_from_local=True, include_java=True)
ray.init(_load_code_from_local=True, _include_java=True)
with pytest.raises(Exception, match="kwargs"):
ray.java_function("a", "b").remote(x="arg1")
@@ -16,7 +16,7 @@ def test_cross_language_raise_kwargs(shutdown_only):
def test_cross_language_raise_exception(shutdown_only):
ray.init(load_code_from_local=True, include_java=True)
ray.init(_load_code_from_local=True, _include_java=True)
class PythonObject(object):
pass
@@ -23,7 +23,7 @@ def test_errors_before_initializing_ray():
lambda: ray.get_actor("name"),
ray.get_gpu_ids,
ray.get_resource_ids,
ray.get_webui_url,
ray.get_dashboard_url,
ray.jobs,
lambda: ray.kill(None), # Not valid API usage.
ray.nodes,
+1 -1
View File
@@ -1000,7 +1000,7 @@ def test_fill_object_store_lru_fallback(shutdown_only):
ray.init(
num_cpus=2,
object_store_memory=10**8,
lru_evict=True,
_lru_evict=True,
_system_config=config)
@ray.remote
+5 -4
View File
@@ -47,8 +47,9 @@ class TestMemoryLimits(unittest.TestCase):
def testTooLargeAllocation(self):
try:
ray.init(num_cpus=1, driver_object_store_memory=100 * MB)
ray.put(np.zeros(50 * MB, dtype=np.uint8), weakref=True)
ray.init(num_cpus=1, _driver_object_store_memory=100 * MB)
ray.worker.global_worker.put_object(
np.zeros(50 * MB, dtype=np.uint8), pin_object=False)
self.assertRaises(
OBJECT_TOO_LARGE,
lambda: ray.put(np.zeros(200 * MB, dtype=np.uint8)))
@@ -61,9 +62,9 @@ class TestMemoryLimits(unittest.TestCase):
ray.init(
num_cpus=1,
object_store_memory=300 * MB,
driver_object_store_memory=driver_quota)
_driver_object_store_memory=driver_quota)
obj = np.ones(200 * 1024, dtype=np.uint8)
z = ray.put(obj, weakref=True)
z = ray.worker.global_worker.put_object(obj, pin_object=False)
a = LightActor._remote(object_store_memory=a_quota)
b = GreedyActor._remote(object_store_memory=b_quota)
for _ in range(5):
+1 -1
View File
@@ -34,7 +34,7 @@ def train_oom(config, reporter):
class TestMemoryScheduling(unittest.TestCase):
def testMemoryRequest(self):
try:
ray.init(num_cpus=1, memory=200 * MB)
ray.init(num_cpus=1, _memory=200 * MB)
# fits first 2
a = Actor.remote()
b = Actor.remote()
+4 -4
View File
@@ -50,7 +50,7 @@ def test_worker_stats(shutdown_only):
@ray.remote
def f():
ray.show_in_webui("test")
ray.show_in_dashboard("test")
return os.getpid()
@ray.remote
@@ -59,10 +59,10 @@ def test_worker_stats(shutdown_only):
pass
def f(self):
ray.show_in_webui("test")
ray.show_in_dashboard("test")
return os.getpid()
# Test show_in_webui for remote functions.
# Test show_in_dashboard for remote functions.
worker_pid = ray.get(f.remote())
reply = try_get_node_stats()
target_worker_present = False
@@ -75,7 +75,7 @@ def test_worker_stats(shutdown_only):
assert stats.webui_display[""] == "" # Empty proto
assert target_worker_present
# Test show_in_webui for remote actors.
# Test show_in_dashboard for remote actors.
a = Actor.remote()
worker_pid = ray.get(a.f.remote())
reply = try_get_node_stats()
+1 -1
View File
@@ -448,7 +448,7 @@ def test_calling_start_ray_head(call_ray_stop_only):
["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"],
indirect=True)
def test_using_hostnames(call_ray_start):
ray.init(node_ip_address="localhost", address="localhost:6379")
ray.init(_node_ip_address="localhost", address="localhost:6379")
@ray.remote
def f():
+5 -5
View File
@@ -11,7 +11,7 @@ def test_spill_objects_manually(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
object_store_memory=75 * 1024 * 1024,
object_spilling_config={
_object_spilling_config={
"type": "filesystem",
"params": {
"directory_path": "/tmp"
@@ -58,7 +58,7 @@ def test_spill_objects_manually_from_workers(shutdown_only):
# Limit our object store to 100 MiB of memory.
ray.init(
object_store_memory=100 * 1024 * 1024,
object_spilling_config={
_object_spilling_config={
"type": "filesystem",
"params": {
"directory_path": "/tmp"
@@ -84,7 +84,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
object_store_memory=100 * 1024 * 1024,
object_spilling_config={
_object_spilling_config={
"type": "filesystem",
"params": {
"directory_path": "/tmp"
@@ -111,7 +111,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
"ray_start_cluster_head", [{
"num_cpus": 0,
"object_store_memory": 75 * 1024 * 1024,
"object_spilling_config": {
"_object_spilling_config": {
"type": "filesystem",
"params": {
"directory_path": "/tmp"
@@ -127,7 +127,7 @@ def test_spill_remote_object(ray_start_cluster_head):
cluster = ray_start_cluster_head
cluster.add_node(
object_store_memory=75 * 1024 * 1024,
object_spilling_config={
_object_spilling_config={
"type": "filesystem",
"params": {
"directory_path": "/tmp"
-256
View File
@@ -1,256 +0,0 @@
import jsonschema
import os
import pytest
import subprocess
import yaml
from click.testing import CliRunner
import sys
from unittest.mock import patch, DEFAULT
from contextlib import contextmanager
from ray.projects.scripts import (session_start, session_commands,
session_execute)
from ray.test_utils import check_call_ray
import ray
TEST_DIR = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "project_files")
def load_project_description(project_file):
path = os.path.join(TEST_DIR, project_file)
with open(path) as f:
return yaml.safe_load(f)
def test_validation():
project_dirs = ["docker_project", "requirements_project", "shell_project"]
for project_dir in project_dirs:
project_dir = os.path.join(TEST_DIR, project_dir)
ray.projects.ProjectDefinition(project_dir)
bad_schema_dirs = ["no_project1"]
for project_dir in bad_schema_dirs:
project_dir = os.path.join(TEST_DIR, project_dir)
with pytest.raises(jsonschema.exceptions.ValidationError):
ray.projects.ProjectDefinition(project_dir)
bad_project_dirs = ["no_project2", "noproject3"]
for project_dir in bad_project_dirs:
project_dir = os.path.join(TEST_DIR, project_dir)
with pytest.raises(ValueError):
ray.projects.ProjectDefinition(project_dir)
def test_project_root():
path = os.path.join(TEST_DIR, "project1")
project_definition = ray.projects.ProjectDefinition(path)
assert os.path.normpath(project_definition.root) == os.path.normpath(path)
path2 = os.path.join(TEST_DIR, "project1", "subdir")
project_definition = ray.projects.ProjectDefinition(path2)
assert os.path.normpath(project_definition.root) == os.path.normpath(path)
path3 = ray.utils.get_user_temp_dir() + os.sep
with pytest.raises(ValueError):
project_definition = ray.projects.ProjectDefinition(path3)
def test_project_validation():
with _chdir_and_back(os.path.join(TEST_DIR, "project1")):
check_call_ray(["project", "validate"])
def test_project_no_validation():
with _chdir_and_back(TEST_DIR):
with pytest.raises(subprocess.CalledProcessError):
check_call_ray(["project", "validate"])
@contextmanager
def _chdir_and_back(d):
old_dir = os.getcwd()
try:
os.chdir(d)
yield
finally:
os.chdir(old_dir)
def run_test_project(project_dir, command, args):
# Run the CLI commands with patching
test_dir = os.path.join(TEST_DIR, project_dir)
with _chdir_and_back(test_dir):
runner = CliRunner()
with patch.multiple(
"ray.projects.scripts",
create_or_update_cluster=DEFAULT,
rsync=DEFAULT,
exec_cluster=DEFAULT,
) as mock_calls:
result = runner.invoke(command, args)
return result, mock_calls, test_dir
def test_session_start_default_project():
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "project-pass"), session_start,
["default"])
loaded_project = ray.projects.ProjectDefinition(test_dir)
assert result.exit_code == 0
# Part 1/3: Cluster Launching Call
create_or_update_cluster_call = mock_calls["create_or_update_cluster"]
assert create_or_update_cluster_call.call_count == 1
_, kwargs = create_or_update_cluster_call.call_args
assert kwargs["config_file"] == loaded_project.cluster_yaml()
# Part 2/3: Rsync Calls
rsync_call = mock_calls["rsync"]
# 1 for rsyncing the project directory, 1 for rsyncing the
# requirements.txt.
assert rsync_call.call_count == 2
_, kwargs = rsync_call.call_args
assert kwargs["source"] == loaded_project.config["environment"][
"requirements"]
# Part 3/3: Exec Calls
exec_cluster_call = mock_calls["exec_cluster"]
commands_executed = []
for _, kwargs in exec_cluster_call.call_args_list:
commands_executed.append(kwargs["cmd"].replace(
"cd {}; ".format(loaded_project.working_directory()), ""))
expected_commands = loaded_project.config["environment"]["shell"]
expected_commands += [
command["command"] for command in loaded_project.config["commands"]
]
if "requirements" in loaded_project.config["environment"]:
assert any("pip install -r" for cmd in commands_executed)
# pop the `pip install` off commands executed
commands_executed = [
cmd for cmd in commands_executed if "pip install -r" not in cmd
]
assert expected_commands == commands_executed
def test_session_execute_default_project():
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "project-pass"), session_execute,
["default"])
loaded_project = ray.projects.ProjectDefinition(test_dir)
assert result.exit_code == 0
assert mock_calls["rsync"].call_count == 0
assert mock_calls["create_or_update_cluster"].call_count == 0
exec_cluster_call = mock_calls["exec_cluster"]
commands_executed = []
for _, kwargs in exec_cluster_call.call_args_list:
commands_executed.append(kwargs["cmd"].replace(
"cd {}; ".format(loaded_project.working_directory()), ""))
expected_commands = [
command["command"] for command in loaded_project.config["commands"]
]
assert expected_commands == commands_executed
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "project-pass"), session_execute,
["--shell", "uptime"])
assert result.exit_code == 0
def test_session_start_docker_fail():
result, _, _ = run_test_project(
os.path.join("session-tests", "with-docker-fail"), session_start, [])
assert result.exit_code == 1
assert ("Docker support in session is currently "
"not implemented") in result.output
def test_session_invalid_config_errored():
result, _, _ = run_test_project(
os.path.join("session-tests", "invalid-config-fail"), session_start,
[])
assert result.exit_code == 1
assert "validation failed" in result.output
# check that we are displaying actional error message
assert "ray project validate" in result.output
def test_session_create_command():
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "commands-test"), session_start,
["first", "--a", "1", "--b", "2"])
# Verify the project can be loaded.
ray.projects.ProjectDefinition(test_dir)
assert result.exit_code == 0
exec_cluster_call = mock_calls["exec_cluster"]
found_command = False
for _, kwargs in exec_cluster_call.call_args_list:
if "Starting ray job with 1 and 2" in kwargs["cmd"]:
found_command = True
assert found_command
def test_session_create_multiple():
for args in [{"a": "*", "b": "2"}, {"a": "1", "b": "*"}]:
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "commands-test"), session_start,
["first", "--a", args["a"], "--b", args["b"]])
loaded_project = ray.projects.ProjectDefinition(test_dir)
assert result.exit_code == 0
exec_cluster_call = mock_calls["exec_cluster"]
commands_executed = []
for _, kwargs in exec_cluster_call.call_args_list:
commands_executed.append(kwargs["cmd"].replace(
"cd {}; ".format(loaded_project.working_directory()), ""))
assert commands_executed.count("echo \"Setting up\"") == 2
if args["a"] == "*":
assert commands_executed.count(
"echo \"Starting ray job with 1 and 2\"") == 1
assert commands_executed.count(
"echo \"Starting ray job with 2 and 2\"") == 1
if args["b"] == "*":
assert commands_executed.count(
"echo \"Starting ray job with 1 and 1\"") == 1
assert commands_executed.count(
"echo \"Starting ray job with 1 and 2\"") == 1
# Using multiple wildcards shouldn't work
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "commands-test"), session_start,
["first", "--a", "*", "--b", "*"])
assert result.exit_code == 1
def test_session_commands():
result, mock_calls, test_dir = run_test_project(
os.path.join("session-tests", "commands-test"), session_commands, [])
assert "This is the first parameter" in result.output
assert "This is the second parameter" in result.output
assert 'Command "first"' in result.output
assert 'Command "second"' in result.output
if __name__ == "__main__":
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+1 -15
View File
@@ -23,7 +23,7 @@ class TestRedisPassword:
def f():
return 1
info = ray.init(redis_password=password)
info = ray.init(_redis_password=password)
address = info["redis_address"]
redis_ip, redis_port = address.split(":")
@@ -58,20 +58,6 @@ class TestRedisPassword:
object_ref = f.remote()
ray.get(object_ref)
def test_redis_port(self, shutdown_only):
@ray.remote
def f():
return 1
info = ray.init(redis_port=1234, redis_password="testpassword")
address = info["redis_address"]
redis_ip, redis_port = address.split(":")
assert redis_port == "1234"
redis_client = redis.StrictRedis(
host=redis_ip, port=redis_port, password="testpassword")
assert redis_client.ping()
if __name__ == "__main__":
import pytest
-24
View File
@@ -315,30 +315,6 @@ def test_numpy_serialization(ray_start_regular):
assert len(buffers) == 1
def test_numpy_subclass_serialization(ray_start_regular):
class MyNumpyConstant(np.ndarray):
def __init__(self, value):
super().__init__()
self.constant = value
def __str__(self):
print(self.constant)
constant = MyNumpyConstant(123)
def explode(x):
raise RuntimeError("Expected error.")
ray.register_custom_serializer(
type(constant), serializer=explode, deserializer=explode)
try:
ray.put(constant)
assert False, "Should never get here!"
except (RuntimeError, IndexError):
print("Correct behavior, proof that customer serializer was used.")
def test_numpy_subclass_serialization_pickle(ray_start_regular):
class MyNumpyConstant(np.ndarray):
def __init__(self, value):
+5 -10
View File
@@ -1,25 +1,20 @@
import numpy as np
import os
import pytest
import ray
@pytest.fixture(params=[1, 4])
@pytest.fixture(params=[1])
def ray_start_sharded(request):
num_redis_shards = request.param
if os.environ.get("RAY_USE_NEW_GCS") == "on":
num_redis_shards = 1
# For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
# 1-node chain for that shard only.
# TODO(ekl) enable this again once GCS supports sharding.
# num_redis_shards = request.param
# Start the Ray processes.
ray.init(
object_store_memory=int(0.5 * 10**9),
num_cpus=10,
num_redis_shards=num_redis_shards,
redis_max_memory=10**7)
# _num_redis_shards=num_redis_shards,
_redis_max_memory=10**7)
yield None
+3 -76
View File
@@ -5,7 +5,6 @@ import time
import pytest
import ray
from ray.cluster_utils import Cluster
from ray.test_utils import check_call_ray
@@ -24,43 +23,11 @@ def unix_socket_delete(unix_socket):
return os.remove(unix_socket) if unix else None
def test_conn_cluster():
# plasma_store_socket_name
with pytest.raises(Exception) as exc_info:
ray.init(
address="127.0.0.1:6379",
plasma_store_socket_name=os.path.join(
ray.utils.get_user_temp_dir(), "this_should_fail"))
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"plasma_store_socket_name must not be provided.")
# raylet_socket_name
with pytest.raises(Exception) as exc_info:
ray.init(
address="127.0.0.1:6379",
raylet_socket_name=os.path.join(ray.utils.get_user_temp_dir(),
"this_should_fail"))
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"raylet_socket_name must not be provided.")
# temp_dir
with pytest.raises(Exception) as exc_info:
ray.init(
address="127.0.0.1:6379",
temp_dir=os.path.join(ray.utils.get_user_temp_dir(),
"this_should_fail"))
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"temp_dir must not be provided.")
def test_tempdir(shutdown_only):
shutil.rmtree(ray.utils.get_ray_temp_dir(), ignore_errors=True)
ray.init(
temp_dir=os.path.join(ray.utils.get_user_temp_dir(),
"i_am_a_temp_dir"))
_temp_dir=os.path.join(ray.utils.get_user_temp_dir(),
"i_am_a_temp_dir"))
assert os.path.exists(
os.path.join(ray.utils.get_user_temp_dir(),
"i_am_a_temp_dir")), "Specified temp dir not found."
@@ -94,47 +61,7 @@ def test_tempdir_long_path():
maxlen = 104 if sys.platform.startswith("darwin") else 108
temp_dir = os.path.join(ray.utils.get_user_temp_dir(), "z" * maxlen)
with pytest.raises(OSError):
ray.init(temp_dir=temp_dir) # path should be too long
def test_raylet_socket_name(shutdown_only):
sock1 = unix_socket_create_path("i_am_a_temp_socket_1")
ray.init(raylet_socket_name=sock1)
unix_socket_verify(sock1)
ray.shutdown()
try:
unix_socket_delete(sock1)
except OSError:
pass # It could have been removed by Ray.
cluster = Cluster(True)
sock2 = unix_socket_create_path("i_am_a_temp_socket_2")
cluster.add_node(raylet_socket_name=sock2)
unix_socket_verify(sock2)
cluster.shutdown()
try:
unix_socket_delete(sock2)
except OSError:
pass # It could have been removed by Ray.
def test_temp_plasma_store_socket(shutdown_only):
sock1 = unix_socket_create_path("i_am_a_temp_socket_1")
ray.init(plasma_store_socket_name=sock1)
unix_socket_verify(sock1)
ray.shutdown()
try:
unix_socket_delete(sock1)
except OSError:
pass # It could have been removed by Ray.
cluster = Cluster(True)
sock2 = unix_socket_create_path("i_am_a_temp_socket_2")
cluster.add_node(plasma_store_socket_name=sock2)
unix_socket_verify(sock2)
cluster.shutdown()
try:
unix_socket_delete(sock2)
except OSError:
pass # It could have been removed by Ray.
ray.init(_temp_dir=temp_dir) # path should be too long
def test_raylet_tempfiles(shutdown_only):
@@ -9,13 +9,14 @@ class TestUnreconstructableErrors(unittest.TestCase):
ray.init(
num_cpus=1,
object_store_memory=150 * 1024 * 1024,
redis_max_memory=10000000)
_redis_max_memory=10000000)
def tearDown(self):
ray.shutdown()
def testDriverPutEvictedCannotReconstruct(self):
x_id = ray.put(np.zeros(1 * 1024 * 1024), weakref=True)
x_id = ray.worker.global_worker.put_object(
np.zeros(1 * 1024 * 1024), pin_object=False)
ray.get(x_id)
for _ in range(20):
ray.put(np.zeros(10 * 1024 * 1024))
+1 -1
View File
@@ -13,7 +13,7 @@ import ray
def test_get_webui(shutdown_only):
addresses = ray.init(include_dashboard=True, num_cpus=1)
webui_url = addresses["webui_url"]
assert ray.get_webui_url() == webui_url
assert ray.get_dashboard_url() == webui_url
assert re.match(r"^(localhost|\d+\.\d+\.\d+\.\d+):\d+$", webui_url)
@@ -106,7 +106,7 @@ def tune_transformer(num_samples=8,
gpus_per_trial=0,
smoke_test=False,
ray_address=None):
ray.init(ray_address, log_to_driver=False)
ray.init(ray_address, _log_to_driver=False)
data_dir_name = "./data" if not smoke_test else "./test_data"
data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
if not os.path.exists(data_dir):
+3 -3
View File
@@ -396,7 +396,7 @@ class RayTrialExecutor(TrialExecutor):
try:
reset_val = ray.get(
trainable.reset.remote(new_config, trial.logdir),
DEFAULT_GET_TIMEOUT)
timeout=DEFAULT_GET_TIMEOUT)
except RayTimeoutError:
logger.exception("Trial %s: reset timed out.", trial)
return False
@@ -465,7 +465,7 @@ class RayTrialExecutor(TrialExecutor):
raise ValueError("Trial was not running.")
self._running.pop(trial_future[0])
with warn_if_slow("fetch_result"):
result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
# For local mode
if isinstance(result, _LocalWrapper):
@@ -734,7 +734,7 @@ class RayTrialExecutor(TrialExecutor):
with self._change_working_directory(trial):
return ray.get(
trial.runner.export_model.remote(trial.export_formats),
DEFAULT_GET_TIMEOUT)
timeout=DEFAULT_GET_TIMEOUT)
return {}
def has_gpus(self):
+1 -1
View File
@@ -31,7 +31,7 @@ class Capturing:
@pytest.fixture
def start_ray():
ray.init(log_to_driver=False, local_mode=True)
ray.init(_log_to_driver=False, local_mode=True)
_register_all()
yield
ray.shutdown()
+2 -2
View File
@@ -96,9 +96,9 @@ class UtilMonitor(Thread):
def pin_in_object_store(obj):
"""Deprecated, use ray.put(value, weakref=False) instead."""
"""Deprecated, use ray.put(value) instead."""
obj_ref = ray.put(obj, weakref=False)
obj_ref = ray.put(obj)
_pinned_objects.append(obj_ref)
return obj_ref
-2
View File
@@ -2,13 +2,11 @@ from ray.util import iter
from ray.util.actor_pool import ActorPool
from ray.util.debug import log_once, disable_log_once_globally, \
enable_periodic_logging
from ray.util.named_actors import get_actor
__all__ = [
"ActorPool",
"disable_log_once_globally",
"enable_periodic_logging",
"get_actor",
"iter",
"log_once",
]
-27
View File
@@ -1,27 +0,0 @@
import logging
import ray
logger = logging.getLogger(__name__)
def _get_actor(name):
worker = ray.worker.global_worker
handle = worker.core_worker.get_named_actor_handle(name)
return handle
def get_actor(name: str) -> ray.actor.ActorHandle:
"""Get a named actor which was previously created.
If the actor doesn't exist, an exception will be raised.
Args:
name: The name of the named actor.
Returns:
The ActorHandle object corresponding to the name.
"""
logger.warning("ray.util.get_actor has been moved to ray.get_actor and "
"will be removed in the future.")
return _get_actor(name)
@@ -102,7 +102,7 @@ if __name__ == "__main__":
args, _ = parser.parse_known_args()
num_cpus = 4 if args.smoke_test else None
ray.init(address=args.address, num_cpus=num_cpus, log_to_driver=True)
ray.init(address=args.address, num_cpus=num_cpus, _log_to_driver=True)
trainer1 = TorchTrainer(
model_creator=ResNet18,
@@ -98,7 +98,7 @@ if __name__ == "__main__":
"--tune", action="store_true", default=False, help="Tune training")
args, _ = parser.parse_known_args()
ray.init(address=args.address, log_to_driver=True)
ray.init(address=args.address, _log_to_driver=True)
TorchTrainable = TorchTrainer.as_trainable(
model_creator=ResNet18,
+115 -256
View File
@@ -373,7 +373,7 @@ class Worker:
sys.exit(0)
def get_gpu_ids(as_str=False):
def get_gpu_ids():
"""Get the IDs of the GPUs that are available to the worker.
If the CUDA_VISIBLE_DEVICES environment variable was set when the worker
@@ -407,16 +407,6 @@ def get_gpu_ids(as_str=False):
max_gpus = global_worker.node.get_resource_spec().num_gpus
assigned_ids = global_worker.original_gpu_ids[:max_gpus]
if not as_str:
from ray.util.debug import log_once
if log_once("ray.get_gpu_ids.as_str"):
logger.warning(
"ray.get_gpu_ids() will return a list of strings by default"
" in a future version of Ray for compatibility with CUDA. "
"To enable the forward-compatible behavior, use "
"`ray.get_gpu_ids(as_str=True)`.")
assigned_ids = [int(assigned_id) for assigned_id in assigned_ids]
return assigned_ids
@@ -438,13 +428,13 @@ def get_resource_ids():
return global_worker.core_worker.resource_ids()
def get_webui_url():
"""Get the URL to access the web UI.
def get_dashboard_url():
"""Get the URL to access the Ray dashboard.
Note that the URL does not specify which node the web UI is on.
Note that the URL does not specify which node the dashboard is on.
Returns:
The URL of the web UI as a string.
The URL of the dashboard as a string.
"""
worker = global_worker
worker.check_connected()
@@ -477,48 +467,38 @@ def print_failed_task(task_status):
""")
def init(address=None,
redis_address=None,
redis_port=None,
num_cpus=None,
num_gpus=None,
memory=None,
object_store_memory=None,
resources=None,
driver_object_store_memory=None,
redis_max_memory=None,
log_to_driver=True,
node_ip_address=ray_constants.NODE_DEFAULT_IP,
object_ref_seed=None,
local_mode=False,
redirect_worker_output=None,
redirect_output=None,
ignore_reinit_error=False,
num_redis_shards=None,
redis_max_clients=None,
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
plasma_directory=None,
huge_pages=False,
include_java=False,
include_dashboard=None,
dashboard_host=ray_constants.DEFAULT_DASHBOARD_IP,
dashboard_port=ray_constants.DEFAULT_DASHBOARD_PORT,
job_id=None,
job_config=None,
configure_logging=True,
logging_level=logging.INFO,
logging_format=ray_constants.LOGGER_FORMAT,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None,
load_code_from_local=False,
java_worker_options=None,
use_pickle=True,
_system_config=None,
lru_evict=False,
enable_object_reconstruction=False,
_metrics_export_port=None,
object_spilling_config=None):
def init(
address=None,
*,
num_cpus=None,
num_gpus=None,
resources=None,
object_store_memory=None,
local_mode=False,
ignore_reinit_error=False,
include_dashboard=None,
dashboard_host=ray_constants.DEFAULT_DASHBOARD_IP,
dashboard_port=ray_constants.DEFAULT_DASHBOARD_PORT,
job_config=None,
configure_logging=True,
logging_level=logging.INFO,
logging_format=ray_constants.LOGGER_FORMAT,
enable_object_reconstruction=False,
# The following are unstable parameters and their use is discouraged.
_redis_max_memory=None,
_node_ip_address=ray_constants.NODE_DEFAULT_IP,
_driver_object_store_memory=None,
_log_to_driver=True,
_memory=None,
_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
_include_java=False,
_java_worker_options=None,
_temp_dir=None,
_load_code_from_local=False,
_lru_evict=False,
_metrics_export_port=None,
_object_spilling_config=None,
_system_config=None):
"""
Connect to an existing Ray cluster or start one and connect to it.
@@ -551,53 +531,19 @@ def init(address=None,
is running on a node in a Ray cluster, using `auto` as the value
tells the driver to detect the the cluster, removing the need to
specify a specific node address.
redis_address (str): Deprecated; same as address.
redis_port (int): The port that the primary Redis shard should listen
to. If None, then a random port will be chosen.
num_cpus (int): Number of CPUs the user wishes to assign to each
raylet.
raylet. By default, this is set based on virtual cores.
num_gpus (int): Number of GPUs the user wishes to assign to each
raylet.
raylet. By default, this is set based on detected GPUs.
resources: A dictionary mapping the names of custom resources to the
quantities for them available.
memory: The amount of memory (in bytes) that is available for use by
workers requesting memory resources. By default, this is
automatically set based on available system memory.
object_store_memory: The amount of memory (in bytes) to start the
object store with. By default, this is automatically set based on
available system memory, subject to a 20GB cap.
redis_max_memory: The max amount of memory (in bytes) to allow each
redis shard to use. Once the limit is exceeded, redis will start
LRU eviction of entries. This only applies to the sharded redis
tables (task, object, and profile tables). By default, this is
autoset based on available system memory, subject to a 10GB cap.
log_to_driver (bool): If true, the output from all of the worker
processes on all nodes will be directed to the driver.
node_ip_address (str): The IP address of the node that we are on.
object_ref_seed (int): Used to seed the deterministic generation of
object refs. The same value can be used across multiple runs of the
same driver in order to generate the object refs in a consistent
manner. However, the same ID should not be used for different
drivers.
available system memory.
local_mode (bool): If true, the code will be executed serially. This
is useful for debugging.
driver_object_store_memory (int): Limit the amount of memory the driver
can use in the object store for creating objects. By default, this
is autoset based on available system memory, subject to a 20GB cap.
ignore_reinit_error: If true, Ray suppresses errors from calling
ray.init() a second time. Ray won't be restarted.
num_redis_shards: The number of Redis shards to start in addition to
the primary Redis shard.
redis_max_clients: If provided, attempt to configure Redis with this
maxclients number.
redis_password (str): Prevents external clients without the password
from connecting to Redis if provided.
plasma_directory: A directory where the Plasma memory mapped files
will be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
include_java: Boolean flag indicating whether or not to enable java
workers.
include_dashboard: Boolean flag indicating whether or not to start the
Ray dashboard, which displays the status of the Ray
cluster. If this argument is None, then the UI will be started if
@@ -608,7 +554,6 @@ def init(address=None,
external machines.
dashboard_port: The port to bind the dashboard server to. Defaults to
8265.
job_id: The ID of this job.
job_config (ray.job_config.JobConfig): The job configuration.
configure_logging: True (default) if configuration of logging is
allowed here. Otherwise, the user may want to configure it
@@ -619,37 +564,42 @@ def init(address=None,
timestamp, filename, line number, and message. See the source file
ray_constants.py for details. Ignored unless "configure_logging"
is true.
plasma_store_socket_name (str): If provided, specifies the socket
name used by the plasma store.
raylet_socket_name (str): If provided, specifies the socket path
used by the raylet process.
temp_dir (str): If provided, specifies the root temporary
directory for the Ray process. Defaults to an OS-specific
conventional location, e.g., "/tmp/ray".
load_code_from_local: Whether code should be loaded from a local
module or from the GCS.
java_worker_options: Overwrite the options to start Java workers.
use_pickle: Deprecated.
_system_config (dict): Configuration for overriding RayConfig
defaults. Used to set system configuration and for experimental Ray
core feature flags.
lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.UnreconstructableError may be thrown. If False, then
reference counting will be used to decide which objects are safe
to evict and when under memory pressure, ray.ObjectStoreFullError
may be thrown.
enable_object_reconstruction (bool): If True, when an object stored in
the distributed plasma store is lost due to node failure, Ray will
attempt to reconstruct the object by re-executing the task that
created the object. Arguments to the task will be recursively
reconstructed. If False, then ray.UnreconstructableError will be
thrown.
_redis_max_memory: Redis max memory.
_node_ip_address (str): The IP address of the node that we are on.
_driver_object_store_memory (int): Limit the amount of memory the
driver can use in the object store for creating objects.
_log_to_driver (bool): If true, the output from all of the worker
processes on all nodes will be directed to the driver.
_memory: Amount of reservable memory resource to create.
_redis_password (str): Prevents external clients without the password
from connecting to Redis if provided.
_include_java: Boolean flag indicating whether or not to enable java
workers.
_temp_dir (str): If provided, specifies the root temporary
directory for the Ray process. Defaults to an OS-specific
conventional location, e.g., "/tmp/ray".
_load_code_from_local: Whether code should be loaded from a local
module or from the GCS.
_java_worker_options: Overwrite the options to start Java workers.
_lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.UnreconstructableError may be thrown. If False, then
reference counting will be used to decide which objects are safe
to evict and when under memory pressure, ray.ObjectStoreFullError
may be thrown.
_metrics_export_port(int): Port number Ray exposes system metrics
through a Prometheus endpoint. It is currently under active
development, and the API is subject to change.
object_spilling_config (str): The configuration json string for object
_object_spilling_config (str): The configuration json string for object
spilling I/O worker.
_system_config (str): JSON configuration for overriding
RayConfig defaults. For testing purposes ONLY.
Returns:
Address information about the started processes.
@@ -659,15 +609,8 @@ def init(address=None,
arguments is passed in.
"""
if not use_pickle:
raise DeprecationWarning("The use_pickle argument is deprecated.")
if redis_address is not None:
raise DeprecationWarning("The redis_address argument is deprecated. "
"Please use address instead.")
if "RAY_ADDRESS" in os.environ:
if redis_address is None and (address is None or address == "auto"):
if address is None or address == "auto":
address = os.environ["RAY_ADDRESS"]
else:
raise RuntimeError(
@@ -677,9 +620,10 @@ def init(address=None,
"please call ray.init() or ray.init(address=\"auto\") on the "
"driver.")
if redis_address is not None or address is not None:
redis_address, _, _ = services.validate_redis_address(
address, redis_address)
if address:
redis_address, _, _ = services.validate_redis_address(address)
else:
redis_address = None
if configure_logging:
setup_logger(logging_level, logging_format)
@@ -700,12 +644,6 @@ def init(address=None,
"'ignore_reinit_error=True' or by calling "
"'ray.shutdown()' prior to 'ray.init()'.")
# Convert hostnames to numerical IP address.
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
raylet_ip_address = node_ip_address
_system_config = _system_config or {}
if not isinstance(_system_config, dict):
raise TypeError("The _system_config must be a dict.")
@@ -715,39 +653,37 @@ def init(address=None,
# In this case, we need to start a new cluster.
ray_params = ray.parameter.RayParams(
redis_address=redis_address,
redis_port=redis_port,
node_ip_address=node_ip_address,
raylet_ip_address=raylet_ip_address,
object_ref_seed=object_ref_seed,
node_ip_address=None,
raylet_ip_address=None,
object_ref_seed=None,
driver_mode=driver_mode,
redirect_worker_output=redirect_worker_output,
redirect_output=redirect_output,
redirect_worker_output=None,
redirect_output=None,
num_cpus=num_cpus,
num_gpus=num_gpus,
resources=resources,
num_redis_shards=num_redis_shards,
redis_max_clients=redis_max_clients,
redis_password=redis_password,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
include_java=include_java,
num_redis_shards=None,
redis_max_clients=None,
redis_password=_redis_password,
plasma_directory=None,
huge_pages=None,
include_java=_include_java,
include_dashboard=include_dashboard,
dashboard_host=dashboard_host,
dashboard_port=dashboard_port,
memory=memory,
memory=_memory,
object_store_memory=object_store_memory,
redis_max_memory=redis_max_memory,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir,
load_code_from_local=load_code_from_local,
java_worker_options=java_worker_options,
redis_max_memory=_redis_max_memory,
plasma_store_socket_name=None,
temp_dir=_temp_dir,
load_code_from_local=_load_code_from_local,
java_worker_options=_java_worker_options,
start_initial_python_workers_for_first_job=True,
_system_config=_system_config,
lru_evict=lru_evict,
lru_evict=_lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=_metrics_export_port,
object_spilling_config=object_spilling_config)
object_spilling_config=_object_spilling_config)
# Start the Ray processes. We set shutdown_at_exit=False because we
# shutdown the node in the ray.shutdown call that happens in the atexit
# handler. We still spawn a reaper process in case the atexit handler
@@ -766,45 +702,15 @@ def init(address=None,
if resources is not None:
raise ValueError("When connecting to an existing cluster, "
"resources must not be provided.")
if num_redis_shards is not None:
raise ValueError("When connecting to an existing cluster, "
"num_redis_shards must not be provided.")
if redis_max_clients is not None:
raise ValueError("When connecting to an existing cluster, "
"redis_max_clients must not be provided.")
if memory is not None:
raise ValueError("When connecting to an existing cluster, "
"memory must not be provided.")
if object_store_memory is not None:
raise ValueError("When connecting to an existing cluster, "
"object_store_memory must not be provided.")
if redis_max_memory is not None:
raise ValueError("When connecting to an existing cluster, "
"redis_max_memory must not be provided.")
if plasma_directory is not None:
raise ValueError("When connecting to an existing cluster, "
"plasma_directory must not be provided.")
if huge_pages:
raise ValueError("When connecting to an existing cluster, "
"huge_pages must not be provided.")
if temp_dir is not None:
raise ValueError("When connecting to an existing cluster, "
"temp_dir must not be provided.")
if plasma_store_socket_name is not None:
raise ValueError("When connecting to an existing cluster, "
"plasma_store_socket_name must not be provided.")
if raylet_socket_name is not None:
raise ValueError("When connecting to an existing cluster, "
"raylet_socket_name must not be provided.")
if java_worker_options is not None:
raise ValueError("When connecting to an existing cluster, "
"java_worker_options must not be provided.")
if _system_config is not None and len(_system_config) != 0:
raise ValueError("When connecting to an existing cluster, "
"_system_config must not be provided.")
if lru_evict:
if _lru_evict:
raise ValueError("When connecting to an existing cluster, "
"lru_evict must not be provided.")
"_lru_evict must not be provided.")
if enable_object_reconstruction:
raise ValueError(
"When connecting to an existing cluster, "
@@ -812,15 +718,15 @@ def init(address=None,
# In this case, we only need to connect the node.
ray_params = ray.parameter.RayParams(
node_ip_address=node_ip_address,
raylet_ip_address=raylet_ip_address,
node_ip_address=None,
raylet_ip_address=None,
redis_address=redis_address,
redis_password=redis_password,
object_ref_seed=object_ref_seed,
temp_dir=temp_dir,
load_code_from_local=load_code_from_local,
redis_password=_redis_password,
object_ref_seed=None,
temp_dir=_temp_dir,
load_code_from_local=_load_code_from_local,
_system_config=_system_config,
lru_evict=lru_evict,
lru_evict=_lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=_metrics_export_port)
_global_node = ray.node.Node(
@@ -833,10 +739,10 @@ def init(address=None,
connect(
_global_node,
mode=driver_mode,
log_to_driver=log_to_driver,
log_to_driver=_log_to_driver,
worker=global_worker,
driver_object_store_memory=driver_object_store_memory,
job_id=job_id,
driver_object_store_memory=_driver_object_store_memory,
job_id=None,
job_config=job_config)
for hook in _post_init_hooks:
@@ -849,7 +755,7 @@ def init(address=None,
_post_init_hooks = []
def shutdown(exiting_interpreter=False):
def shutdown(_exiting_interpreter=False):
"""Disconnect the worker, and terminate processes started by ray.init().
This will automatically run at the end when a Python process that uses Ray
@@ -863,16 +769,16 @@ def shutdown(exiting_interpreter=False):
will need to reload the module.
Args:
exiting_interpreter (bool): True if this is called by the atexit hook
_exiting_interpreter (bool): True if this is called by the atexit hook
and false otherwise. If we are exiting the interpreter, we will
wait a little while to print any extra error messages.
"""
if exiting_interpreter and global_worker.mode == SCRIPT_MODE:
if _exiting_interpreter and global_worker.mode == SCRIPT_MODE:
# This is a duration to sleep before shutting down everything in order
# to make sure that log messages finish printing.
time.sleep(0.5)
disconnect(exiting_interpreter)
disconnect(_exiting_interpreter)
# We need to destruct the core worker here because after this function,
# we will tear down any processes spawned by ray.init() and the background
@@ -1422,50 +1328,7 @@ def _changeproctitle(title, next_title):
setproctitle.setproctitle(next_title)
def register_custom_serializer(cls,
serializer,
deserializer,
use_pickle=False,
use_dict=False,
class_id=None):
"""Registers custom functions for efficient object serialization.
The serializer and deserializer are used when transferring objects of
`cls` across processes and nodes. This can be significantly faster than
the Ray default fallbacks. Wraps `register_custom_serializer` underneath.
Args:
cls (type): The class that ray should use this custom serializer for.
serializer: The custom serializer that takes in a cls instance and
outputs a serialized representation. use_pickle and use_dict
must be False if provided.
deserializer: The custom deserializer that takes in a serialized
representation of the cls and outputs a cls instance. use_pickle
and use_dict must be False if provided.
use_pickle: Deprecated.
use_dict: Deprecated.
class_id (str): Unique ID of the class. Autogenerated if None.
"""
worker = global_worker
worker.check_connected()
if use_pickle:
raise DeprecationWarning(
"`use_pickle` is no longer a valid parameter and will be removed "
"in future versions of Ray. If this breaks your application, "
"see `SerializationContext.register_custom_serializer`.")
if use_dict:
raise DeprecationWarning(
"`use_pickle` is no longer a valid parameter and will be removed "
"in future versions of Ray. If this breaks your application, "
"see `SerializationContext.register_custom_serializer`.")
assert serializer is not None and deserializer is not None
context = global_worker.get_serialization_context()
context.register_custom_serializer(
cls, serializer, deserializer, class_id=class_id)
def show_in_webui(message, key="", dtype="text"):
def show_in_dashboard(message, key="", dtype="text"):
"""Display message in dashboard.
Display message for the current task or actor in the dashboard.
@@ -1497,7 +1360,7 @@ def show_in_webui(message, key="", dtype="text"):
blocking_get_inside_async_warned = False
def get(object_refs, timeout=None):
def get(object_refs, *, timeout=None):
"""Get a remote object or a list of remote objects from the object store.
This method blocks until the object corresponding to the object ref is
@@ -1570,17 +1433,13 @@ def get(object_refs, timeout=None):
return values
def put(value, weakref=False):
def put(value):
"""Store an object in the object store.
The object may not be evicted while a reference to the returned ID exists.
Args:
value: The Python object to be stored.
weakref: If set, allows the object to be evicted while a reference
to the returned ID exists. You might want to set this if putting
a lot of objects that you might not need in the future.
It allows Ray to more aggressively reclaim memory.
Returns:
The object ref assigned to this value.
@@ -1589,7 +1448,7 @@ def put(value, weakref=False):
worker.check_connected()
with profiling.profile("ray.put"):
try:
object_ref = worker.put_object(value, pin_object=not weakref)
object_ref = worker.put_object(value, pin_object=True)
except ObjectStoreFullError:
logger.info(
"Put failed since the value was either too large or the "
@@ -1602,7 +1461,7 @@ def put(value, weakref=False):
blocking_wait_inside_async_warned = False
def wait(object_refs, num_returns=1, timeout=None):
def wait(object_refs, *, num_returns=1, timeout=None):
"""Return a list of IDs that are ready and a list of IDs that are not.
If timeout is set, the function returns either when the requested number of
@@ -1710,11 +1569,11 @@ def get_actor(name):
"""
worker = global_worker
worker.check_connected()
return ray.util.named_actors._get_actor(name)
handle = worker.core_worker.get_named_actor_handle(name)
return handle
def kill(actor, no_restart=True):
def kill(actor, *, no_restart=True):
"""Kill an actor forcefully.
This will interrupt any running tasks on the actor, causing them to fail
@@ -1740,7 +1599,7 @@ def kill(actor, no_restart=True):
worker.core_worker.kill_actor(actor._ray_actor_id, no_restart)
def cancel(object_ref, force=False):
def cancel(object_ref, *, force=False):
"""Cancels a task according to the following conditions.
If the specified task is pending execution, it will not be executed. If