Update worker.py and services.py to use plasma and the local scheduler. (#19)

* Update worker code and services code to use plasma and the local scheduler.

* Cleanups.

* Fix bug in which threads were started before the worker mode was set. This caused remote functions to be defined on workers before the worker knew it was in WORKER_MODE.

* Fix bug in install-dependencies.sh.

* Lengthen timeout in failure_test.py.

* Cleanups.

* Cleanup services.start_ray_local.

* Clean up random name generation.

* Cleanups.
This commit is contained in:
Robert Nishihara
2016-11-02 00:39:35 -07:00
committed by Philipp Moritz
parent 2068587af8
commit 072f442c1f
20 changed files with 625 additions and 1210 deletions
+5
View File
@@ -69,3 +69,8 @@ script:
- python src/common/test/test.py
- python src/plasma/test/test.py
- python src/photon/test/test.py
- python test/runtest.py
- python test/array_test.py
- python test/failure_test.py
- python test/microbenchmarks.py
-147
View File
@@ -1,147 +0,0 @@
# Top-level build script for Ray's C++ components: the `objstore` and
# `scheduler` executables and the `raylib` Python extension library.
cmake_minimum_required(VERSION 2.8)
project(ray)

# ---------------------------------------------------------------------------
# Locations of vendored third-party code.
# ---------------------------------------------------------------------------
set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")
set(RAY_GRPC_DIR "${THIRDPARTY_DIR}/grpc")
# protoc and the gRPC C++ plugin are taken from the vendored gRPC build tree.
set(RAY_PROTOC "${RAY_GRPC_DIR}/bins/opt/protobuf/protoc")
set(RAY_GRPC_CPP_PLUGIN "${RAY_GRPC_DIR}/bins/opt/grpc_cpp_plugin")

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
set(CMAKE_PREFIX_PATH "${RAY_GRPC_DIR}/bins/opt/" ${CMAKE_PREFIX_PATH})

# ---------------------------------------------------------------------------
# Python interpreter, library and headers.
# ---------------------------------------------------------------------------
if(NOT APPLE)
  find_package(PythonInterp REQUIRED)
  find_package(PythonLibs REQUIRED)
  set(CUSTOM_PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
else()
  # On macOS, ask whichever `python` is first on the PATH where its library
  # and headers live, and only fall back to find_package(PythonLibs) when that
  # custom search does not find both.
  find_program(CUSTOM_PYTHON_EXECUTABLE python)
  message("-- Found Python program: ${CUSTOM_PYTHON_EXECUTABLE}")
  # The probe snippets use print() with a single argument and
  # sys.version_info so that they run under both Python 2 and Python 3 and
  # report two-digit minor versions correctly.
  execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
    "import sys; print('python%d.%d' % sys.version_info[:2])"
    OUTPUT_VARIABLE PYTHON_LIBRARY_NAME OUTPUT_STRIP_TRAILING_WHITESPACE)
  execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
    "import sys; print(sys.exec_prefix)"
    OUTPUT_VARIABLE PYTHON_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
  find_library(PYTHON_LIBRARIES
    NAMES ${PYTHON_LIBRARY_NAME}
    HINTS "${PYTHON_PREFIX}"
    PATH_SUFFIXES "lib" "libs"
    NO_DEFAULT_PATH)
  execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
    "from distutils.sysconfig import get_python_inc; print(get_python_inc())"
    OUTPUT_VARIABLE PYTHON_INCLUDE_DIRS OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(PYTHON_LIBRARIES AND PYTHON_INCLUDE_DIRS)
    set(PYTHONLIBS_FOUND TRUE)
    message("-- Found PythonLibs: " ${PYTHON_LIBRARIES})
    message("-- -- Used custom search path")
  else()
    find_package(PythonLibs REQUIRED)
    message("-- -- Used find_package(PythonLibs)")
  endif()
endif()

find_package(NumPy REQUIRED)
find_package(Boost REQUIRED)

# ---------------------------------------------------------------------------
# Compiler flags and include paths (directory-scoped: shared by all targets).
# ---------------------------------------------------------------------------
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

include_directories("${CMAKE_SOURCE_DIR}/include")
include_directories("${RAY_GRPC_DIR}/include/")
include_directories("${RAY_GRPC_DIR}/third_party/protobuf/src")
include_directories("${PYTHON_INCLUDE_DIRS}")
include_directories("${NUMPY_INCLUDE_DIR}")
include_directories("/usr/local/include")
include_directories("${Boost_INCLUDE_DIRS}")

# ---------------------------------------------------------------------------
# Protocol buffer / gRPC code generation.
# ---------------------------------------------------------------------------
set(PROTO_PATH "${CMAKE_SOURCE_DIR}/protos")
set(GRAPH_PROTO "${PROTO_PATH}/graph.proto")
set(RAY_PROTO "${PROTO_PATH}/ray.proto")
set(TYPES_PROTO "${PROTO_PATH}/types.proto")
# Whether the .proto files import one another is not visible from this file,
# so every generation rule conservatively depends on all of them.
set(RAY_ALL_PROTOS "${GRAPH_PROTO}" "${RAY_PROTO}" "${TYPES_PROTO}")

# Generated sources live in the build tree, never in the source tree.
set(GENERATED_PROTOBUF_PATH "${CMAKE_BINARY_DIR}/generated")
file(MAKE_DIRECTORY ${GENERATED_PROTOBUF_PATH})

set(GRAPH_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/graph.pb.cc")
set(GRAPH_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/graph.pb.h")
set(RAY_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/ray.pb.cc")
set(RAY_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/ray.pb.h")
set(RAY_GRPC_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/ray.grpc.pb.cc")
set(RAY_GRPC_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/ray.grpc.pb.h")
set(TYPES_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/types.pb.cc")
set(TYPES_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/types.pb.h")

# DEPENDS makes the build regenerate the sources whenever a .proto file is
# edited; without it the generated files would silently go stale.
add_custom_command(
  OUTPUT "${GRAPH_PB_H_FILE}"
         "${GRAPH_PB_CPP_FILE}"
  COMMAND "${RAY_PROTOC}"
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${GRAPH_PROTO}"
  DEPENDS ${RAY_ALL_PROTOS}
  VERBATIM
)

# ray.proto is compiled twice: once for the message classes and once, through
# the gRPC plugin, for the service stubs.
add_custom_command(
  OUTPUT "${RAY_PB_H_FILE}"
         "${RAY_PB_CPP_FILE}"
         "${RAY_GRPC_PB_H_FILE}"
         "${RAY_GRPC_PB_CPP_FILE}"
  COMMAND "${RAY_PROTOC}"
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${RAY_PROTO}"
  COMMAND "${RAY_PROTOC}"
  ARGS "--proto_path=${PROTO_PATH}"
       "--grpc_out=${GENERATED_PROTOBUF_PATH}"
       "--plugin=protoc-gen-grpc=${RAY_GRPC_CPP_PLUGIN}"
       "${RAY_PROTO}"
  DEPENDS ${RAY_ALL_PROTOS}
  VERBATIM
)

add_custom_command(
  OUTPUT "${TYPES_PB_H_FILE}"
         "${TYPES_PB_CPP_FILE}"
  COMMAND "${RAY_PROTOC}"
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${TYPES_PROTO}"
  DEPENDS ${RAY_ALL_PROTOS}
  VERBATIM
)

set(GENERATED_PROTOBUF_FILES
  ${GRAPH_PB_H_FILE} ${GRAPH_PB_CPP_FILE}
  ${RAY_PB_H_FILE} ${RAY_PB_CPP_FILE}
  ${RAY_GRPC_PB_H_FILE} ${RAY_GRPC_PB_CPP_FILE}
  ${TYPES_PB_H_FILE} ${TYPES_PB_CPP_FILE})

include_directories(${GENERATED_PROTOBUF_PATH})

# ---------------------------------------------------------------------------
# Libraries linked into every target defined below.
# ---------------------------------------------------------------------------
link_libraries(${RAY_GRPC_DIR}/libs/opt/libgrpc++_unsecure.a
               ${RAY_GRPC_DIR}/libs/opt/libgrpc++.a
               ${RAY_GRPC_DIR}/libs/opt/libgrpc.a
               ${RAY_GRPC_DIR}/libs/opt/protobuf/libprotobuf.a
               ${THIRDPARTY_DIR}/hiredis/libhiredis.a
               pthread)

if(UNIX AND NOT APPLE)
  link_libraries(rt)
endif()

if(APPLE)
  # Build shared libraries with a ".so" suffix on macOS as well; the
  # post-build step below refers to the library as libraylib.so.
  set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
endif()

set(ARROW_LIB ${THIRDPARTY_DIR}/arrow-old/cpp/build/release/libarrow.a)

add_definitions(-fPIC)

# ---------------------------------------------------------------------------
# Targets.
# ---------------------------------------------------------------------------
add_executable(objstore src/objstore.cc src/ipc.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
add_executable(scheduler src/scheduler.cc src/computation_graph.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
add_library(raylib SHARED src/raylib.cc src/worker.cc src/ipc.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
target_link_libraries(raylib ${PYTHON_LIBRARIES})

# Quoted so that an empty or multi-element PYTHON_LIBRARIES value cannot
# change the number of arguments passed to get_filename_component.
get_filename_component(PYTHON_SHARED_LIBRARY "${PYTHON_LIBRARIES}" NAME)
if(APPLE)
  # Rewrite the Python library reference recorded in libraylib.so from the
  # bare file name to the full path of the library that was linked.
  add_custom_command(TARGET raylib
    POST_BUILD COMMAND
    ${CMAKE_INSTALL_NAME_TOOL} -change ${PYTHON_SHARED_LIBRARY} ${PYTHON_LIBRARIES} libraylib.so)
endif()

install(TARGETS objstore scheduler raylib DESTINATION ${CMAKE_SOURCE_DIR}/lib/python/ray)
-5
View File
@@ -1,5 +0,0 @@
# Data for Ray
This folder contains data necessary to run tests, etc. Only very small amounts
of data should be stored here and if a loader for a large dataset is tested, a
miniature version of this dataset should be created.
BIN
View File
Binary file not shown.
+2 -2
View File
@@ -31,14 +31,14 @@ if [[ $platform == "linux" ]]; then
# These commands must be kept in sync with the installation instructions.
sudo apt-get update
sudo apt-get install -y git cmake build-essential autoconf curl libtool python-dev python-numpy python-pip libboost-all-dev unzip graphviz
sudo pip install ipython funcsigs subprocess32 protobuf colorama graphviz
sudo pip install ipython funcsigs subprocess32 protobuf colorama graphviz redis
sudo pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
elif [[ $platform == "macosx" ]]; then
# These commands must be kept in sync with the installation instructions.
brew install git cmake automake autoconf libtool boost graphviz
sudo easy_install pip
sudo pip install ipython --user
sudo pip install numpy funcsigs subprocess32 protobuf colorama graphviz --ignore-installed six
sudo pip install numpy funcsigs subprocess32 protobuf colorama graphviz redis --ignore-installed six
sudo pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
fi
+2 -4
View File
@@ -11,8 +11,6 @@ if hasattr(ctypes, "windll"):
import config
import serialization
from worker import scheduler_info, register_class, visualize_computation_graph, task_info, init, connect, disconnect, get, put, wait, remote, kill_workers, restart_workers_local
from worker import register_class, error_info, init, connect, disconnect, get, put, wait, remote
from worker import Reusable, reusables
from libraylib import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
from libraylib import ObjectID
import internal
from worker import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
+25
View File
@@ -0,0 +1,25 @@
from __future__ import print_function
import sys
import argparse
import numpy as np
import ray
# Command-line interface of the default worker script. Only the node IP address
# and the Redis port are mandatory; the three socket names default to None
# when they are not supplied.
parser = argparse.ArgumentParser(
    description="Parse addresses for the worker to connect to.")
parser.add_argument("--node-ip-address", type=str, required=True,
                    help="the ip address of the worker's node")
parser.add_argument("--redis-port", type=int, required=True,
                    help="the port to use for Redis")
parser.add_argument("--object-store-name", type=str,
                    help="the object store's name")
parser.add_argument("--object-store-manager-name", type=str,
                    help="the object store manager's name")
parser.add_argument("--local-scheduler-name", type=str,
                    help="the local scheduler's name")

if __name__ == "__main__":
  parsed = parser.parse_args()
  # argparse stores "--node-ip-address" as the attribute "node_ip_address" (and
  # likewise for the other flags), so the attribute names double as the keys
  # of the address dictionary handed to ray.worker.connect.
  address_keys = ("node_ip_address",
                  "redis_port",
                  "object_store_name",
                  "object_store_manager_name",
                  "local_scheduler_name")
  address_info = {key: getattr(parsed, key) for key in address_keys}
  # Connect in worker mode and then enter the worker's main loop.
  ray.worker.connect(address_info, ray.WORKER_MODE)
  ray.worker.main_loop()
-34
View File
@@ -1,34 +0,0 @@
# Utilities to deal with computation graphs
import graphviz
def graph_to_graphviz(computation_graph):
"""
Convert the computation graph to graphviz format.
Each entry of computation_graph.operation is drawn as a box-shaped node named
"op<index>", where <index> is the entry's position in that sequence. Object
IDs appear as plain nodes or edge endpoints named by str() of the ID.
Args:
computation_graph [graph_pb2.CompGraph]: protocol buffer description of
the computation graph
Returns:
A graphviz.Digraph, configured with the "pdf" output format, that describes
the computation graph
"""
dot = graphviz.Digraph(format="pdf")
# "op-root" is the node that stands in for the creator of operations whose
# creator_operationid is the sentinel value handled further down.
dot.node("op-root", shape="box")
for (i, op) in enumerate(computation_graph.operation):
if op.HasField("task"):
# Label the task with its index and the last dot-separated component of its
# name, then draw an edge from the task to each of its results.
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + op.task.name.split(".")[-1])
for res in op.task.result:
dot.edge("op" + str(i), str(res))
elif op.HasField("put"):
# A put is drawn with an edge to the object ID it refers to.
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + "put")
dot.edge("op" + str(i), str(op.put.objectid))
elif op.HasField("get"):
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + "get")
# 2 ** 64 - 1 is mapped to the suffix "-root", so the dotted edge below starts
# at the "op-root" node declared above; presumably this sentinel means "no
# creating operation" -- TODO confirm against the CompGraph definition.
creator_operationid = op.creator_operationid if op.creator_operationid != 2 ** 64 - 1 else "-root"
# constraint="false" is passed through to graphviz as an edge attribute.
dot.edge("op" + str(creator_operationid), "op" + str(i), style="dotted", constraint="false")
for arg in op.task.arg:
# Arguments with an empty serialized_arg get a node and an edge keyed by their
# object ID; presumably these are the arguments passed by reference rather
# than by value -- verify against the task protocol buffer.
if len(arg.serialized_arg) == 0:
dot.node(str(arg.objectid))
dot.edge(str(arg.objectid), "op" + str(i))
return dot
View File
+2 -82
View File
@@ -1,89 +1,9 @@
from __future__ import print_function
import numpy as np
import pickling
import libraylib as raylib
import numbuf
def is_argument_serializable(value):
  """Checks if value is a composition of primitive types.

  This will return True if the argument is one of the following:
    - An int (or, on Python 2, a long)
    - A finite float
    - A bool
    - None
    - A list of length at most 100 whose elements are serializable
    - A tuple of length at most 100 whose elements are serializable
    - A dict of length at most 100 whose keys and values are serializable
    - A string of length at most 100.
    - A unicode string of length at most 100.

  Only exact types are accepted (subclasses are rejected), because the value
  is later turned into text with repr and rebuilt by evaluating that text.

  Args:
    value: A Python object.

  Returns:
    True if the object can be serialized as a composition of primitive types and
      False otherwise.
  """
  # "long" and "unicode" only exist on Python 2; fall back to the Python 3
  # equivalents instead of raising a NameError there.
  try:
    integer_types = (int, long)
    text_types = (str, unicode)
  except NameError:
    integer_types = (int,)
    text_types = (str,)
  max_length = 100

  t = type(value)
  if value is None or t is bool or t in integer_types:
    return True
  if t is float:
    # repr() of nan and of the infinities ("nan", "inf", "-inf") is not a valid
    # Python expression, so such values could not be deserialized again.
    is_nan = value != value
    return not is_nan and value not in (float("inf"), float("-inf"))
  if t in text_types:
    return len(value) <= max_length
  if t is list or t is tuple:
    return (len(value) <= max_length and
            all(is_argument_serializable(element) for element in value))
  if t is dict:
    return (len(value) <= max_length and
            all(is_argument_serializable(k) and is_argument_serializable(v)
                for k, v in value.items()))
  return False
def serialize_argument_if_possible(value):
  """Serialize an argument that can be passed by value.

  The textual form produced here is turned back into an object by
  deserialize_argument.

  Returns:
    The serialized value as a string, or None if value is not a composition of
      primitive types (see is_argument_serializable) or if its serialized form
      is longer than 1000 characters.
  """
  if is_argument_serializable(value):
    # For the primitive types accepted above, repr gives the textual form that
    # is used as the serialization.
    text = repr(value)
    # Anything longer than this is considered too big to pass by value.
    if len(text) <= 1000:
      return text
  return None
def deserialize_argument(serialized_value):
  """Rebuild an argument that was passed by value.

  Args:
    serialized_value: The string produced by serialize_argument_if_possible.

  Returns:
    The object obtained by evaluating serialized_value.
  """
  # NOTE(review): eval executes arbitrary Python expressions. This is only safe
  # while serialized_value is guaranteed to come from
  # serialize_argument_if_possible; confirm that no untrusted input can reach
  # this function.
  value = eval(serialized_value)
  return value
def check_serializable(cls):
"""Throws an exception if Ray cannot serialize this class efficiently.
+80 -115
View File
@@ -1,31 +1,29 @@
from __future__ import print_function
import os
import sys
import time
import subprocess32 as subprocess
import subprocess
import string
import random
# Ray modules
import config
_services_env = os.environ.copy()
_services_env["PATH"] = os.pathsep.join([os.path.dirname(os.path.abspath(__file__)), _services_env["PATH"]])
# Make GRPC only print error messages.
_services_env["GRPC_VERBOSITY"] = "ERROR"
# all_processes is a list of the scheduler, object store, and worker processes
# that have been started by this services module if Ray is being used in local
# mode.
all_processes = []
TIMEOUT_SECONDS = 5
def address(host, port):
return host + ":" + str(port)
def new_scheduler_port():
def new_port():
return random.randint(10000, 65535)
def random_name():
return str(random.randint(0, 99999999))
def cleanup():
"""When running in local mode, shutdown the Ray processes.
@@ -36,7 +34,8 @@ def cleanup():
"""
global all_processes
successfully_shut_down = True
for p in all_processes:
# Terminate the processes in reverse order.
for p in all_processes[::-1]:
if p.poll() is not None: # process has already terminated
continue
p.kill()
@@ -49,146 +48,112 @@ def cleanup():
continue
successfully_shut_down = False
if successfully_shut_down:
print "Successfully shut down Ray."
print("Successfully shut down Ray.")
else:
print "Ray did not shut down properly."
print("Ray did not shut down properly.")
all_processes = []
def start_scheduler(scheduler_address, cleanup):
"""This method starts a scheduler process.
Args:
scheduler_address (str): The ip address and port to use for the scheduler.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by services.cleanup() when the Python process
that imported services exits.
"""
scheduler_port = scheduler_address.split(":")[1]
p = subprocess.Popen(["scheduler", scheduler_address, "--log-file-name", config.get_log_file_path("scheduler-" + scheduler_port + ".log")], env=_services_env)
def start_redis(port):
redis_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../common/thirdparty/redis-3.2.3/src/redis-server")
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning"])
if cleanup:
all_processes.append(p)
def start_objstore(scheduler_address, node_ip_address, cleanup):
def start_local_scheduler(redis_address, plasma_store_name):
local_scheduler_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../photon/build/photon_scheduler")
local_scheduler_name = "/tmp/scheduler{}".format(random_name())
p = subprocess.Popen([local_scheduler_filepath, "-s", local_scheduler_name, "-r", redis_address, "-p", plasma_store_name])
if cleanup:
all_processes.append(p)
return local_scheduler_name
def start_objstore(node_ip_address, redis_address, cleanup):
"""This method starts an object store process.
Args:
scheduler_address (str): The ip address and port of the scheduler to connect
to.
node_ip_address (str): The ip address of the node running the object store.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by services.cleanup() when the Python process
that imported services exits.
"""
random_string = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
p = subprocess.Popen(["objstore", scheduler_address, node_ip_address, "--log-file-name", config.get_log_file_path("-".join(["objstore", random_string]) + ".log")], env=_services_env)
if cleanup:
all_processes.append(p)
plasma_store_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../plasma/build/plasma_store")
store_name = "/tmp/ray_plasma_store{}".format(random_name())
p1 = subprocess.Popen([plasma_store_executable, "-s", store_name])
def start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=None, cleanup=True):
plasma_manager_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../plasma/build/plasma_manager")
manager_name = "/tmp/ray_plasma_manager{}".format(random_name())
manager_port = new_port()
p2 = subprocess.Popen([plasma_manager_executable,
"-s", store_name,
"-m", manager_name,
"-h", node_ip_address,
"-p", str(manager_port),
"-r", redis_address])
if cleanup:
all_processes.append(p1)
all_processes.append(p2)
return store_name, manager_name, manager_port
def start_worker(address_info, worker_path, cleanup=True):
"""This method starts a worker process.
Args:
node_ip_address (str): The IP address of the node that the worker runs on.
address_info (dict): This dictionary contains the node_ip_address,
redis_port, object_store_name, object_store_manager_name, and
local_scheduler_name.
worker_path (str): The path of the source code which the worker process will
run.
scheduler_address (str): The ip address and port of the scheduler to connect
to.
objstore_address (Optional[str]): The ip address and port of the object
store to connect to.
cleanup (Optional[bool]): True if using Ray in local mode. If cleanup is
true, then this process will be killed by services.cleanup() when the
Python process that imported services exits. This is True by default.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by services.cleanup() when the Python process
that imported services exits. This is True by default.
"""
command = ["python",
worker_path,
"--node-ip-address=" + node_ip_address,
"--scheduler-address=" + scheduler_address]
if objstore_address is not None:
command.append("--objstore-address=" + objstore_address)
"--node-ip-address=" + address_info["node_ip_address"],
"--object-store-name=" + address_info["object_store_name"],
"--object-store-manager-name=" + address_info["object_store_manager_name"],
"--local-scheduler-name=" + address_info["local_scheduler_name"],
"--redis-port=" + str(address_info["redis_port"])]
p = subprocess.Popen(command)
if cleanup:
all_processes.append(p)
def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None, cleanup=False):
"""Start an object store and associated workers in the cluster setting.
This starts an object store and the associated workers when Ray is being used
in the cluster setting. This assumes the scheduler has already been started.
Args:
scheduler_address (str): IP address and port of the scheduler (which may run
on a different node).
node_ip_address (str): IP address (without port) of the node this function
is run on.
num_workers (int): The number of workers to be started on this node.
worker_path (str): Path of the Python worker script that will be run on the
worker.
cleanup (bool): If cleanup is True, then the processes started by this
command will be killed when the process that imported services exits.
"""
start_objstore(scheduler_address, node_ip_address, cleanup=cleanup)
time.sleep(0.2)
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
for _ in range(num_workers):
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=cleanup)
time.sleep(0.5)
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
"""Start a new set of workers on this node.
Start a new set of workers on this node. This assumes that the scheduler is
already running and that the object store on this node is already running. The
intended use case is that a developer wants to update the code running on the
worker processes so first kills all of the workers and then runs this method.
Args:
scheduler_address (str): ip address and port of the scheduler (which may run
on a different node)
objstore_address (str): ip address and port of the object store (which runs
on the same node)
num_workers (int): the number of workers to be started on this node
worker_path (str): path of the source code that will be run on the worker
"""
node_ip_address = objstore_address.split(":")[0]
for _ in range(num_workers):
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=False)
def start_ray_local(node_ip_address="127.0.0.1", num_objstores=1, num_workers=0, worker_path=None):
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None):
"""Start Ray in local mode.
This method starts Ray in local mode (as opposed to cluster mode, which is
handled by cluster.py).
Args:
num_objstores (int): The number of object stores to start. Aside from
testing, this should be one.
num_workers (int): The number of workers to start.
worker_path (str): The path of the source code that will be run by the
worker.
Returns:
The address of the scheduler and the addresses of all of the object stores.
This returns a dictionary of address information with the keys
"node_ip_address", "redis_port", "object_store_name",
"object_store_manager_name", and "local_scheduler_name".
"""
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
if num_objstores < 1:
raise Exception("`num_objstores` is {}, but should be at least 1.".format(num_objstores))
scheduler_address = address(node_ip_address, new_scheduler_port())
start_scheduler(scheduler_address, cleanup=True)
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "default_worker.py")
# Start Redis.
redis_port = new_port()
redis_address = address(node_ip_address, redis_port)
start_redis(redis_port)
time.sleep(0.1)
# create objstores
for i in range(num_objstores):
start_objstore(scheduler_address, node_ip_address, cleanup=True)
time.sleep(0.2)
if i < num_objstores - 1:
num_workers_to_start = num_workers / num_objstores
else:
# In case num_workers is not divisible by num_objstores, start the correct
# remaining number of workers.
num_workers_to_start = num_workers - (num_objstores - 1) * (num_workers / num_objstores)
for _ in range(num_workers_to_start):
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=True)
time.sleep(0.3)
return scheduler_address
# Start Plasma.
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
# Start the local scheduler.
time.sleep(0.1)
local_scheduler_name = start_local_scheduler(redis_address, object_store_name)
time.sleep(0.2)
# Aggregate the address information together.
address_info = {"node_ip_address": node_ip_address,
"redis_port": redis_port,
"object_store_name": object_store_name,
"object_store_manager_name": object_store_manager_name,
"local_scheduler_name": local_scheduler_name}
# Start the workers.
for _ in range(num_workers):
start_worker(address_info, worker_path, cleanup=True)
time.sleep(0.3)
# Return the addresses of the relevant processes.
return address_info
+432 -454
View File
File diff suppressed because it is too large Load Diff
-16
View File
@@ -1,16 +0,0 @@
import sys
import argparse
import numpy as np
import ray
# Command-line interface of the worker script: the node IP address and the
# scheduler address are mandatory, the object store address is optional.
parser = argparse.ArgumentParser(
    description="Parse addresses for the worker to connect to.")
parser.add_argument("--node-ip-address", type=str, required=True,
                    help="the ip address of the worker's node")
parser.add_argument("--scheduler-address", type=str, required=True,
                    help="the scheduler's address")
parser.add_argument("--objstore-address", type=str,
                    help="the objstore's address")

if __name__ == "__main__":
  parsed = parser.parse_args()
  # NOTE(review): --objstore-address is accepted but not forwarded to connect;
  # confirm that this is intentional.
  ray.worker.connect(parsed.node_ip_address, parsed.scheduler_address)
  # Enter the worker's main loop.
  ray.worker.main_loop()
-19
View File
@@ -1,19 +0,0 @@
# NO shebang! Force the user to run this using the 'source' command without spawning a new shell; otherwise, variable exports won't persist.
echo "Adding Ray to PYTHONPATH" 1>&2

# Absolute path of the directory that contains this script. "&&" ensures that
# pwd only runs when the cd succeeded, so a failed cd cannot make ROOT_DIR
# point at the caller's current directory.
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")" && pwd)

# Prepend Ray's Python package and the numbuf build directory. The previous
# PYTHONPATH (and its separating colon) is only appended when it is non-empty,
# which avoids leaving an empty trailing entry in the path.
export PYTHONPATH="$ROOT_DIR/lib/python/:$ROOT_DIR/thirdparty/numbuf/build${PYTHONPATH:+:$PYTHONPATH}"

# Print instructions for adding Ray to your bashrc.
unamestr="$(uname)"
if [[ "$unamestr" == "Darwin" ]]; then
  BASH_RC="~/.bash_profile"
else
  # Linux, and any other platform, where ~/.bashrc is used as the default so
  # that the printed command always names a file.
  BASH_RC="~/.bashrc"
fi

# The tilde in BASH_RC is deliberately left unexpanded: the text below is a
# command for the user to paste, and their shell expands it at that point.
echo "To permanently add Ray to your Python path, run,

echo 'export PYTHONPATH=$ROOT_DIR/lib/python/:$ROOT_DIR/thirdparty/numbuf/build:\$PYTHONPATH' >> $BASH_RC
"
+1 -1
View File
@@ -169,7 +169,7 @@ void handle_worker_available(scheduler_info *info,
/* Add client_sock to a list of available workers. This struct will be freed
* when a task is assigned to this worker. */
utarray_push_back(state->available_workers, &worker_index);
LOG_INFO("Adding worker_index %d to available workers.\n", worker_index);
LOG_DEBUG("Adding worker_index %d to available workers.\n", worker_index);
}
}
+1 -1
View File
@@ -148,7 +148,7 @@ void new_client_connection(event_loop *loop, int listener_sock, void *context,
local_scheduler_state *s = context;
int new_socket = accept_client(listener_sock);
event_loop_add_file(loop, new_socket, EVENT_LOOP_READ, process_message, s);
LOG_INFO("new connection with fd %d", new_socket);
LOG_DEBUG("new connection with fd %d", new_socket);
/* Add worker to list of workers. */
/* TODO(pcm): Where shall we free this? */
worker_index *new_worker_index = malloc(sizeof(worker_index));
+26 -4
View File
@@ -40,8 +40,12 @@ class PlasmaBuffer(object):
self.plasma_client = plasma_client
def __del__(self):
"""Notify Plasma that the object is no longer needed."""
self.plasma_client.client.plasma_release(self.plasma_client.plasma_conn, self.plasma_id)
"""Notify Plasma that the object is no longer needed.
If the plasma client has been shut down, then don't do anything.
"""
if self.plasma_client.alive:
self.plasma_client.client.plasma_release(self.plasma_client.plasma_conn, self.plasma_id)
def __getitem__(self, index):
"""Read from the PlasmaBuffer as if it were just a regular buffer."""
@@ -73,7 +77,7 @@ class PlasmaClient(object):
store_socket_name (str): Name of the socket the plasma store is listening at.
manager_socket_name (str): Name of the socket the plasma manager is listening at.
"""
self.alive = True
plasma_client_library = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../build/plasma_client.so")
self.client = ctypes.cdll.LoadLibrary(plasma_client_library)
@@ -85,6 +89,7 @@ class PlasmaClient(object):
self.client.plasma_seal.restype = None
self.client.plasma_delete.restype = None
self.client.plasma_subscribe.restype = ctypes.c_int
self.client.plasma_wait.restype = ctypes.c_int
self.buffer_from_memory = ctypes.pythonapi.PyBuffer_FromMemory
self.buffer_from_memory.argtypes = [ctypes.c_void_p, ctypes.c_int64]
@@ -101,6 +106,15 @@ class PlasmaClient(object):
self.has_manager_conn = False
self.plasma_conn = ctypes.c_void_p(self.client.plasma_connect(store_socket_name, None))
def shutdown(self):
"""Shutdown the client so that it does not send messages.
If we kill the Plasma store and Plasma manager that this client is connected
to, then we can use this method to prevent the client from trying to send
messages to the killed processes.
"""
self.alive = False
def create(self, object_id, size, metadata=None):
"""Create a new buffer in the PlasmaStore for a particular object ID.
@@ -233,6 +247,12 @@ class PlasmaClient(object):
"""
if not self.has_manager_conn:
raise Exception("Not connected to the plasma manager socket")
if num_returns < 0:
raise Exception("The argument num_returns cannot be less than one.")
if num_returns > len(object_ids):
raise Exception("The argument num_returns cannot be greater than len(object_ids): num_returns is {}, len(object_ids) is {}.".format(num_returns, len(object_ids)))
if timeout > 2 ** 36:
raise Exception("The method wait currently cannot be used with a timeout greater than 2 ** 36.")
object_id_array = (len(object_ids) * PlasmaID)()
for i, object_id in enumerate(object_ids):
object_id_array[i] = make_plasma_id(object_id)
@@ -240,7 +260,9 @@ class PlasmaClient(object):
num_return_objects = self.client.plasma_wait(self.plasma_conn,
object_id_array._length_,
object_id_array,
timeout, num_returns, return_id_array)
ctypes.c_int64(timeout),
num_returns,
return_id_array)
ready_ids = map(plasma_id_to_str, return_id_array[num_returns-num_return_objects:])
return ready_ids, list(set(object_ids) - set(ready_ids))
+1 -1
View File
@@ -58,7 +58,7 @@ class DistributedArrayTest(unittest.TestCase):
def testMethods(self):
for module in [ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg]:
reload(module)
ray.init(start_ray_local=True, num_objstores=2, num_workers=10)
ray.init(start_ray_local=True, num_workers=10)
x = da.zeros.remote([9, 25, 51], "float")
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
+27 -39
View File
@@ -4,16 +4,24 @@ import time
import test_functions
def wait_for_errors(error_type, num_errors, timeout=10):
  """Poll ray.error_info() until enough errors of one type have been recorded.

  Args:
    error_type: The key to look up in the dictionary returned by
      ray.error_info().
    num_errors: The number of entries under error_type to wait for.
    timeout: The maximum number of seconds to keep polling.

  Returns:
    None. If the timeout expires first, a message is printed; no exception is
      raised, so callers must still check the error info themselves.
  """
  began = time.time()
  while True:
    if not (time.time() - began < timeout):
      print("Timing out of wait.")
      return
    if len(ray.error_info()[error_type]) >= num_errors:
      return
    # Pause briefly between polls.
    time.sleep(0.1)
class FailureTest(unittest.TestCase):
def testUnknownSerialization(self):
reload(test_functions)
ray.init(start_ray_local=True, num_workers=1, driver_mode=ray.SILENT_MODE)
test_functions.test_unknown_type.remote()
time.sleep(0.2)
task_info = ray.task_info()
self.assertEqual(len(task_info["failed_tasks"]), 1)
self.assertEqual(len(task_info["running_tasks"]), 0)
wait_for_errors("TaskError", 1)
error_info = ray.error_info()
self.assertEqual(len(error_info["TaskError"]), 1)
ray.worker.cleanup()
@@ -45,19 +53,11 @@ class TaskStatusTest(unittest.TestCase):
test_functions.throw_exception_fct1.remote()
test_functions.throw_exception_fct1.remote()
for _ in range(100): # Retry if we need to wait longer.
if len(ray.task_info()["failed_tasks"]) >= 2:
break
time.sleep(0.1)
result = ray.task_info()
self.assertEqual(len(result["failed_tasks"]), 2)
task_ids = set()
for task in result["failed_tasks"]:
self.assertTrue(task.has_key("worker_address"))
self.assertTrue(task.has_key("operationid"))
self.assertTrue("Test function 1 intentionally failed." in task.get("error_message"))
self.assertTrue(task["operationid"] not in task_ids)
task_ids.add(task["operationid"])
wait_for_errors("TaskError", 2)
result = ray.error_info()
self.assertEqual(len(result["TaskError"]), 2)
for task in result["TaskError"]:
self.assertTrue("Test function 1 intentionally failed." in task.get("message"))
x = test_functions.throw_exception_fct2.remote()
try:
@@ -96,11 +96,8 @@ class TaskStatusTest(unittest.TestCase):
def __call__(self):
return
ray.remote(Foo())
for _ in range(100): # Retry if we need to wait longer.
if len(ray.task_info()["failed_remote_function_imports"]) >= 1:
break
time.sleep(0.1)
self.assertTrue("There is a problem here." in ray.task_info()["failed_remote_function_imports"][0]["error_message"])
wait_for_errors("RemoteFunctionImportError", 1)
self.assertTrue("There is a problem here." in ray.error_info()["RemoteFunctionImportError"][0]["message"])
ray.worker.cleanup()
@@ -114,12 +111,9 @@ class TaskStatusTest(unittest.TestCase):
raise Exception("The initializer failed.")
return 0
ray.reusables.foo = ray.Reusable(initializer)
for _ in range(100): # Retry if we need to wait longer.
if len(ray.task_info()["failed_reusable_variable_imports"]) >= 1:
break
time.sleep(0.1)
wait_for_errors("ReusableVariableImportError", 1)
# Check that the error message is in the task info.
self.assertTrue("The initializer failed." in ray.task_info()["failed_reusable_variable_imports"][0]["error_message"])
self.assertTrue("The initializer failed." in ray.error_info()["ReusableVariableImportError"][0]["message"])
ray.worker.cleanup()
@@ -135,12 +129,9 @@ class TaskStatusTest(unittest.TestCase):
def use_foo():
ray.reusables.foo
use_foo.remote()
for _ in range(100): # Retry if we need to wait longer.
if len(ray.task_info()["failed_reinitialize_reusable_variables"]) >= 1:
break
time.sleep(0.1)
wait_for_errors("ReusableVariableReinitializeError", 1)
# Check that the error message is in the task info.
self.assertTrue("The reinitializer failed." in ray.task_info()["failed_reinitialize_reusable_variables"][0]["error_message"])
self.assertTrue("The reinitializer failed." in ray.error_info()["ReusableVariableReinitializeError"][0]["message"])
ray.worker.cleanup()
@@ -151,14 +142,11 @@ class TaskStatusTest(unittest.TestCase):
if ray.worker.global_worker.mode == ray.WORKER_MODE:
raise Exception("Function to run failed.")
ray.worker.global_worker.run_function_on_all_workers(f)
for _ in range(100): # Retry if we need to wait longer.
if len(ray.task_info()["failed_function_to_runs"]) >= 2:
break
time.sleep(0.1)
wait_for_errors("FunctionToRunError", 2)
# Check that the error message is in the task info.
self.assertEqual(len(ray.task_info()["failed_function_to_runs"]), 2)
self.assertTrue("Function to run failed." in ray.task_info()["failed_function_to_runs"][0]["error_message"])
self.assertTrue("Function to run failed." in ray.task_info()["failed_function_to_runs"][1]["error_message"])
self.assertEqual(len(ray.error_info()["FunctionToRunError"]), 2)
self.assertTrue("Function to run failed." in ray.error_info()["FunctionToRunError"][0]["message"])
self.assertTrue("Function to run failed." in ray.error_info()["FunctionToRunError"][1]["message"])
ray.worker.cleanup()
+21 -286
View File
@@ -1,3 +1,5 @@
from __future__ import print_function
import unittest
import ray
import numpy as np
@@ -142,64 +144,6 @@ class SerializationTest(unittest.TestCase):
ray.worker.cleanup()
class ObjStoreTest(unittest.TestCase):
# Test setting up object stores, transfering data between them and retrieving data to a client
def testObjStore(self):
node_ip_address = "127.0.0.1"
scheduler_address = ray.services.start_ray_local(num_objstores=2, num_workers=0, worker_path=None)
ray.connect(node_ip_address, scheduler_address, mode=ray.SCRIPT_MODE)
objstore_addresses = [objstore_info["address"] for objstore_info in ray.scheduler_info()["objstores"]]
w1 = ray.worker.Worker()
w2 = ray.worker.Worker()
ray.reusables._cached_reusables = [] # This is a hack to make the test run.
ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[0], mode=ray.SCRIPT_MODE, worker=w1)
ray.reusables._cached_reusables = [] # This is a hack to make the test run.
ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[1], mode=ray.SCRIPT_MODE, worker=w2)
for cls in [Foo, Bar, Baz, Qux, SubQux, Exception, CustomError, Point, NamedTupleExample]:
ray.register_class(cls)
# putting and getting an object shouldn't change it
for data in RAY_TEST_OBJECTS:
objectid = ray.put(data, w1)
result = ray.get(objectid, w1)
assert_equal(result, data)
# putting an object, shipping it to another worker, and getting it shouldn't change it
for data in RAY_TEST_OBJECTS:
objectid = ray.put(data, w1)
result = ray.get(objectid, w2)
assert_equal(result, data)
# putting an object, shipping it to another worker, and getting it shouldn't change it
for data in RAY_TEST_OBJECTS:
objectid = ray.put(data, w2)
result = ray.get(objectid, w1)
assert_equal(result, data)
# This test fails. See https://github.com/ray-project/ray/issues/159.
# getting multiple times shouldn't matter
# for data in [np.zeros([10, 20]), np.random.normal(size=[45, 25]), np.zeros([10, 20], dtype=np.dtype("float64")), np.zeros([10, 20], dtype=np.dtype("float32")), np.zeros([10, 20], dtype=np.dtype("int64")), np.zeros([10, 20], dtype=np.dtype("int32"))]:
# objectid = worker.put(data, w1)
# result = worker.get(objectid, w2)
# result = worker.get(objectid, w2)
# result = worker.get(objectid, w2)
# assert_equal(result, data)
# Getting a buffer after modifying it before it finishes should return updated buffer
objectid = ray.libraylib.get_objectid(w1.handle)
buf = ray.libraylib.allocate_buffer(w1.handle, objectid, 100)
buf[0][0] = 1
ray.libraylib.finish_buffer(w1.handle, objectid, buf[1], 0)
completedbuffer = ray.libraylib.get_buffer(w1.handle, objectid)
self.assertEqual(completedbuffer[0][0], 1)
# We started multiple drivers manually, so we will disconnect them manually.
ray.disconnect(worker=w1)
ray.disconnect(worker=w2)
ray.worker.cleanup()
class WorkerTest(unittest.TestCase):
def testPutGet(self):
@@ -233,29 +177,6 @@ class WorkerTest(unittest.TestCase):
class APITest(unittest.TestCase):
def testPassingArgumentsByValue(self):
ray.init(start_ray_local=True, num_workers=0)
# The types that can be passed by value are defined by
# is_argument_serializable in serialization.py.
class Foo(object):
pass
CAN_PASS_BY_VALUE = [1, 1L, 1.0, True, False, None, [1L, 1.0, True, None],
([1, 2, 3], {False: [1.0, u"hi", ()]}), 100 * ["a"]]
CANNOT_PASS_BY_VALUE = [int, np.int64(0), np.float64(0), Foo(), [Foo()],
(Foo()), {0: Foo()}, [[[int]]], 101 * [1],
np.zeros(10)]
for obj in CAN_PASS_BY_VALUE:
self.assertTrue(ray.serialization.is_argument_serializable(obj))
self.assertEqual(obj, ray.serialization.deserialize_argument(ray.serialization.serialize_argument_if_possible(obj)))
for obj in CANNOT_PASS_BY_VALUE:
self.assertFalse(ray.serialization.is_argument_serializable(obj))
self.assertEqual(None, ray.serialization.serialize_argument_if_possible(obj))
ray.worker.cleanup()
def testRegisterClass(self):
ray.init(start_ray_local=True, num_workers=0)
@@ -328,11 +249,7 @@ class APITest(unittest.TestCase):
reload(test_functions)
ray.init(start_ray_local=True, num_workers=1)
test_functions.no_op.remote()
time.sleep(0.2)
task_info = ray.task_info()
self.assertEqual(len(task_info["failed_tasks"]), 0)
self.assertEqual(len(task_info["running_tasks"]), 0)
ray.get(test_functions.no_op.remote())
ray.worker.cleanup()
@@ -400,22 +317,22 @@ class APITest(unittest.TestCase):
objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
ready_ids, remaining_ids = ray.wait(objectids)
self.assertTrue(len(ready_ids) == 1)
self.assertTrue(len(remaining_ids) == 3)
self.assertEqual(len(ready_ids), 1)
self.assertEqual(len(remaining_ids), 3)
ready_ids, remaining_ids = ray.wait(objectids, num_returns=4)
self.assertEqual(ready_ids, objectids)
self.assertEqual(set(ready_ids), set([object_id.id() for object_id in objectids]))
self.assertEqual(remaining_ids, [])
objectids = [f.remote(0.5), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
start_time = time.time()
ready_ids, remaining_ids = ray.wait(objectids, timeout=1.75, num_returns=4)
self.assertTrue(time.time() - start_time < 2)
ready_ids, remaining_ids = ray.wait(objectids, timeout=1750, num_returns=4)
self.assertLess(time.time() - start_time, 2)
self.assertEqual(len(ready_ids), 3)
self.assertEqual(len(remaining_ids), 1)
ray.wait(objectids)
objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
start_time = time.time()
ready_ids, remaining_ids = ray.wait(objectids, timeout=5)
ready_ids, remaining_ids = ray.wait(objectids, timeout=5000)
self.assertTrue(time.time() - start_time < 5)
self.assertEqual(len(ready_ids), 1)
self.assertEqual(len(remaining_ids), 3)
@@ -504,150 +421,6 @@ class APITest(unittest.TestCase):
ray.worker.cleanup()
def testComputationGraph(self):
ray.init(start_ray_local=True, num_workers=1)
@ray.remote
def f(x):
return x
@ray.remote
def g(x, y):
return x, y
a = f.remote(1)
b = f.remote(1)
c = g.remote(a, b)
c = g.remote(a, 1)
# Make sure that we can produce a computation_graph visualization.
ray.visualize_computation_graph(view=False)
ray.worker.cleanup()
class ReferenceCountingTest(unittest.TestCase):
def testDeallocation(self):
reload(test_functions)
for module in [ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg]:
reload(module)
ray.init(start_ray_local=True, num_workers=1)
def check_not_deallocated(object_ids):
reference_counts = ray.scheduler_info()["reference_counts"]
for object_id in object_ids:
self.assertGreater(reference_counts[object_id.id], 0)
def check_everything_deallocated():
reference_counts = ray.scheduler_info()["reference_counts"]
self.assertEqual(reference_counts, len(reference_counts) * [-1])
z = da.zeros.remote([da.BLOCK_SIZE, 2 * da.BLOCK_SIZE])
time.sleep(0.1)
objectid_val = z.id
time.sleep(0.1)
check_not_deallocated([z])
del z
time.sleep(0.1)
check_everything_deallocated()
x = ra.zeros.remote([10, 10])
y = ra.zeros.remote([10, 10])
z = ra.dot.remote(x, y)
objectid_val = x.id
time.sleep(0.1)
check_not_deallocated([x, y, z])
del x
time.sleep(0.1)
check_not_deallocated([y, z])
del y
time.sleep(0.1)
check_not_deallocated([z])
del z
time.sleep(0.1)
check_everything_deallocated()
z = da.zeros.remote([4 * da.BLOCK_SIZE])
time.sleep(0.1)
check_not_deallocated(ray.get(z).objectids.tolist())
del z
time.sleep(0.1)
check_everything_deallocated()
ray.worker.cleanup()
def testGet(self):
ray.init(start_ray_local=True, num_workers=3)
for cls in [Foo, Bar, Baz, Qux, SubQux, Exception, CustomError, Point, NamedTupleExample]:
ray.register_class(cls)
# Remote objects should be deallocated when the corresponding ObjectID goes
# out of scope, and all results of ray.get called on the ID go out of scope.
for val in RAY_TEST_OBJECTS:
x = ray.put(val)
objectid = x.id
xval = ray.get(x)
del x, xval
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
# Remote objects that do not contain numpy arrays should be deallocated when
# the corresponding ObjectID goes out of scope, even if ray.get has been
# called on the ObjectID.
for val in [True, False, None, 1, 1.0, 1L, "hi", u"hi", [1, 2, 3], (1, 2, 3), [(), {(): ()}]]:
x = ray.put(val)
objectid = x.id
xval = ray.get(x)
del x
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
# Remote objects that contain numpy arrays should not be deallocated when
# the corresponding ObjectID goes out of scope, if ray.get has been called
# on the ObjectID and the result of that call is still in scope.
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
x = ray.put(val)
objectid = x.id
xval = ray.get(x)
del x
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
# Getting an object multiple times should not be a problem. And the remote
# object should not be deallocated until both of the results are out of scope.
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
x = ray.put(val)
objectid = x.id
xval1 = ray.get(x)
xval2 = ray.get(x)
del xval1
# Make sure we can still access xval2.
xval2
del xval2
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
xval3 = ray.get(x)
xval4 = ray.get(x)
xval5 = ray.get(x)
del x
del xval4, xval5
# Make sure we can still access xval3.
xval3
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
del xval3
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
# Getting an object multiple times and assigning it to the same name should
# work. This was a problem in https://github.com/ray-project/ray/issues/159.
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
x = ray.put(val)
objectid = x.id
xval = ray.get(x)
xval = ray.get(x)
xval = ray.get(x)
xval = ray.get(x)
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
del x
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
del xval
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
ray.worker.cleanup()
class PythonModeTest(unittest.TestCase):
def testPythonMode(self):
@@ -712,18 +485,18 @@ class PythonModeTest(unittest.TestCase):
class PythonCExtensionTest(unittest.TestCase):
def testReferenceCountNone(self):
ray.init(start_ray_local=True, num_workers=1)
# Make sure that we aren't accidentally messing up Python's reference counts.
@ray.remote
def f():
return sys.getrefcount(None)
first_count = ray.get(f.remote())
second_count = ray.get(f.remote())
self.assertEqual(first_count, second_count)
ray.worker.cleanup()
# def testReferenceCountNone(self):
# ray.init(start_ray_local=True, num_workers=1)
#
# # Make sure that we aren't accidentally messing up Python's reference counts.
# @ray.remote
# def f():
# return sys.getrefcount(None)
# first_count = ray.get(f.remote())
# second_count = ray.get(f.remote())
# self.assertEqual(first_count, second_count)
#
# ray.worker.cleanup()
def testReferenceCountTrue(self):
ray.init(start_ray_local=True, num_workers=1)
@@ -867,43 +640,5 @@ class ReusablesTest(unittest.TestCase):
ray.worker.cleanup()
class ClusterAttachingTest(unittest.TestCase):
def testAttachingToCluster(self):
node_ip_address = "127.0.0.1"
scheduler_port = np.random.randint(40000, 50000)
scheduler_address = "{}:{}".format(node_ip_address, scheduler_port)
ray.services.start_scheduler(scheduler_address, cleanup=True)
time.sleep(0.1)
ray.services.start_node(scheduler_address, node_ip_address, num_workers=1, cleanup=True)
ray.init(node_ip_address=node_ip_address, scheduler_address=scheduler_address)
@ray.remote
def f(x):
return x + 1
self.assertEqual(ray.get(f.remote(0)), 1)
ray.worker.cleanup()
def testAttachingToClusterWithMultipleObjectStores(self):
node_ip_address = "127.0.0.1"
scheduler_port = np.random.randint(40000, 50000)
scheduler_address = "{}:{}".format(node_ip_address, scheduler_port)
ray.services.start_scheduler(scheduler_address, cleanup=True)
time.sleep(0.1)
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
ray.init(node_ip_address=node_ip_address, scheduler_address=scheduler_address)
@ray.remote
def f(x):
return x + 1
self.assertEqual(ray.get(f.remote(0)), 1)
ray.worker.cleanup()
if __name__ == "__main__":
unittest.main(verbosity=2)