mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
Update worker.py and services.py to use plasma and the local scheduler. (#19)
* Update worker code and services code to use plasma and the local scheduler. * Cleanups. * Fix bug in which threads were started before the worker mode was set. This caused remote functions to be defined on workers before the worker knew it was in WORKER_MODE. * Fix bug in install-dependencies.sh. * Lengthen timeout in failure_test.py. * Cleanups. * Cleanup services.start_ray_local. * Clean up random name generation. * Cleanups.
This commit is contained in:
committed by
Philipp Moritz
parent
2068587af8
commit
072f442c1f
@@ -69,3 +69,8 @@ script:
|
||||
- python src/common/test/test.py
|
||||
- python src/plasma/test/test.py
|
||||
- python src/photon/test/test.py
|
||||
|
||||
- python test/runtest.py
|
||||
- python test/array_test.py
|
||||
- python test/failure_test.py
|
||||
- python test/microbenchmarks.py
|
||||
|
||||
-147
@@ -1,147 +0,0 @@
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(ray)
|
||||
|
||||
set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
|
||||
|
||||
set(CMAKE_PREFIX_PATH "${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/" ${CMAKE_PREFIX_PATH})
|
||||
|
||||
if(NOT APPLE)
|
||||
find_package(PythonInterp REQUIRED)
|
||||
find_package(PythonLibs REQUIRED)
|
||||
set(CUSTOM_PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
|
||||
else()
|
||||
find_program(CUSTOM_PYTHON_EXECUTABLE python)
|
||||
message("-- Found Python program: ${CUSTOM_PYTHON_EXECUTABLE}")
|
||||
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
|
||||
"import sys; print 'python' + sys.version[0:3]"
|
||||
OUTPUT_VARIABLE PYTHON_LIBRARY_NAME OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
|
||||
"import sys; print sys.exec_prefix"
|
||||
OUTPUT_VARIABLE PYTHON_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
FIND_LIBRARY(PYTHON_LIBRARIES
|
||||
NAMES ${PYTHON_LIBRARY_NAME}
|
||||
HINTS "${PYTHON_PREFIX}"
|
||||
PATH_SUFFIXES "lib" "libs"
|
||||
NO_DEFAULT_PATH)
|
||||
execute_process(COMMAND ${CUSTOM_PYTHON_EXECUTABLE} -c
|
||||
"from distutils.sysconfig import *; print get_python_inc()"
|
||||
OUTPUT_VARIABLE PYTHON_INCLUDE_DIRS OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(PYTHON_LIBRARIES AND PYTHON_INCLUDE_DIRS)
|
||||
SET(PYTHONLIBS_FOUND TRUE)
|
||||
message("-- Found PythonLibs: " ${PYTHON_LIBRARIES})
|
||||
message("-- -- Used custom search path")
|
||||
else()
|
||||
find_package(PythonLibs REQUIRED)
|
||||
message("-- -- Used find_package(PythonLibs)")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_package(NumPy REQUIRED)
|
||||
find_package(Boost REQUIRED)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
|
||||
|
||||
include_directories("${CMAKE_SOURCE_DIR}/include")
|
||||
include_directories("${CMAKE_SOURCE_DIR}/thirdparty/grpc/include/")
|
||||
include_directories("${CMAKE_SOURCE_DIR}/thirdparty/grpc/third_party/protobuf/src")
|
||||
include_directories("${PYTHON_INCLUDE_DIRS}")
|
||||
include_directories("${NUMPY_INCLUDE_DIR}")
|
||||
include_directories("/usr/local/include")
|
||||
include_directories("${Boost_INCLUDE_DIRS}")
|
||||
|
||||
set(PROTO_PATH "${CMAKE_SOURCE_DIR}/protos")
|
||||
|
||||
set(GRAPH_PROTO "${PROTO_PATH}/graph.proto")
|
||||
set(RAY_PROTO "${PROTO_PATH}/ray.proto")
|
||||
set(TYPES_PROTO "${PROTO_PATH}/types.proto")
|
||||
set(GENERATED_PROTOBUF_PATH "${CMAKE_BINARY_DIR}/generated")
|
||||
file(MAKE_DIRECTORY ${GENERATED_PROTOBUF_PATH})
|
||||
|
||||
set(GRAPH_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/graph.pb.cc")
|
||||
set(GRAPH_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/graph.pb.h")
|
||||
|
||||
set(RAY_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/ray.pb.cc")
|
||||
set(RAY_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/ray.pb.h")
|
||||
set(RAY_GRPC_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/ray.grpc.pb.cc")
|
||||
set(RAY_GRPC_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/ray.grpc.pb.h")
|
||||
|
||||
set(TYPES_PB_CPP_FILE "${GENERATED_PROTOBUF_PATH}/types.pb.cc")
|
||||
set(TYPES_PB_H_FILE "${GENERATED_PROTOBUF_PATH}/types.pb.h")
|
||||
|
||||
# Generate graph.pb.h / graph.pb.cc from graph.proto with the bundled protoc.
# DEPENDS ties the rule to the .proto input so the generated sources are
# rebuilt whenever the schema changes; without it they go stale after an edit.
# VERBATIM makes argument escaping identical on every platform/generator.
add_custom_command(
  OUTPUT "${GRAPH_PB_H_FILE}"
         "${GRAPH_PB_CPP_FILE}"
  COMMAND ${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/protobuf/protoc
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${GRAPH_PROTO}"
  DEPENDS "${GRAPH_PROTO}"
  VERBATIM
)
|
||||
|
||||
# Generate the protobuf (ray.pb.*) and gRPC (ray.grpc.pb.*) sources from
# ray.proto. protoc is run twice: once with --cpp_out for the message classes
# and once with the grpc_cpp_plugin for the service stubs.
# DEPENDS ties the rule to the .proto input so the generated sources are
# rebuilt whenever the schema changes; without it they go stale after an edit.
# NOTE(review): if ray.proto imports graph.proto or types.proto, list those
# files in DEPENDS as well -- the imports are not visible from this file.
# VERBATIM makes argument escaping identical on every platform/generator.
add_custom_command(
  OUTPUT "${RAY_PB_H_FILE}"
         "${RAY_PB_CPP_FILE}"
         "${RAY_GRPC_PB_H_FILE}"
         "${RAY_GRPC_PB_CPP_FILE}"
  COMMAND ${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/protobuf/protoc
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${RAY_PROTO}"
  COMMAND ${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/protobuf/protoc
  ARGS "--proto_path=${PROTO_PATH}"
       "--grpc_out=${GENERATED_PROTOBUF_PATH}"
       "--plugin=protoc-gen-grpc=${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/grpc_cpp_plugin"
       "${RAY_PROTO}"
  DEPENDS "${RAY_PROTO}"
  VERBATIM
)
|
||||
|
||||
# Generate types.pb.h / types.pb.cc from types.proto with the bundled protoc.
# DEPENDS ties the rule to the .proto input so the generated sources are
# rebuilt whenever the schema changes; without it they go stale after an edit.
# VERBATIM makes argument escaping identical on every platform/generator.
add_custom_command(
  OUTPUT "${TYPES_PB_H_FILE}"
         "${TYPES_PB_CPP_FILE}"
  COMMAND ${CMAKE_SOURCE_DIR}/thirdparty/grpc/bins/opt/protobuf/protoc
  ARGS "--proto_path=${PROTO_PATH}"
       "--cpp_out=${GENERATED_PROTOBUF_PATH}"
       "${TYPES_PROTO}"
  DEPENDS "${TYPES_PROTO}"
  VERBATIM
)
|
||||
|
||||
set(GENERATED_PROTOBUF_FILES
|
||||
${GRAPH_PB_H_FILE} ${GRAPH_PB_CPP_FILE}
|
||||
${RAY_PB_H_FILE} ${RAY_PB_CPP_FILE}
|
||||
${RAY_GRPC_PB_H_FILE} ${RAY_GRPC_PB_CPP_FILE}
|
||||
${TYPES_PB_H_FILE} ${TYPES_PB_CPP_FILE})
|
||||
|
||||
include_directories(${GENERATED_PROTOBUF_PATH})
|
||||
|
||||
link_libraries(${CMAKE_SOURCE_DIR}/thirdparty/grpc/libs/opt/libgrpc++_unsecure.a
|
||||
${CMAKE_SOURCE_DIR}/thirdparty/grpc/libs/opt/libgrpc++.a
|
||||
${CMAKE_SOURCE_DIR}/thirdparty/grpc/libs/opt/libgrpc.a
|
||||
${CMAKE_SOURCE_DIR}/thirdparty/grpc/libs/opt/protobuf/libprotobuf.a
|
||||
${CMAKE_SOURCE_DIR}/thirdparty/hiredis/libhiredis.a
|
||||
pthread)
|
||||
|
||||
if(UNIX AND NOT APPLE)
|
||||
link_libraries(rt)
|
||||
endif()
|
||||
|
||||
if(APPLE)
|
||||
SET(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
|
||||
endif(APPLE)
|
||||
|
||||
set(ARROW_LIB ${CMAKE_SOURCE_DIR}/thirdparty/arrow-old/cpp/build/release/libarrow.a)
|
||||
|
||||
add_definitions(-fPIC)
|
||||
|
||||
add_executable(objstore src/objstore.cc src/ipc.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
|
||||
add_executable(scheduler src/scheduler.cc src/computation_graph.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
|
||||
add_library(raylib SHARED src/raylib.cc src/worker.cc src/ipc.cc src/utils.cc ${GENERATED_PROTOBUF_FILES})
|
||||
target_link_libraries(raylib ${PYTHON_LIBRARIES})
|
||||
|
||||
get_filename_component(PYTHON_SHARED_LIBRARY ${PYTHON_LIBRARIES} NAME)
|
||||
if(APPLE)
|
||||
add_custom_command(TARGET raylib
|
||||
POST_BUILD COMMAND
|
||||
${CMAKE_INSTALL_NAME_TOOL} -change ${PYTHON_SHARED_LIBRARY} ${PYTHON_LIBRARIES} libraylib.so)
|
||||
endif(APPLE)
|
||||
|
||||
install(TARGETS objstore scheduler raylib DESTINATION ${CMAKE_SOURCE_DIR}/lib/python/ray)
|
||||
@@ -1,5 +0,0 @@
|
||||
# Data for Ray
|
||||
|
||||
This folder contains data necessary to run tests, etc. Only very small amounts
|
||||
of data should be stored here and if a loader for a large dataset is tested, a
|
||||
miniature version of this dataset should be created.
|
||||
Binary file not shown.
@@ -31,14 +31,14 @@ if [[ $platform == "linux" ]]; then
|
||||
# These commands must be kept in sync with the installation instructions.
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y git cmake build-essential autoconf curl libtool python-dev python-numpy python-pip libboost-all-dev unzip graphviz
|
||||
sudo pip install ipython funcsigs subprocess32 protobuf colorama graphviz
|
||||
sudo pip install ipython funcsigs subprocess32 protobuf colorama graphviz redis
|
||||
sudo pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
|
||||
elif [[ $platform == "macosx" ]]; then
|
||||
# These commands must be kept in sync with the installation instructions.
|
||||
brew install git cmake automake autoconf libtool boost graphviz
|
||||
sudo easy_install pip
|
||||
sudo pip install ipython --user
|
||||
sudo pip install numpy funcsigs subprocess32 protobuf colorama graphviz --ignore-installed six
|
||||
sudo pip install numpy funcsigs subprocess32 protobuf colorama graphviz redis --ignore-installed six
|
||||
sudo pip install --upgrade git+git://github.com/cloudpipe/cloudpickle.git@0d225a4695f1f65ae1cbb2e0bbc145e10167cce4 # We use the latest version of cloudpickle because it can serialize named tuples.
|
||||
fi
|
||||
|
||||
|
||||
@@ -11,8 +11,6 @@ if hasattr(ctypes, "windll"):
|
||||
|
||||
import config
|
||||
import serialization
|
||||
from worker import scheduler_info, register_class, visualize_computation_graph, task_info, init, connect, disconnect, get, put, wait, remote, kill_workers, restart_workers_local
|
||||
from worker import register_class, error_info, init, connect, disconnect, get, put, wait, remote
|
||||
from worker import Reusable, reusables
|
||||
from libraylib import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
|
||||
from libraylib import ObjectID
|
||||
import internal
|
||||
from worker import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
|
||||
parser = argparse.ArgumentParser(description="Parse addresses for the worker to connect to.")
|
||||
parser.add_argument("--node-ip-address", required=True, type=str, help="the ip address of the worker's node")
|
||||
parser.add_argument("--redis-port", required=True, type=int, help="the port to use for Redis")
|
||||
parser.add_argument("--object-store-name", type=str, help="the object store's name")
|
||||
parser.add_argument("--object-store-manager-name", type=str, help="the object store manager's name")
|
||||
parser.add_argument("--local-scheduler-name", type=str, help="the local scheduler's name")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
address_info = {"node_ip_address": args.node_ip_address,
|
||||
"redis_port": args.redis_port,
|
||||
"object_store_name": args.object_store_name,
|
||||
"object_store_manager_name": args.object_store_manager_name,
|
||||
"local_scheduler_name": args.local_scheduler_name}
|
||||
ray.worker.connect(address_info, ray.WORKER_MODE)
|
||||
|
||||
ray.worker.main_loop()
|
||||
@@ -1,34 +0,0 @@
|
||||
# Utilities to deal with computation graphs
|
||||
|
||||
import graphviz
|
||||
|
||||
def graph_to_graphviz(computation_graph):
|
||||
"""
|
||||
Convert the computation graph to graphviz format.
|
||||
|
||||
Args:
|
||||
computation_graph [graph_pb2.CompGraph]: protocol buffer description of
|
||||
the computation graph
|
||||
|
||||
Returns:
|
||||
Graphviz description of the computation graph
|
||||
"""
|
||||
dot = graphviz.Digraph(format="pdf")
|
||||
dot.node("op-root", shape="box")
|
||||
for (i, op) in enumerate(computation_graph.operation):
|
||||
if op.HasField("task"):
|
||||
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + op.task.name.split(".")[-1])
|
||||
for res in op.task.result:
|
||||
dot.edge("op" + str(i), str(res))
|
||||
elif op.HasField("put"):
|
||||
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + "put")
|
||||
dot.edge("op" + str(i), str(op.put.objectid))
|
||||
elif op.HasField("get"):
|
||||
dot.node("op" + str(i), shape="box", label=str(i) + "\n" + "get")
|
||||
creator_operationid = op.creator_operationid if op.creator_operationid != 2 ** 64 - 1 else "-root"
|
||||
dot.edge("op" + str(creator_operationid), "op" + str(i), style="dotted", constraint="false")
|
||||
for arg in op.task.arg:
|
||||
if len(arg.serialized_arg) == 0:
|
||||
dot.node(str(arg.objectid))
|
||||
dot.edge(str(arg.objectid), "op" + str(i))
|
||||
return dot
|
||||
@@ -1,89 +1,9 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pickling
|
||||
import libraylib as raylib
|
||||
import numbuf
|
||||
|
||||
def is_argument_serializable(value):
  """Report whether value is built purely from small primitive values.

  The following values are accepted:
    - An int, a long, a float, a bool, or None
    - A list or tuple with at most 100 elements, all of them serializable
    - A dict with at most 100 entries whose keys and values are serializable
    - A str or unicode string of length at most 100

  The type checks are exact, so instances of subclasses of these types are
  rejected.

  Args:
    value: A Python object.

  Returns:
    True if the object can be serialized as a composition of primitive types
      and False otherwise.
  """
  kind = type(value)
  # Scalars and None are always acceptable.
  if kind is int or kind is float or kind is long or kind is bool or value is None:
    return True
  # Sequences: bounded length, every element must itself be serializable.
  if kind is list or kind is tuple:
    if len(value) > 100:
      return False
    return all(is_argument_serializable(item) for item in value)
  # Mappings: bounded size, both keys and values must be serializable.
  if kind is dict:
    if len(value) > 100:
      return False
    return all(is_argument_serializable(key) and is_argument_serializable(val)
               for key, val in value.iteritems())
  # Strings are only bounded by length.
  if kind is str or kind is unicode:
    return len(value) <= 100
  return False
|
||||
|
||||
def serialize_argument_if_possible(value):
  """Try to produce the pass-by-value form of an argument.

  The returned string is the repr of value and is meant to be decoded again by
  deserialize_argument.

  Returns:
    The repr string of value, or None when value is not a composition of
      primitive types (see is_argument_serializable) or when its repr is longer
      than 1000 characters.
  """
  if is_argument_serializable(value):
    text = repr(value)
    # Only short representations are passed by value.
    if len(text) <= 1000:
      return text
  # Either not obviously serializable via repr, or too big to pass by value.
  return None
|
||||
|
||||
def deserialize_argument(serialized_value):
  """Deserialize an argument that was passed by value.

  The argument will have been serialized by serialize_argument_if_possible,
  which emits the repr of a composition of primitive types (numbers, bools,
  None, strings, lists, tuples and dicts).

  Args:
    serialized_value (str): The repr string to decode.

  Returns:
    The Python object described by serialized_value.

  Raises:
    ValueError: If serialized_value contains anything other than literals.
    SyntaxError: If serialized_value is not a valid Python expression.
  """
  # Imported locally so this function does not rely on a module-level import.
  import ast
  # literal_eval accepts only literal syntax, so a crafted string cannot execute
  # arbitrary code the way a plain eval would allow.
  return ast.literal_eval(serialized_value)
|
||||
|
||||
def check_serializable(cls):
|
||||
"""Throws an exception if Ray cannot serialize this class efficiently.
|
||||
|
||||
|
||||
+80
-115
@@ -1,31 +1,29 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess32 as subprocess
|
||||
import subprocess
|
||||
import string
|
||||
import random
|
||||
|
||||
# Ray modules
|
||||
import config
|
||||
|
||||
_services_env = os.environ.copy()
|
||||
_services_env["PATH"] = os.pathsep.join([os.path.dirname(os.path.abspath(__file__)), _services_env["PATH"]])
|
||||
# Make GRPC only print error messages.
|
||||
_services_env["GRPC_VERBOSITY"] = "ERROR"
|
||||
|
||||
# all_processes is a list of the scheduler, object store, and worker processes
|
||||
# that have been started by this services module if Ray is being used in local
|
||||
# mode.
|
||||
all_processes = []
|
||||
|
||||
TIMEOUT_SECONDS = 5
|
||||
|
||||
def address(host, port):
|
||||
return host + ":" + str(port)
|
||||
|
||||
def new_scheduler_port():
|
||||
def new_port():
|
||||
return random.randint(10000, 65535)
|
||||
|
||||
def random_name():
|
||||
return str(random.randint(0, 99999999))
|
||||
|
||||
def cleanup():
|
||||
"""When running in local mode, shutdown the Ray processes.
|
||||
|
||||
@@ -36,7 +34,8 @@ def cleanup():
|
||||
"""
|
||||
global all_processes
|
||||
successfully_shut_down = True
|
||||
for p in all_processes:
|
||||
# Terminate the processes in reverse order.
|
||||
for p in all_processes[::-1]:
|
||||
if p.poll() is not None: # process has already terminated
|
||||
continue
|
||||
p.kill()
|
||||
@@ -49,146 +48,112 @@ def cleanup():
|
||||
continue
|
||||
successfully_shut_down = False
|
||||
if successfully_shut_down:
|
||||
print "Successfully shut down Ray."
|
||||
print("Successfully shut down Ray.")
|
||||
else:
|
||||
print "Ray did not shut down properly."
|
||||
print("Ray did not shut down properly.")
|
||||
all_processes = []
|
||||
|
||||
def start_scheduler(scheduler_address, cleanup):
|
||||
"""This method starts a scheduler process.
|
||||
|
||||
Args:
|
||||
scheduler_address (str): The ip address and port to use for the scheduler.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
"""
|
||||
scheduler_port = scheduler_address.split(":")[1]
|
||||
p = subprocess.Popen(["scheduler", scheduler_address, "--log-file-name", config.get_log_file_path("scheduler-" + scheduler_port + ".log")], env=_services_env)
|
||||
def start_redis(port):
|
||||
redis_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../common/thirdparty/redis-3.2.3/src/redis-server")
|
||||
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning"])
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_objstore(scheduler_address, node_ip_address, cleanup):
|
||||
def start_local_scheduler(redis_address, plasma_store_name):
|
||||
local_scheduler_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../photon/build/photon_scheduler")
|
||||
local_scheduler_name = "/tmp/scheduler{}".format(random_name())
|
||||
p = subprocess.Popen([local_scheduler_filepath, "-s", local_scheduler_name, "-r", redis_address, "-p", plasma_store_name])
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
return local_scheduler_name
|
||||
|
||||
def start_objstore(node_ip_address, redis_address, cleanup):
|
||||
"""This method starts an object store process.
|
||||
|
||||
Args:
|
||||
scheduler_address (str): The ip address and port of the scheduler to connect
|
||||
to.
|
||||
node_ip_address (str): The ip address of the node running the object store.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
"""
|
||||
random_string = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
|
||||
p = subprocess.Popen(["objstore", scheduler_address, node_ip_address, "--log-file-name", config.get_log_file_path("-".join(["objstore", random_string]) + ".log")], env=_services_env)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
plasma_store_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../plasma/build/plasma_store")
|
||||
store_name = "/tmp/ray_plasma_store{}".format(random_name())
|
||||
p1 = subprocess.Popen([plasma_store_executable, "-s", store_name])
|
||||
|
||||
def start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=None, cleanup=True):
|
||||
plasma_manager_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../plasma/build/plasma_manager")
|
||||
manager_name = "/tmp/ray_plasma_manager{}".format(random_name())
|
||||
manager_port = new_port()
|
||||
p2 = subprocess.Popen([plasma_manager_executable,
|
||||
"-s", store_name,
|
||||
"-m", manager_name,
|
||||
"-h", node_ip_address,
|
||||
"-p", str(manager_port),
|
||||
"-r", redis_address])
|
||||
|
||||
if cleanup:
|
||||
all_processes.append(p1)
|
||||
all_processes.append(p2)
|
||||
|
||||
return store_name, manager_name, manager_port
|
||||
|
||||
def start_worker(address_info, worker_path, cleanup=True):
|
||||
"""This method starts a worker process.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The IP address of the node that the worker runs on.
|
||||
address_info (dict): This dictionary contains the node_ip_address,
|
||||
redis_port, object_store_name, object_store_manager_name, and
|
||||
local_scheduler_name.
|
||||
worker_path (str): The path of the source code which the worker process will
|
||||
run.
|
||||
scheduler_address (str): The ip address and port of the scheduler to connect
|
||||
to.
|
||||
objstore_address (Optional[str]): The ip address and port of the object
|
||||
store to connect to.
|
||||
cleanup (Optional[bool]): True if using Ray in local mode. If cleanup is
|
||||
true, then this process will be killed by services.cleanup() when the
|
||||
Python process that imported services exits. This is True by default.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
that imported services exits. This is True by default.
|
||||
"""
|
||||
command = ["python",
|
||||
worker_path,
|
||||
"--node-ip-address=" + node_ip_address,
|
||||
"--scheduler-address=" + scheduler_address]
|
||||
if objstore_address is not None:
|
||||
command.append("--objstore-address=" + objstore_address)
|
||||
"--node-ip-address=" + address_info["node_ip_address"],
|
||||
"--object-store-name=" + address_info["object_store_name"],
|
||||
"--object-store-manager-name=" + address_info["object_store_manager_name"],
|
||||
"--local-scheduler-name=" + address_info["local_scheduler_name"],
|
||||
"--redis-port=" + str(address_info["redis_port"])]
|
||||
p = subprocess.Popen(command)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None, cleanup=False):
|
||||
"""Start an object store and associated workers in the cluster setting.
|
||||
|
||||
This starts an object store and the associated workers when Ray is being used
|
||||
in the cluster setting. This assumes the scheduler has already been started.
|
||||
|
||||
Args:
|
||||
scheduler_address (str): IP address and port of the scheduler (which may run
|
||||
on a different node).
|
||||
node_ip_address (str): IP address (without port) of the node this function
|
||||
is run on.
|
||||
num_workers (int): The number of workers to be started on this node.
|
||||
worker_path (str): Path of the Python worker script that will be run on the
|
||||
worker.
|
||||
cleanup (bool): If cleanup is True, then the processes started by this
|
||||
command will be killed when the process that imported services exits.
|
||||
"""
|
||||
start_objstore(scheduler_address, node_ip_address, cleanup=cleanup)
|
||||
time.sleep(0.2)
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
|
||||
for _ in range(num_workers):
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=cleanup)
|
||||
time.sleep(0.5)
|
||||
|
||||
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
|
||||
"""Start a new set of workers on this node.
|
||||
|
||||
Start a new set of workers on this node. This assumes that the scheduler is
|
||||
already running and that the object store on this node is already running. The
|
||||
intended use case is that a developer wants to update the code running on the
|
||||
worker processes so first kills all of the workers and then runs this method.
|
||||
|
||||
Args:
|
||||
scheduler_address (str): ip address and port of the scheduler (which may run
|
||||
on a different node)
|
||||
objstore_address (str): ip address and port of the object store (which runs
|
||||
on the same node)
|
||||
num_workers (int): the number of workers to be started on this node
|
||||
worker_path (str): path of the source code that will be run on the worker
|
||||
"""
|
||||
node_ip_address = objstore_address.split(":")[0]
|
||||
for _ in range(num_workers):
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=False)
|
||||
|
||||
def start_ray_local(node_ip_address="127.0.0.1", num_objstores=1, num_workers=0, worker_path=None):
|
||||
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
This method starts Ray in local mode (as opposed to cluster mode, which is
|
||||
handled by cluster.py).
|
||||
|
||||
Args:
|
||||
num_objstores (int): The number of object stores to start. Aside from
|
||||
testing, this should be one.
|
||||
num_workers (int): The number of workers to start.
|
||||
worker_path (str): The path of the source code that will be run by the
|
||||
worker.
|
||||
|
||||
Returns:
|
||||
The address of the scheduler and the addresses of all of the object stores.
|
||||
This returns a dictionary of address information with the keys
|
||||
"node_ip_address", "redis_port", "object_store_name", "object_store_manager_name", and "local_scheduler_name".
|
||||
"""
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
|
||||
if num_objstores < 1:
|
||||
raise Exception("`num_objstores` is {}, but should be at least 1.".format(num_objstores))
|
||||
scheduler_address = address(node_ip_address, new_scheduler_port())
|
||||
start_scheduler(scheduler_address, cleanup=True)
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "default_worker.py")
|
||||
# Start Redis.
|
||||
redis_port = new_port()
|
||||
redis_address = address(node_ip_address, redis_port)
|
||||
start_redis(redis_port)
|
||||
time.sleep(0.1)
|
||||
# create objstores
|
||||
for i in range(num_objstores):
|
||||
start_objstore(scheduler_address, node_ip_address, cleanup=True)
|
||||
time.sleep(0.2)
|
||||
if i < num_objstores - 1:
|
||||
num_workers_to_start = num_workers / num_objstores
|
||||
else:
|
||||
# In case num_workers is not divisible by num_objstores, start the correct
|
||||
# remaining number of workers.
|
||||
num_workers_to_start = num_workers - (num_objstores - 1) * (num_workers / num_objstores)
|
||||
for _ in range(num_workers_to_start):
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, cleanup=True)
|
||||
time.sleep(0.3)
|
||||
|
||||
return scheduler_address
|
||||
# Start Plasma.
|
||||
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
|
||||
# Start the local scheduler.
|
||||
time.sleep(0.1)
|
||||
local_scheduler_name = start_local_scheduler(redis_address, object_store_name)
|
||||
time.sleep(0.2)
|
||||
# Aggregate the address information together.
|
||||
address_info = {"node_ip_address": node_ip_address,
|
||||
"redis_port": redis_port,
|
||||
"object_store_name": object_store_name,
|
||||
"object_store_manager_name": object_store_manager_name,
|
||||
"local_scheduler_name": local_scheduler_name}
|
||||
# Start the workers.
|
||||
for _ in range(num_workers):
|
||||
start_worker(address_info, worker_path, cleanup=True)
|
||||
time.sleep(0.3)
|
||||
# Return the addresses of the relevant processes.
|
||||
return address_info
|
||||
|
||||
+432
-454
File diff suppressed because it is too large
Load Diff
@@ -1,16 +0,0 @@
|
||||
import sys
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
|
||||
parser = argparse.ArgumentParser(description="Parse addresses for the worker to connect to.")
|
||||
parser.add_argument("--node-ip-address", required=True, type=str, help="the ip address of the worker's node")
|
||||
parser.add_argument("--scheduler-address", required=True, type=str, help="the scheduler's address")
|
||||
parser.add_argument("--objstore-address", type=str, help="the objstore's address")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.worker.connect(args.node_ip_address, args.scheduler_address)
|
||||
|
||||
ray.worker.main_loop()
|
||||
@@ -1,19 +0,0 @@
|
||||
# NO shebang! Force the user to run this using the 'source' command without spawning a new shell; otherwise, variable exports won't persist.
|
||||
|
||||
echo "Adding Ray to PYTHONPATH" 1>&2
|
||||
|
||||
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
|
||||
|
||||
export PYTHONPATH="$ROOT_DIR/lib/python/:$ROOT_DIR/thirdparty/numbuf/build:$PYTHONPATH"
|
||||
|
||||
# Print instructions for adding Ray to your bashrc.
|
||||
unamestr="$(uname)"
|
||||
if [[ "$unamestr" == "Linux" ]]; then
|
||||
BASH_RC="~/.bashrc"
|
||||
elif [[ "$unamestr" == "Darwin" ]]; then
|
||||
BASH_RC="~/.bash_profile"
|
||||
fi
|
||||
echo "To permanently add Ray to your Python path, run,
|
||||
|
||||
echo 'export PYTHONPATH=$ROOT_DIR/lib/python/:$ROOT_DIR/thirdparty/numbuf/build:\$PYTHONPATH' >> $BASH_RC
|
||||
"
|
||||
@@ -169,7 +169,7 @@ void handle_worker_available(scheduler_info *info,
|
||||
/* Add client_sock to a list of available workers. This struct will be freed
|
||||
* when a task is assigned to this worker. */
|
||||
utarray_push_back(state->available_workers, &worker_index);
|
||||
LOG_INFO("Adding worker_index %d to available workers.\n", worker_index);
|
||||
LOG_DEBUG("Adding worker_index %d to available workers.\n", worker_index);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +148,7 @@ void new_client_connection(event_loop *loop, int listener_sock, void *context,
|
||||
local_scheduler_state *s = context;
|
||||
int new_socket = accept_client(listener_sock);
|
||||
event_loop_add_file(loop, new_socket, EVENT_LOOP_READ, process_message, s);
|
||||
LOG_INFO("new connection with fd %d", new_socket);
|
||||
LOG_DEBUG("new connection with fd %d", new_socket);
|
||||
/* Add worker to list of workers. */
|
||||
/* TODO(pcm): Where shall we free this? */
|
||||
worker_index *new_worker_index = malloc(sizeof(worker_index));
|
||||
|
||||
@@ -40,8 +40,12 @@ class PlasmaBuffer(object):
|
||||
self.plasma_client = plasma_client
|
||||
|
||||
def __del__(self):
|
||||
"""Notify Plasma that the object is no longer needed."""
|
||||
self.plasma_client.client.plasma_release(self.plasma_client.plasma_conn, self.plasma_id)
|
||||
"""Notify Plasma that the object is no longer needed.
|
||||
|
||||
If the plasma client has been shut down, then don't do anything.
|
||||
"""
|
||||
if self.plasma_client.alive:
|
||||
self.plasma_client.client.plasma_release(self.plasma_client.plasma_conn, self.plasma_id)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Read from the PlasmaBuffer as if it were just a regular buffer."""
|
||||
@@ -73,7 +77,7 @@ class PlasmaClient(object):
|
||||
store_socket_name (str): Name of the socket the plasma store is listening at.
|
||||
manager_socket_name (str): Name of the socket the plasma manager is listening at.
|
||||
"""
|
||||
|
||||
self.alive = True
|
||||
plasma_client_library = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../build/plasma_client.so")
|
||||
self.client = ctypes.cdll.LoadLibrary(plasma_client_library)
|
||||
|
||||
@@ -85,6 +89,7 @@ class PlasmaClient(object):
|
||||
self.client.plasma_seal.restype = None
|
||||
self.client.plasma_delete.restype = None
|
||||
self.client.plasma_subscribe.restype = ctypes.c_int
|
||||
self.client.plasma_wait.restype = ctypes.c_int
|
||||
|
||||
self.buffer_from_memory = ctypes.pythonapi.PyBuffer_FromMemory
|
||||
self.buffer_from_memory.argtypes = [ctypes.c_void_p, ctypes.c_int64]
|
||||
@@ -101,6 +106,15 @@ class PlasmaClient(object):
|
||||
self.has_manager_conn = False
|
||||
self.plasma_conn = ctypes.c_void_p(self.client.plasma_connect(store_socket_name, None))
|
||||
|
||||
def shutdown(self):
|
||||
"""Shutdown the client so that it does not send messages.
|
||||
|
||||
If we kill the Plasma store and Plasma manager that this client is connected
|
||||
to, then we can use this method to prevent the client from trying to send
|
||||
messages to the killed processes.
|
||||
"""
|
||||
self.alive = False
|
||||
|
||||
def create(self, object_id, size, metadata=None):
|
||||
"""Create a new buffer in the PlasmaStore for a particular object ID.
|
||||
|
||||
@@ -233,6 +247,12 @@ class PlasmaClient(object):
|
||||
"""
|
||||
if not self.has_manager_conn:
|
||||
raise Exception("Not connected to the plasma manager socket")
|
||||
if num_returns < 0:
|
||||
raise Exception("The argument num_returns cannot be less than one.")
|
||||
if num_returns > len(object_ids):
|
||||
raise Exception("The argument num_returns cannot be greater than len(object_ids): num_returns is {}, len(object_ids) is {}.".format(num_returns, len(object_ids)))
|
||||
if timeout > 2 ** 36:
|
||||
raise Exception("The method wait currently cannot be used with a timeout greater than 2 ** 36.")
|
||||
object_id_array = (len(object_ids) * PlasmaID)()
|
||||
for i, object_id in enumerate(object_ids):
|
||||
object_id_array[i] = make_plasma_id(object_id)
|
||||
@@ -240,7 +260,9 @@ class PlasmaClient(object):
|
||||
num_return_objects = self.client.plasma_wait(self.plasma_conn,
|
||||
object_id_array._length_,
|
||||
object_id_array,
|
||||
timeout, num_returns, return_id_array)
|
||||
ctypes.c_int64(timeout),
|
||||
num_returns,
|
||||
return_id_array)
|
||||
ready_ids = map(plasma_id_to_str, return_id_array[num_returns-num_return_objects:])
|
||||
return ready_ids, list(set(object_ids) - set(ready_ids))
|
||||
|
||||
|
||||
+1
-1
@@ -58,7 +58,7 @@ class DistributedArrayTest(unittest.TestCase):
|
||||
def testMethods(self):
|
||||
for module in [ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg]:
|
||||
reload(module)
|
||||
ray.init(start_ray_local=True, num_objstores=2, num_workers=10)
|
||||
ray.init(start_ray_local=True, num_workers=10)
|
||||
|
||||
x = da.zeros.remote([9, 25, 51], "float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
|
||||
|
||||
+27
-39
@@ -4,16 +4,24 @@ import time
|
||||
|
||||
import test_functions
|
||||
|
||||
def wait_for_errors(error_type, num_errors, timeout=10):
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout:
|
||||
error_info = ray.error_info()
|
||||
if len(error_info[error_type]) >= num_errors:
|
||||
return
|
||||
time.sleep(0.1)
|
||||
print("Timing out of wait.")
|
||||
|
||||
class FailureTest(unittest.TestCase):
|
||||
def testUnknownSerialization(self):
|
||||
reload(test_functions)
|
||||
ray.init(start_ray_local=True, num_workers=1, driver_mode=ray.SILENT_MODE)
|
||||
|
||||
test_functions.test_unknown_type.remote()
|
||||
time.sleep(0.2)
|
||||
task_info = ray.task_info()
|
||||
self.assertEqual(len(task_info["failed_tasks"]), 1)
|
||||
self.assertEqual(len(task_info["running_tasks"]), 0)
|
||||
wait_for_errors("TaskError", 1)
|
||||
error_info = ray.error_info()
|
||||
self.assertEqual(len(error_info["TaskError"]), 1)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@@ -45,19 +53,11 @@ class TaskStatusTest(unittest.TestCase):
|
||||
|
||||
test_functions.throw_exception_fct1.remote()
|
||||
test_functions.throw_exception_fct1.remote()
|
||||
for _ in range(100): # Retry if we need to wait longer.
|
||||
if len(ray.task_info()["failed_tasks"]) >= 2:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
result = ray.task_info()
|
||||
self.assertEqual(len(result["failed_tasks"]), 2)
|
||||
task_ids = set()
|
||||
for task in result["failed_tasks"]:
|
||||
self.assertTrue(task.has_key("worker_address"))
|
||||
self.assertTrue(task.has_key("operationid"))
|
||||
self.assertTrue("Test function 1 intentionally failed." in task.get("error_message"))
|
||||
self.assertTrue(task["operationid"] not in task_ids)
|
||||
task_ids.add(task["operationid"])
|
||||
wait_for_errors("TaskError", 2)
|
||||
result = ray.error_info()
|
||||
self.assertEqual(len(result["TaskError"]), 2)
|
||||
for task in result["TaskError"]:
|
||||
self.assertTrue("Test function 1 intentionally failed." in task.get("message"))
|
||||
|
||||
x = test_functions.throw_exception_fct2.remote()
|
||||
try:
|
||||
@@ -96,11 +96,8 @@ class TaskStatusTest(unittest.TestCase):
|
||||
def __call__(self):
|
||||
return
|
||||
ray.remote(Foo())
|
||||
for _ in range(100): # Retry if we need to wait longer.
|
||||
if len(ray.task_info()["failed_remote_function_imports"]) >= 1:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
self.assertTrue("There is a problem here." in ray.task_info()["failed_remote_function_imports"][0]["error_message"])
|
||||
wait_for_errors("RemoteFunctionImportError", 1)
|
||||
self.assertTrue("There is a problem here." in ray.error_info()["RemoteFunctionImportError"][0]["message"])
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@@ -114,12 +111,9 @@ class TaskStatusTest(unittest.TestCase):
|
||||
raise Exception("The initializer failed.")
|
||||
return 0
|
||||
ray.reusables.foo = ray.Reusable(initializer)
|
||||
for _ in range(100): # Retry if we need to wait longer.
|
||||
if len(ray.task_info()["failed_reusable_variable_imports"]) >= 1:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
wait_for_errors("ReusableVariableImportError", 1)
|
||||
# Check that the error message is in the task info.
|
||||
self.assertTrue("The initializer failed." in ray.task_info()["failed_reusable_variable_imports"][0]["error_message"])
|
||||
self.assertTrue("The initializer failed." in ray.error_info()["ReusableVariableImportError"][0]["message"])
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@@ -135,12 +129,9 @@ class TaskStatusTest(unittest.TestCase):
|
||||
def use_foo():
|
||||
ray.reusables.foo
|
||||
use_foo.remote()
|
||||
for _ in range(100): # Retry if we need to wait longer.
|
||||
if len(ray.task_info()["failed_reinitialize_reusable_variables"]) >= 1:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
wait_for_errors("ReusableVariableReinitializeError", 1)
|
||||
# Check that the error message is in the task info.
|
||||
self.assertTrue("The reinitializer failed." in ray.task_info()["failed_reinitialize_reusable_variables"][0]["error_message"])
|
||||
self.assertTrue("The reinitializer failed." in ray.error_info()["ReusableVariableReinitializeError"][0]["message"])
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@@ -151,14 +142,11 @@ class TaskStatusTest(unittest.TestCase):
|
||||
if ray.worker.global_worker.mode == ray.WORKER_MODE:
|
||||
raise Exception("Function to run failed.")
|
||||
ray.worker.global_worker.run_function_on_all_workers(f)
|
||||
for _ in range(100): # Retry if we need to wait longer.
|
||||
if len(ray.task_info()["failed_function_to_runs"]) >= 2:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
wait_for_errors("FunctionToRunError", 2)
|
||||
# Check that the error message is in the task info.
|
||||
self.assertEqual(len(ray.task_info()["failed_function_to_runs"]), 2)
|
||||
self.assertTrue("Function to run failed." in ray.task_info()["failed_function_to_runs"][0]["error_message"])
|
||||
self.assertTrue("Function to run failed." in ray.task_info()["failed_function_to_runs"][1]["error_message"])
|
||||
self.assertEqual(len(ray.error_info()["FunctionToRunError"]), 2)
|
||||
self.assertTrue("Function to run failed." in ray.error_info()["FunctionToRunError"][0]["message"])
|
||||
self.assertTrue("Function to run failed." in ray.error_info()["FunctionToRunError"][1]["message"])
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
|
||||
+21
-286
@@ -1,3 +1,5 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import ray
|
||||
import numpy as np
|
||||
@@ -142,64 +144,6 @@ class SerializationTest(unittest.TestCase):
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
class ObjStoreTest(unittest.TestCase):
|
||||
|
||||
# Test setting up object stores, transfering data between them and retrieving data to a client
|
||||
def testObjStore(self):
|
||||
node_ip_address = "127.0.0.1"
|
||||
scheduler_address = ray.services.start_ray_local(num_objstores=2, num_workers=0, worker_path=None)
|
||||
ray.connect(node_ip_address, scheduler_address, mode=ray.SCRIPT_MODE)
|
||||
objstore_addresses = [objstore_info["address"] for objstore_info in ray.scheduler_info()["objstores"]]
|
||||
w1 = ray.worker.Worker()
|
||||
w2 = ray.worker.Worker()
|
||||
ray.reusables._cached_reusables = [] # This is a hack to make the test run.
|
||||
ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[0], mode=ray.SCRIPT_MODE, worker=w1)
|
||||
ray.reusables._cached_reusables = [] # This is a hack to make the test run.
|
||||
ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[1], mode=ray.SCRIPT_MODE, worker=w2)
|
||||
|
||||
for cls in [Foo, Bar, Baz, Qux, SubQux, Exception, CustomError, Point, NamedTupleExample]:
|
||||
ray.register_class(cls)
|
||||
|
||||
# putting and getting an object shouldn't change it
|
||||
for data in RAY_TEST_OBJECTS:
|
||||
objectid = ray.put(data, w1)
|
||||
result = ray.get(objectid, w1)
|
||||
assert_equal(result, data)
|
||||
|
||||
# putting an object, shipping it to another worker, and getting it shouldn't change it
|
||||
for data in RAY_TEST_OBJECTS:
|
||||
objectid = ray.put(data, w1)
|
||||
result = ray.get(objectid, w2)
|
||||
assert_equal(result, data)
|
||||
|
||||
# putting an object, shipping it to another worker, and getting it shouldn't change it
|
||||
for data in RAY_TEST_OBJECTS:
|
||||
objectid = ray.put(data, w2)
|
||||
result = ray.get(objectid, w1)
|
||||
assert_equal(result, data)
|
||||
|
||||
# This test fails. See https://github.com/ray-project/ray/issues/159.
|
||||
# getting multiple times shouldn't matter
|
||||
# for data in [np.zeros([10, 20]), np.random.normal(size=[45, 25]), np.zeros([10, 20], dtype=np.dtype("float64")), np.zeros([10, 20], dtype=np.dtype("float32")), np.zeros([10, 20], dtype=np.dtype("int64")), np.zeros([10, 20], dtype=np.dtype("int32"))]:
|
||||
# objectid = worker.put(data, w1)
|
||||
# result = worker.get(objectid, w2)
|
||||
# result = worker.get(objectid, w2)
|
||||
# result = worker.get(objectid, w2)
|
||||
# assert_equal(result, data)
|
||||
|
||||
# Getting a buffer after modifying it before it finishes should return updated buffer
|
||||
objectid = ray.libraylib.get_objectid(w1.handle)
|
||||
buf = ray.libraylib.allocate_buffer(w1.handle, objectid, 100)
|
||||
buf[0][0] = 1
|
||||
ray.libraylib.finish_buffer(w1.handle, objectid, buf[1], 0)
|
||||
completedbuffer = ray.libraylib.get_buffer(w1.handle, objectid)
|
||||
self.assertEqual(completedbuffer[0][0], 1)
|
||||
|
||||
# We started multiple drivers manually, so we will disconnect them manually.
|
||||
ray.disconnect(worker=w1)
|
||||
ray.disconnect(worker=w2)
|
||||
ray.worker.cleanup()
|
||||
|
||||
class WorkerTest(unittest.TestCase):
|
||||
|
||||
def testPutGet(self):
|
||||
@@ -233,29 +177,6 @@ class WorkerTest(unittest.TestCase):
|
||||
|
||||
class APITest(unittest.TestCase):
|
||||
|
||||
def testPassingArgumentsByValue(self):
|
||||
ray.init(start_ray_local=True, num_workers=0)
|
||||
|
||||
# The types that can be passed by value are defined by
|
||||
# is_argument_serializable in serialization.py.
|
||||
class Foo(object):
|
||||
pass
|
||||
CAN_PASS_BY_VALUE = [1, 1L, 1.0, True, False, None, [1L, 1.0, True, None],
|
||||
([1, 2, 3], {False: [1.0, u"hi", ()]}), 100 * ["a"]]
|
||||
CANNOT_PASS_BY_VALUE = [int, np.int64(0), np.float64(0), Foo(), [Foo()],
|
||||
(Foo()), {0: Foo()}, [[[int]]], 101 * [1],
|
||||
np.zeros(10)]
|
||||
|
||||
for obj in CAN_PASS_BY_VALUE:
|
||||
self.assertTrue(ray.serialization.is_argument_serializable(obj))
|
||||
self.assertEqual(obj, ray.serialization.deserialize_argument(ray.serialization.serialize_argument_if_possible(obj)))
|
||||
|
||||
for obj in CANNOT_PASS_BY_VALUE:
|
||||
self.assertFalse(ray.serialization.is_argument_serializable(obj))
|
||||
self.assertEqual(None, ray.serialization.serialize_argument_if_possible(obj))
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testRegisterClass(self):
|
||||
ray.init(start_ray_local=True, num_workers=0)
|
||||
|
||||
@@ -328,11 +249,7 @@ class APITest(unittest.TestCase):
|
||||
reload(test_functions)
|
||||
ray.init(start_ray_local=True, num_workers=1)
|
||||
|
||||
test_functions.no_op.remote()
|
||||
time.sleep(0.2)
|
||||
task_info = ray.task_info()
|
||||
self.assertEqual(len(task_info["failed_tasks"]), 0)
|
||||
self.assertEqual(len(task_info["running_tasks"]), 0)
|
||||
ray.get(test_functions.no_op.remote())
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@@ -400,22 +317,22 @@ class APITest(unittest.TestCase):
|
||||
|
||||
objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
|
||||
ready_ids, remaining_ids = ray.wait(objectids)
|
||||
self.assertTrue(len(ready_ids) == 1)
|
||||
self.assertTrue(len(remaining_ids) == 3)
|
||||
self.assertEqual(len(ready_ids), 1)
|
||||
self.assertEqual(len(remaining_ids), 3)
|
||||
ready_ids, remaining_ids = ray.wait(objectids, num_returns=4)
|
||||
self.assertEqual(ready_ids, objectids)
|
||||
self.assertEqual(set(ready_ids), set([object_id.id() for object_id in objectids]))
|
||||
self.assertEqual(remaining_ids, [])
|
||||
|
||||
objectids = [f.remote(0.5), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
|
||||
start_time = time.time()
|
||||
ready_ids, remaining_ids = ray.wait(objectids, timeout=1.75, num_returns=4)
|
||||
self.assertTrue(time.time() - start_time < 2)
|
||||
ready_ids, remaining_ids = ray.wait(objectids, timeout=1750, num_returns=4)
|
||||
self.assertLess(time.time() - start_time, 2)
|
||||
self.assertEqual(len(ready_ids), 3)
|
||||
self.assertEqual(len(remaining_ids), 1)
|
||||
ray.wait(objectids)
|
||||
objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
|
||||
start_time = time.time()
|
||||
ready_ids, remaining_ids = ray.wait(objectids, timeout=5)
|
||||
ready_ids, remaining_ids = ray.wait(objectids, timeout=5000)
|
||||
self.assertTrue(time.time() - start_time < 5)
|
||||
self.assertEqual(len(ready_ids), 1)
|
||||
self.assertEqual(len(remaining_ids), 3)
|
||||
@@ -504,150 +421,6 @@ class APITest(unittest.TestCase):
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testComputationGraph(self):
|
||||
ray.init(start_ray_local=True, num_workers=1)
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return x
|
||||
@ray.remote
|
||||
def g(x, y):
|
||||
return x, y
|
||||
a = f.remote(1)
|
||||
b = f.remote(1)
|
||||
c = g.remote(a, b)
|
||||
c = g.remote(a, 1)
|
||||
# Make sure that we can produce a computation_graph visualization.
|
||||
ray.visualize_computation_graph(view=False)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
class ReferenceCountingTest(unittest.TestCase):
|
||||
|
||||
def testDeallocation(self):
|
||||
reload(test_functions)
|
||||
for module in [ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg]:
|
||||
reload(module)
|
||||
ray.init(start_ray_local=True, num_workers=1)
|
||||
|
||||
def check_not_deallocated(object_ids):
|
||||
reference_counts = ray.scheduler_info()["reference_counts"]
|
||||
for object_id in object_ids:
|
||||
self.assertGreater(reference_counts[object_id.id], 0)
|
||||
|
||||
def check_everything_deallocated():
|
||||
reference_counts = ray.scheduler_info()["reference_counts"]
|
||||
self.assertEqual(reference_counts, len(reference_counts) * [-1])
|
||||
|
||||
z = da.zeros.remote([da.BLOCK_SIZE, 2 * da.BLOCK_SIZE])
|
||||
time.sleep(0.1)
|
||||
objectid_val = z.id
|
||||
time.sleep(0.1)
|
||||
check_not_deallocated([z])
|
||||
del z
|
||||
time.sleep(0.1)
|
||||
check_everything_deallocated()
|
||||
|
||||
x = ra.zeros.remote([10, 10])
|
||||
y = ra.zeros.remote([10, 10])
|
||||
z = ra.dot.remote(x, y)
|
||||
objectid_val = x.id
|
||||
time.sleep(0.1)
|
||||
check_not_deallocated([x, y, z])
|
||||
del x
|
||||
time.sleep(0.1)
|
||||
check_not_deallocated([y, z])
|
||||
del y
|
||||
time.sleep(0.1)
|
||||
check_not_deallocated([z])
|
||||
del z
|
||||
time.sleep(0.1)
|
||||
check_everything_deallocated()
|
||||
|
||||
z = da.zeros.remote([4 * da.BLOCK_SIZE])
|
||||
time.sleep(0.1)
|
||||
check_not_deallocated(ray.get(z).objectids.tolist())
|
||||
del z
|
||||
time.sleep(0.1)
|
||||
check_everything_deallocated()
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testGet(self):
|
||||
ray.init(start_ray_local=True, num_workers=3)
|
||||
|
||||
for cls in [Foo, Bar, Baz, Qux, SubQux, Exception, CustomError, Point, NamedTupleExample]:
|
||||
ray.register_class(cls)
|
||||
|
||||
# Remote objects should be deallocated when the corresponding ObjectID goes
|
||||
# out of scope, and all results of ray.get called on the ID go out of scope.
|
||||
for val in RAY_TEST_OBJECTS:
|
||||
x = ray.put(val)
|
||||
objectid = x.id
|
||||
xval = ray.get(x)
|
||||
del x, xval
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
|
||||
|
||||
# Remote objects that do not contain numpy arrays should be deallocated when
|
||||
# the corresponding ObjectID goes out of scope, even if ray.get has been
|
||||
# called on the ObjectID.
|
||||
for val in [True, False, None, 1, 1.0, 1L, "hi", u"hi", [1, 2, 3], (1, 2, 3), [(), {(): ()}]]:
|
||||
x = ray.put(val)
|
||||
objectid = x.id
|
||||
xval = ray.get(x)
|
||||
del x
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
|
||||
|
||||
# Remote objects that contain numpy arrays should not be deallocated when
|
||||
# the corresponding ObjectID goes out of scope, if ray.get has been called
|
||||
# on the ObjectID and the result of that call is still in scope.
|
||||
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
|
||||
x = ray.put(val)
|
||||
objectid = x.id
|
||||
xval = ray.get(x)
|
||||
del x
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
|
||||
|
||||
# Getting an object multiple times should not be a problem. And the remote
|
||||
# object should not be deallocated until both of the results are out of scope.
|
||||
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
|
||||
x = ray.put(val)
|
||||
objectid = x.id
|
||||
xval1 = ray.get(x)
|
||||
xval2 = ray.get(x)
|
||||
del xval1
|
||||
# Make sure we can still access xval2.
|
||||
xval2
|
||||
del xval2
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
|
||||
xval3 = ray.get(x)
|
||||
xval4 = ray.get(x)
|
||||
xval5 = ray.get(x)
|
||||
del x
|
||||
del xval4, xval5
|
||||
# Make sure we can still access xval3.
|
||||
xval3
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
|
||||
del xval3
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
|
||||
|
||||
# Getting an object multiple times and assigning it to the same name should
|
||||
# work. This was a problem in https://github.com/ray-project/ray/issues/159.
|
||||
for val in [np.zeros(10), [np.zeros(10)], (((np.zeros(10)),),), {(): np.zeros(10)}, [1, 2, 3, np.zeros(1)]]:
|
||||
x = ray.put(val)
|
||||
objectid = x.id
|
||||
xval = ray.get(x)
|
||||
xval = ray.get(x)
|
||||
xval = ray.get(x)
|
||||
xval = ray.get(x)
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
|
||||
del x
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], 1)
|
||||
del xval
|
||||
self.assertEqual(ray.scheduler_info()["reference_counts"][objectid], -1)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
class PythonModeTest(unittest.TestCase):
|
||||
|
||||
def testPythonMode(self):
|
||||
@@ -712,18 +485,18 @@ class PythonModeTest(unittest.TestCase):
|
||||
|
||||
class PythonCExtensionTest(unittest.TestCase):
|
||||
|
||||
def testReferenceCountNone(self):
|
||||
ray.init(start_ray_local=True, num_workers=1)
|
||||
|
||||
# Make sure that we aren't accidentally messing up Python's reference counts.
|
||||
@ray.remote
|
||||
def f():
|
||||
return sys.getrefcount(None)
|
||||
first_count = ray.get(f.remote())
|
||||
second_count = ray.get(f.remote())
|
||||
self.assertEqual(first_count, second_count)
|
||||
|
||||
ray.worker.cleanup()
|
||||
# def testReferenceCountNone(self):
|
||||
# ray.init(start_ray_local=True, num_workers=1)
|
||||
#
|
||||
# # Make sure that we aren't accidentally messing up Python's reference counts.
|
||||
# @ray.remote
|
||||
# def f():
|
||||
# return sys.getrefcount(None)
|
||||
# first_count = ray.get(f.remote())
|
||||
# second_count = ray.get(f.remote())
|
||||
# self.assertEqual(first_count, second_count)
|
||||
#
|
||||
# ray.worker.cleanup()
|
||||
|
||||
def testReferenceCountTrue(self):
|
||||
ray.init(start_ray_local=True, num_workers=1)
|
||||
@@ -867,43 +640,5 @@ class ReusablesTest(unittest.TestCase):
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
class ClusterAttachingTest(unittest.TestCase):
|
||||
|
||||
def testAttachingToCluster(self):
|
||||
node_ip_address = "127.0.0.1"
|
||||
scheduler_port = np.random.randint(40000, 50000)
|
||||
scheduler_address = "{}:{}".format(node_ip_address, scheduler_port)
|
||||
ray.services.start_scheduler(scheduler_address, cleanup=True)
|
||||
time.sleep(0.1)
|
||||
ray.services.start_node(scheduler_address, node_ip_address, num_workers=1, cleanup=True)
|
||||
|
||||
ray.init(node_ip_address=node_ip_address, scheduler_address=scheduler_address)
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return x + 1
|
||||
self.assertEqual(ray.get(f.remote(0)), 1)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testAttachingToClusterWithMultipleObjectStores(self):
|
||||
node_ip_address = "127.0.0.1"
|
||||
scheduler_port = np.random.randint(40000, 50000)
|
||||
scheduler_address = "{}:{}".format(node_ip_address, scheduler_port)
|
||||
ray.services.start_scheduler(scheduler_address, cleanup=True)
|
||||
time.sleep(0.1)
|
||||
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
|
||||
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
|
||||
ray.services.start_node(scheduler_address, node_ip_address, num_workers=5, cleanup=True)
|
||||
|
||||
ray.init(node_ip_address=node_ip_address, scheduler_address=scheduler_address)
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return x + 1
|
||||
self.assertEqual(ray.get(f.remote(0)), 1)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
Reference in New Issue
Block a user