Remove legacy Ray code. (#3121)

* Remove legacy Ray code.

* Fix cmake and simplify monitor.

* Fix linting

* Updates

* Fix

* Implement some methods.

* Remove more plasma manager references.

* Fix

* Linting

* Fix

* Fix

* Make sure class IDs are strings.

* Some path fixes

* Fix

* Path fixes and update arrow

* Fixes.

* linting

* Fixes

* Java fixes

* Some java fixes

* TaskLanguage -> Language

* Minor

* Fix python test and remove unused method signature.

* Fix java tests

* Fix jenkins tests

* Remove commented out code.
This commit is contained in:
Robert Nishihara
2018-10-26 13:36:58 -07:00
committed by Philipp Moritz
parent 055daf17a0
commit 658c14282c
289 changed files with 2460 additions and 40708 deletions
+1 -1
View File
@@ -46,7 +46,7 @@ except ImportError as e:
e.args += (helpful_message, )
raise
from ray.local_scheduler import ObjectID, _config # noqa: E402
from ray.raylet import ObjectID, _config # noqa: E402
from ray.profiling import profile # noqa: E402
from ray.worker import (error_info, init, connect, disconnect, get, put, wait,
remote, get_gpu_ids, get_resource_ids, get_webui_url,
+1 -1
View File
@@ -9,7 +9,7 @@ import traceback
import ray.cloudpickle as pickle
from ray.function_manager import FunctionActorManager
import ray.local_scheduler
import ray.raylet
import ray.ray_constants as ray_constants
import ray.signature as signature
import ray.worker
-451
View File
@@ -1,451 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import redis
import sys
import time
import unittest
import ray.gcs_utils
import ray.services
def integerToAsciiHex(num, numbytes):
    """Encode the low ``numbytes`` bytes of ``num``, least significant first.

    Args:
        num: The integer to encode. Only its lowest ``numbytes`` bytes are
            emitted; higher bits are dropped.
        numbytes: The width of the result in bytes. Must be 4 or 8.

    Returns:
        A byte string of length ``numbytes``.
    """
    # Support 32 and 64 bit architecture.
    assert numbytes in (4, 8)
    octets = bytearray()
    remaining = num
    for _ in range(numbytes):
        # Peel off the lowest byte, then shift the next one into place.
        octets.append(remaining & 0xff)
        remaining >>= 8
    return bytes(octets)
def get_next_message(pubsub_client, timeout_seconds=10):
    """Poll the pubsub client until it yields a message.

    Args:
        pubsub_client: An object whose ``get_message()`` returns either a
            message or None when nothing is available.
        timeout_seconds: How long to keep polling before giving up.

    Returns:
        The first non-None value returned by ``get_message()``.

    Raises:
        Exception: If no message arrived within ``timeout_seconds``.
    """
    began = time.time()
    received = pubsub_client.get_message()
    while received is None:
        # Nothing yet: back off briefly, then give up if we are out of time.
        time.sleep(0.1)
        if time.time() - began > timeout_seconds:
            raise Exception("Timed out while waiting for next message.")
        received = pubsub_client.get_message()
    return received
class TestGlobalStateStore(unittest.TestCase):
    """Tests for the RAY.* Redis commands.

    Each test issues RAY.OBJECT_TABLE_*, RAY.RESULT_TABLE_* or
    RAY.TASK_TABLE_* commands against a Redis server started through
    ray.services.start_redis and checks the replies and pubsub notifications.
    """

    def setUp(self):
        # Start Redis for this test. credis is requested only when the
        # RAY_USE_NEW_GCS environment variable is present.
        unused_primary_redis_addr, redis_shards = ray.services.start_redis(
            "localhost", use_credis="RAY_USE_NEW_GCS" in os.environ)
        # Connect to the first shard; its port is the text after the last ":"
        # in the shard address.
        self.redis = redis.StrictRedis(
            host="localhost", port=redis_shards[0].split(":")[-1], db=0)

    def tearDown(self):
        # Tear down whatever ray.services started in setUp.
        ray.services.cleanup()

    def testInvalidObjectTableAdd(self):
        """RAY.OBJECT_TABLE_ADD rejects bad arguments and mismatched hashes."""
        # Check that Redis returns an error when RAY.OBJECT_TABLE_ADD is called
        # with the wrong arguments.
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "hello")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id2",
                                       "one", "hash2", "manager_id1")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id2", 1,
                                       "hash2", "manager_id1",
                                       "extra argument")
        # Check that Redis returns an error when RAY.OBJECT_TABLE_ADD adds an
        # object ID that is already present with a different hash.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id1")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1"})
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                       "hash2", "manager_id2")
        # Check that the second manager was added, even though the hash was
        # mismatched.
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
        # Check that it is fine if we add the same object ID multiple times
        # with the most recent hash.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash2", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash2", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash2", "manager_id2")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2,
                                   "hash2", "manager_id2")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})

    def testObjectTableAddAndLookup(self):
        """RAY.OBJECT_TABLE_LOOKUP returns the managers added for an object."""
        # Try calling RAY.OBJECT_TABLE_LOOKUP with an object ID that has not
        # been added yet.
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(response, None)
        # Add some managers and try again.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id2")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
        # Add a manager that already exists again and try again.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id2")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
        # Check that we properly handle NULL characters. In the past, NULL
        # characters were handled improperly causing a "hash mismatch" error if
        # two object IDs that agreed up to the NULL character were inserted
        # with different hashes.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "\x00object_id3", 1,
                                   "hash1", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "\x00object_id4", 1,
                                   "hash2", "manager_id1")
        # Check that NULL characters in the hash are handled properly.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id3", 1,
                                   "\x00hash1", "manager_id1")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id3", 1,
                                       "\x00hash2", "manager_id1")

    def testObjectTableAddAndRemove(self):
        """RAY.OBJECT_TABLE_REMOVE drops managers and tolerates repeats."""
        # Try removing a manager from an object ID that has not been added yet.
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                       "manager_id1")
        # Try calling RAY.OBJECT_TABLE_LOOKUP with an object ID that has not
        # been added yet.
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(response, None)
        # Add some managers and try again.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id2")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
        # Remove a manager that doesn't exist, and make sure we still have the
        # same set.
        self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                   "manager_id3")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
        # Remove a manager that does exist. Make sure it gets removed the first
        # time and does nothing the second time.
        self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                   "manager_id1")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id2"})
        self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                   "manager_id1")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), {b"manager_id2"})
        # Remove the last manager, and make sure we have an empty set.
        self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                   "manager_id2")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), set())
        # Remove a manager from an empty set, and make sure we now have an
        # empty set.
        self.redis.execute_command("RAY.OBJECT_TABLE_REMOVE", "object_id1",
                                   "manager_id3")
        response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertEqual(set(response), set())

    def testObjectTableSubscribeToNotifications(self):
        """Object notifications are published on the subscribed channel."""

        # Define a helper method for checking the contents of object
        # notifications.
        def check_object_notification(notification_message, object_id,
                                      object_size, manager_ids):
            # Decode the raw pubsub payload before comparing its fields.
            notification_object = (ray.gcs_utils.SubscribeToNotificationsReply.
                                   GetRootAsSubscribeToNotificationsReply(
                                       notification_message, 0))
            self.assertEqual(notification_object.ObjectId(), object_id)
            self.assertEqual(notification_object.ObjectSize(), object_size)
            self.assertEqual(notification_object.ManagerIdsLength(),
                             len(manager_ids))
            # Manager IDs are compared position by position, so order matters.
            for i in range(len(manager_ids)):
                self.assertEqual(
                    notification_object.ManagerIds(i), manager_ids[i])

        data_size = 0xf1f0
        p = self.redis.pubsub()
        # Subscribe to an object ID.
        p.psubscribe("{}manager_id1".format(
            ray.gcs_utils.OBJECT_CHANNEL_PREFIX))
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1",
                                   data_size, "hash1", "manager_id2")
        # Receive the acknowledgement message.
        self.assertEqual(get_next_message(p)["data"], 1)
        # Request a notification and receive the data.
        self.redis.execute_command("RAY.OBJECT_TABLE_REQUEST_NOTIFICATIONS",
                                   "manager_id1", "object_id1")
        # Verify that the notification is correct.
        check_object_notification(
            get_next_message(p)["data"], b"object_id1", data_size,
            [b"manager_id2"])
        # Request a notification for an object that isn't there. Then add the
        # object and receive the data. Only the first call to
        # RAY.OBJECT_TABLE_ADD should trigger notifications.
        self.redis.execute_command("RAY.OBJECT_TABLE_REQUEST_NOTIFICATIONS",
                                   "manager_id1", "object_id2", "object_id3")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id3",
                                   data_size, "hash1", "manager_id1")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id3",
                                   data_size, "hash1", "manager_id2")
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id3",
                                   data_size, "hash1", "manager_id3")
        # Verify that the notification is correct.
        check_object_notification(
            get_next_message(p)["data"], b"object_id3", data_size,
            [b"manager_id1"])
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id2",
                                   data_size, "hash1", "manager_id3")
        # Verify that the notification is correct.
        check_object_notification(
            get_next_message(p)["data"], b"object_id2", data_size,
            [b"manager_id3"])
        # Request notifications for object_id3 again.
        self.redis.execute_command("RAY.OBJECT_TABLE_REQUEST_NOTIFICATIONS",
                                   "manager_id1", "object_id3")
        # Verify that the notification is correct.
        check_object_notification(
            get_next_message(p)["data"], b"object_id3", data_size,
            [b"manager_id1", b"manager_id2", b"manager_id3"])

    def testResultTableAddAndLookup(self):
        """RAY.RESULT_TABLE_LOOKUP returns the task ID and is_put flag added."""

        def check_result_table_entry(message, task_id, is_put):
            # Decode the reply and compare the two fields this test cares about.
            result_table_reply = (
                ray.gcs_utils.ResultTableReply.GetRootAsResultTableReply(
                    message, 0))
            self.assertEqual(result_table_reply.TaskId(), task_id)
            self.assertEqual(result_table_reply.IsPut(), is_put)

        # Try looking up something in the result table before anything is
        # added.
        response = self.redis.execute_command("RAY.RESULT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertIsNone(response)
        # Adding the object to the object table should have no effect.
        self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1,
                                   "hash1", "manager_id1")
        response = self.redis.execute_command("RAY.RESULT_TABLE_LOOKUP",
                                              "object_id1")
        self.assertIsNone(response)
        # Add the result to the result table. The lookup now returns the task
        # ID.
        task_id = b"task_id1"
        self.redis.execute_command("RAY.RESULT_TABLE_ADD", "object_id1",
                                   task_id, 0)
        response = self.redis.execute_command("RAY.RESULT_TABLE_LOOKUP",
                                              "object_id1")
        check_result_table_entry(response, task_id, False)
        # Doing it again should still work.
        response = self.redis.execute_command("RAY.RESULT_TABLE_LOOKUP",
                                              "object_id1")
        check_result_table_entry(response, task_id, False)
        # Try another result table lookup. This should succeed.
        task_id = b"task_id2"
        self.redis.execute_command("RAY.RESULT_TABLE_ADD", "object_id2",
                                   task_id, 1)
        response = self.redis.execute_command("RAY.RESULT_TABLE_LOOKUP",
                                              "object_id2")
        check_result_table_entry(response, task_id, True)

    def testInvalidTaskTableAdd(self):
        """Malformed RAY.TASK_TABLE_ADD / UPDATE calls raise ResponseError."""
        # Check that Redis returns an error when RAY.TASK_TABLE_ADD is called
        # with the wrong arguments.
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.TASK_TABLE_ADD")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.TASK_TABLE_ADD", "hello")
        with self.assertRaises(redis.ResponseError):
            self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id", 3,
                                       "node_id")
        with self.assertRaises(redis.ResponseError):
            # Non-integer scheduling states should not be added.
            self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
                                       "invalid_state", "node_id", "task_spec")
        with self.assertRaises(redis.ResponseError):
            # Should not be able to update a non-existent task.
            self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", 10,
                                       "node_id", b"")

    def testTaskTableAddAndLookup(self):
        """Task table add, update, get and test-and-update round-trip."""
        # The states are distinct bits so they can be OR-ed into a bitmask
        # test value further down.
        TASK_STATUS_WAITING = 1
        TASK_STATUS_SCHEDULED = 2
        TASK_STATUS_QUEUED = 4

        # make sure somebody will get a notification (checked in the redis
        # module)
        p = self.redis.pubsub()
        p.psubscribe("{prefix}*:*".format(prefix=ray.gcs_utils.TASK_PREFIX))

        def check_task_reply(message, task_args, updated=False):
            # execution_dependencies_string is unpacked to keep the tuple
            # shape but is not asserted on here.
            (task_status, local_scheduler_id, execution_dependencies_string,
             spillback_count, task_spec) = task_args
            task_reply_object = ray.gcs_utils.TaskReply.GetRootAsTaskReply(
                message, 0)
            self.assertEqual(task_reply_object.State(), task_status)
            self.assertEqual(task_reply_object.LocalSchedulerId(),
                             local_scheduler_id)
            self.assertEqual(task_reply_object.SpillbackCount(),
                             spillback_count)
            self.assertEqual(task_reply_object.TaskSpec(), task_spec)
            self.assertEqual(task_reply_object.Updated(), updated)

        # Check that task table adds, updates, and lookups work correctly.
        task_args = [TASK_STATUS_WAITING, b"node_id", b"", 0, b"task_spec"]
        response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
                                              *task_args)
        response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
        check_task_reply(response, task_args)

        task_args[0] = TASK_STATUS_SCHEDULED
        # The update call is sent every field except the trailing task spec.
        self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id",
                                   *task_args[:4])
        response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
        check_task_reply(response, task_args)

        # If the current value, test value, and set value are all the same, the
        # update happens, and the response is still the same task.
        # From here on task_args[0] is the test value and task_args[1:] is the
        # expected task, so the first three entries are what gets sent.
        task_args = [task_args[0]] + task_args
        response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                              "task_id", *task_args[:3])
        check_task_reply(response, task_args[1:], updated=True)
        # Check that the task entry is still the same.
        get_response = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                  "task_id")
        check_task_reply(get_response, task_args[1:])

        # If the current value is the same as the test value, and the set value
        # is different, the update happens, and the response is the entire
        # task.
        task_args[1] = TASK_STATUS_QUEUED
        response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                              "task_id", *task_args[:3])
        check_task_reply(response, task_args[1:], updated=True)
        # Check that the update happened.
        get_response = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                  "task_id")
        check_task_reply(get_response, task_args[1:])

        # If the current value is no longer the same as the test value, the
        # response is the same task as before the test-and-set.
        new_task_args = task_args[:]
        new_task_args[1] = TASK_STATUS_WAITING
        response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                              "task_id", *new_task_args[:3])
        check_task_reply(response, task_args[1:], updated=False)
        # Check that the update did not happen.
        get_response2 = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                   "task_id")
        self.assertEqual(get_response2, get_response)

        # If the test value is a bitmask that matches the current value, the
        # update happens.
        task_args = new_task_args
        task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
        response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                              "task_id", *task_args[:3])
        check_task_reply(response, task_args[1:], updated=True)

        # If the test value is a bitmask that does not match the current value,
        # the update does not happen, and the response is the same task as
        # before the test-and-set.
        new_task_args = task_args[:]
        new_task_args[0] = TASK_STATUS_SCHEDULED
        old_response = response
        response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                              "task_id", *new_task_args[:3])
        check_task_reply(response, task_args[1:], updated=False)
        # Check that the update did not happen.
        get_response = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                  "task_id")
        # old_response was checked with updated=True above while the GET reply
        # is checked with updated=False below, so the two must differ.
        self.assertNotEqual(get_response, old_response)
        check_task_reply(get_response, task_args[1:])

    def check_task_subscription(self, p, scheduling_state, local_scheduler_id):
        """Add a task and check the notification received on ``p``.

        Args:
            p: A pubsub object that is already subscribed to a task channel.
            scheduling_state: The state to add the task with.
            local_scheduler_id: The local scheduler ID as a str; it is
                ASCII-encoded before being sent.
        """
        task_args = [
            b"task_id", scheduling_state,
            local_scheduler_id.encode("ascii"), b"", 0, b"task_spec"
        ]
        self.redis.execute_command("RAY.TASK_TABLE_ADD", *task_args)
        # Receive the data.
        message = get_next_message(p)["data"]
        # Check that the notification object is correct.
        notification_object = ray.gcs_utils.TaskReply.GetRootAsTaskReply(
            message, 0)
        self.assertEqual(notification_object.TaskId(), task_args[0])
        self.assertEqual(notification_object.State(), task_args[1])
        self.assertEqual(notification_object.LocalSchedulerId(), task_args[2])
        self.assertEqual(notification_object.ExecutionDependencies(),
                         task_args[3])
        self.assertEqual(notification_object.TaskSpec(), task_args[-1])

    def testTaskTableSubscribe(self):
        """Task notifications arrive for three different channel patterns."""
        scheduling_state = 1
        local_scheduler_id = "local_scheduler_id"
        # Subscribe to the task table.
        p = self.redis.pubsub()
        p.psubscribe("{prefix}*:*".format(prefix=ray.gcs_utils.TASK_PREFIX))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 1)
        self.check_task_subscription(p, scheduling_state, local_scheduler_id)
        # unsubscribe to make sure there is only one subscriber at a given time
        p.punsubscribe("{prefix}*:*".format(prefix=ray.gcs_utils.TASK_PREFIX))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 0)

        # Same check, subscribing by scheduling state only.
        p.psubscribe("{prefix}*:{state}".format(
            prefix=ray.gcs_utils.TASK_PREFIX, state=scheduling_state))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 1)
        self.check_task_subscription(p, scheduling_state, local_scheduler_id)
        p.punsubscribe("{prefix}*:{state}".format(
            prefix=ray.gcs_utils.TASK_PREFIX, state=scheduling_state))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 0)

        # Same check, subscribing by local scheduler ID only.
        p.psubscribe("{prefix}{local_scheduler_id}:*".format(
            prefix=ray.gcs_utils.TASK_PREFIX,
            local_scheduler_id=local_scheduler_id))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 1)
        self.check_task_subscription(p, scheduling_state, local_scheduler_id)
        p.punsubscribe("{prefix}{local_scheduler_id}:*".format(
            prefix=ray.gcs_utils.TASK_PREFIX,
            local_scheduler_id=local_scheduler_id))
        # Receive acknowledgment.
        self.assertEqual(get_next_message(p)["data"], 0)
# Run the tests with per-test output when this file is executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)
-181
View File
@@ -1,181 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pickle
import sys
import unittest
import ray.local_scheduler as local_scheduler
import ray.ray_constants as ray_constants
def random_object_id():
    """Return an ObjectID built from ID_SIZE random bytes."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_function_id():
    """Return a random function ID, represented here as an ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_driver_id():
    """Return a random driver ID, represented here as an ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_task_id():
    """Return a random task ID, represented here as an ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
# Values that test_serialize_by_value expects check_simple_value to accept.
BASE_SIMPLE_OBJECTS = [
    0,
    1,
    100000,
    0.0,
    0.5,
    0.9,
    100000.1,
    (),
    [],
    {},
    "",
    990 * "h",
    u"",
    990 * u"h",
    np.ones(3),
    np.array([True, False]),
    None,
    True,
    False,
]
if sys.version_info < (3, 0):
    # `long` only exists on Python 2.
    BASE_SIMPLE_OBJECTS.extend([
        long(0),  # noqa: E501,F821
        long(1),  # noqa: E501,F821
        long(100000),  # noqa: E501,F821
        long(1 << 100),  # noqa: E501,F821
    ])

# The same values wrapped in a one-element list, tuple and dict.
LIST_SIMPLE_OBJECTS = [[obj] for obj in BASE_SIMPLE_OBJECTS]
TUPLE_SIMPLE_OBJECTS = [(obj, ) for obj in BASE_SIMPLE_OBJECTS]
DICT_SIMPLE_OBJECTS = [{(): obj} for obj in BASE_SIMPLE_OBJECTS]

SIMPLE_OBJECTS = (
    BASE_SIMPLE_OBJECTS
    + LIST_SIMPLE_OBJECTS
    + TUPLE_SIMPLE_OBJECTS
    + DICT_SIMPLE_OBJECTS
)
# Create some complex objects that cannot be serialized by value in tasks.
# `lst` is a list whose only element is the list itself.
lst = []
lst += [lst]
class Foo(object):
    """Empty user-defined class used as a complex test value."""

    def __init__(self):
        """Create an instance with no attributes."""
# Values that test_serialize_by_value expects check_simple_value to reject.
BASE_COMPLEX_OBJECTS = [
    15000 * "h",
    15000 * u"h",
    lst,
    Foo(),
    100 * [100 * [10 * [1]]],
    np.array([Foo()]),
]

# The same values wrapped in a one-element list, tuple and dict.
LIST_COMPLEX_OBJECTS = [[obj] for obj in BASE_COMPLEX_OBJECTS]
TUPLE_COMPLEX_OBJECTS = [(obj, ) for obj in BASE_COMPLEX_OBJECTS]
DICT_COMPLEX_OBJECTS = [{(): obj} for obj in BASE_COMPLEX_OBJECTS]

COMPLEX_OBJECTS = (
    BASE_COMPLEX_OBJECTS
    + LIST_COMPLEX_OBJECTS
    + TUPLE_COMPLEX_OBJECTS
    + DICT_COMPLEX_OBJECTS
)
class TestSerialization(unittest.TestCase):
    """Checks which values check_simple_value accepts."""

    def test_serialize_by_value(self):
        """Simple objects are accepted; complex objects are rejected."""
        is_simple = local_scheduler.check_simple_value
        for simple_obj in SIMPLE_OBJECTS:
            self.assertTrue(is_simple(simple_obj))
        for complex_obj in COMPLEX_OBJECTS:
            self.assertFalse(is_simple(complex_obj))
class TestObjectID(unittest.TestCase):
    """Tests for construction, pickling, equality and hashing of ObjectID."""

    def test_create_object_id(self):
        """Constructing a random ObjectID does not raise."""
        random_object_id()

    def test_cannot_pickle_object_ids(self):
        """Pickling an ObjectID, or anything that references one, raises."""
        object_ids = [random_object_id() for _ in range(256)]

        def f():
            return object_ids

        def g(val=object_ids):
            return 1

        def h():
            object_ids[0]
            return 1

        # Make sure that object IDs cannot be pickled (including functions that
        # close over object IDs).
        for unpicklable in (object_ids[0], object_ids, f, g, h):
            with self.assertRaises(Exception):
                pickle.dumps(unpicklable)

    def test_equality_comparisons(self):
        """ObjectIDs built from equal bytes compare and hash as equal."""
        make_id = local_scheduler.ObjectID
        x1 = make_id(ray_constants.ID_SIZE * b"a")
        x2 = make_id(ray_constants.ID_SIZE * b"a")
        y1 = make_id(ray_constants.ID_SIZE * b"b")
        y2 = make_id(ray_constants.ID_SIZE * b"b")
        self.assertEqual(x1, x2)
        self.assertEqual(y1, y2)
        self.assertNotEqual(x1, y1)

        # Build two independent sets of IDs from the same 256 byte strings.
        random_strings = [
            np.random.bytes(ray_constants.ID_SIZE) for _ in range(256)
        ]
        object_ids1 = [make_id(raw) for raw in random_strings]
        object_ids2 = [make_id(raw) for raw in random_strings]
        self.assertEqual(len(set(object_ids1)), 256)
        self.assertEqual(len(set(object_ids1 + object_ids2)), 256)
        self.assertEqual(set(object_ids1), set(object_ids2))

    def test_hashability(self):
        """ObjectIDs can be used as dict keys and set members."""
        x = random_object_id()
        y = random_object_id()
        dict([(x, y)])
        set([x, y])
class TestTask(unittest.TestCase):
    """Tests for building tasks and round-tripping them through strings."""

    def check_task(self, task, function_id, num_return_vals, args):
        """Assert that ``task`` carries the given function ID and arguments.

        Args:
            task: The task object to inspect.
            function_id: The function ID the task was created with.
            num_return_vals: The expected number of return values.
            args: The argument list the task was created with.
        """
        self.assertEqual(function_id.id(), task.function_id().id())
        retrieved_args = task.arguments()
        self.assertEqual(num_return_vals, len(task.returns()))
        self.assertEqual(len(args), len(retrieved_args))
        for expected, actual in zip(args, retrieved_args):
            if isinstance(actual, local_scheduler.ObjectID):
                # Object IDs are compared through their underlying ID value.
                self.assertEqual(actual.id(), expected.id())
            else:
                self.assertEqual(actual, expected)

    def test_create_and_serialize_task(self):
        """Tasks survive task_to_string / task_from_string unchanged."""
        # TODO(rkn): The function ID should be a FunctionID object, not an
        # ObjectID.
        driver_id = random_driver_id()
        parent_id = random_task_id()
        function_id = random_function_id()
        object_ids = [random_object_id() for _ in range(256)]

        # Plain values of growing length, then object IDs of growing length,
        # then mixtures of the two.
        args_list = [[]]
        args_list += [count * [1] for count in (1, 10, 100, 1000)]
        args_list += [count * ["a"] for count in (1, 10, 100, 1000)]
        args_list.append([1, 1.3, 2, 1 << 100, "hi", u"hi", [1, 2]])
        args_list += [
            object_ids[:count] for count in (1, 2, 3, 4, 5, 10, 100, 256)
        ]
        args_list += [
            [1, object_ids[0]],
            [object_ids[0], "a"],
            [1, object_ids[0], "a"],
            [object_ids[0], 1, object_ids[1], "a"],
            object_ids[:3] + [1, "hi", 2.3] + object_ids[:5],
            object_ids + 100 * ["a"] + object_ids,
        ]

        return_counts = [0, 1, 2, 3, 5, 10, 100]
        for args in args_list:
            for num_return_vals in return_counts:
                task = local_scheduler.Task(driver_id, function_id, args,
                                            num_return_vals, parent_id, 0)
                self.check_task(task, function_id, num_return_vals, args)
                # Round-trip through the string form and check again.
                serialized = local_scheduler.task_to_string(task)
                restored = local_scheduler.task_from_string(serialized)
                self.check_task(restored, function_id, num_return_vals, args)
# Run the tests with per-test output when this file is executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)
View File
+2 -4
View File
@@ -108,8 +108,6 @@ class SGDWorker(object):
if plasma_op:
store_socket = (
ray.worker.global_worker.plasma_client.store_socket_name)
manager_socket = (
ray.worker.global_worker.plasma_client.manager_socket_name)
if not plasma.tf_plasma_op:
plasma.build_plasma_tensorflow_op()
@@ -130,7 +128,7 @@ class SGDWorker(object):
[grad],
self.plasma_in_grads_oids[j],
plasma_store_socket_name=store_socket,
plasma_manager_socket_name=manager_socket)
plasma_manager_socket_name="")
self.plasma_in_grads.append(plasma_grad)
# For applying grads <- plasma
@@ -149,7 +147,7 @@ class SGDWorker(object):
self.plasma_out_grads_oids[j],
dtype=tf.float32,
plasma_store_socket_name=store_socket,
plasma_manager_socket_name=manager_socket)
plasma_manager_socket_name="")
grad_ph = tf.reshape(grad_ph,
self.packed_grads_and_vars[0][j][0].shape)
logger.debug("Packed tensor {}".format(grad_ph))
+4 -9
View File
@@ -14,15 +14,10 @@ logger = logging.getLogger(__name__)
def fetch(oids):
if ray.global_state.use_raylet:
local_sched_client = ray.worker.global_worker.local_scheduler_client
for o in oids:
ray_obj_id = ray.ObjectID(o)
local_sched_client.reconstruct_objects([ray_obj_id], True)
else:
for o in oids:
plasma_id = ray.pyarrow.plasma.ObjectID(o)
ray.worker.global_worker.plasma_client.fetch([plasma_id])
local_sched_client = ray.worker.global_worker.local_scheduler_client
for o in oids:
ray_obj_id = ray.ObjectID(o)
local_sched_client.reconstruct_objects([ray_obj_id], True)
def run_timeline(sess, ops, feed_dict=None, write_timeline=False, name=""):
+128 -398
View File
@@ -6,8 +6,6 @@ import copy
from collections import defaultdict
import heapq
import json
import numbers
import os
import redis
import sys
import time
@@ -18,25 +16,6 @@ import ray.ray_constants as ray_constants
from ray.utils import (decode, binary_to_object_id, binary_to_hex,
hex_to_binary)
# This mapping from integer to task state string must be kept up-to-date with
# the scheduling_state enum in task.h.
TASK_STATUS_WAITING = 1
TASK_STATUS_SCHEDULED = 2
TASK_STATUS_QUEUED = 4
TASK_STATUS_RUNNING = 8
TASK_STATUS_DONE = 16
TASK_STATUS_LOST = 32
TASK_STATUS_RECONSTRUCTING = 64
TASK_STATUS_MAPPING = {
TASK_STATUS_WAITING: "WAITING",
TASK_STATUS_SCHEDULED: "SCHEDULED",
TASK_STATUS_QUEUED: "QUEUED",
TASK_STATUS_RUNNING: "RUNNING",
TASK_STATUS_DONE: "DONE",
TASK_STATUS_LOST: "LOST",
TASK_STATUS_RECONSTRUCTING: "RECONSTRUCTING",
}
class GlobalState(object):
"""A class used to interface with the Ray control state.
@@ -47,7 +26,6 @@ class GlobalState(object):
Attributes:
redis_client: The Redis client used to query the primary redis server.
redis_clients: Redis clients for each of the Redis shards.
use_raylet: True if we are using the raylet code path.
"""
def __init__(self):
@@ -57,8 +35,6 @@ class GlobalState(object):
self.redis_client = None
# Clients for the redis shards, storing the object table & task table.
self.redis_clients = None
# True if we are using the raylet code path and false otherwise.
self.use_raylet = None
def _check_connected(self):
"""Check that the object has been initialized before it is used.
@@ -130,18 +106,6 @@ class GlobalState(object):
"ip_address_ports = {}".format(
num_redis_shards, ip_address_ports))
use_raylet = self.redis_client.get("UseRaylet")
if use_raylet is not None:
self.use_raylet = bool(int(use_raylet))
elif os.environ.get("RAY_USE_XRAY") == "0":
# This environment variable is used in our testing setup.
print("Detected environment variable 'RAY_USE_XRAY' with value "
"{}. This turns OFF xray.".format(
os.environ.get("RAY_USE_XRAY")))
self.use_raylet = False
else:
self.use_raylet = True
# Get the rest of the information.
self.redis_clients = []
for ip_address_port in ip_address_ports:
@@ -195,51 +159,23 @@ class GlobalState(object):
object_id = ray.ObjectID(hex_to_binary(object_id))
# Return information about a single object ID.
if not self.use_raylet:
# Use the non-raylet code path.
object_locations = self._execute_command(
object_id, "RAY.OBJECT_TABLE_LOOKUP", object_id.id())
if object_locations is not None:
manager_ids = [
binary_to_hex(manager_id)
for manager_id in object_locations
]
else:
manager_ids = None
message = self._execute_command(object_id, "RAY.TABLE_LOOKUP",
ray.gcs_utils.TablePrefix.OBJECT, "",
object_id.id())
result = []
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
result_table_response = self._execute_command(
object_id, "RAY.RESULT_TABLE_LOOKUP", object_id.id())
result_table_message = (
ray.gcs_utils.ResultTableReply.GetRootAsResultTableReply(
result_table_response, 0))
result = {
"ManagerIDs": manager_ids,
"TaskID": binary_to_hex(result_table_message.TaskId()),
"IsPut": bool(result_table_message.IsPut()),
"DataSize": result_table_message.DataSize(),
"Hash": binary_to_hex(result_table_message.Hash())
for i in range(gcs_entry.EntriesLength()):
entry = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
gcs_entry.Entries(i), 0)
object_info = {
"DataSize": entry.ObjectSize(),
"Manager": entry.Manager(),
"IsEviction": entry.IsEviction(),
"NumEvictions": entry.NumEvictions()
}
else:
# Use the raylet code path.
message = self._execute_command(object_id, "RAY.TABLE_LOOKUP",
ray.gcs_utils.TablePrefix.OBJECT,
"", object_id.id())
result = []
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
for i in range(gcs_entry.EntriesLength()):
entry = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
gcs_entry.Entries(i), 0)
object_info = {
"DataSize": entry.ObjectSize(),
"Manager": entry.Manager(),
"IsEviction": entry.IsEviction(),
"NumEvictions": entry.NumEvictions()
}
result.append(object_info)
result.append(object_info)
return result
@@ -259,25 +195,12 @@ class GlobalState(object):
return self._object_table(object_id)
else:
# Return the entire object table.
if not self.use_raylet:
object_info_keys = self._keys(
ray.gcs_utils.OBJECT_INFO_PREFIX + "*")
object_location_keys = self._keys(
ray.gcs_utils.OBJECT_LOCATION_PREFIX + "*")
object_ids_binary = set([
key[len(ray.gcs_utils.OBJECT_INFO_PREFIX):]
for key in object_info_keys
] + [
key[len(ray.gcs_utils.OBJECT_LOCATION_PREFIX):]
for key in object_location_keys
])
else:
object_keys = self._keys(
ray.gcs_utils.TablePrefix_OBJECT_string + "*")
object_ids_binary = {
key[len(ray.gcs_utils.TablePrefix_OBJECT_string):]
for key in object_keys
}
object_keys = self._keys(ray.gcs_utils.TablePrefix_OBJECT_string +
"*")
object_ids_binary = {
key[len(ray.gcs_utils.TablePrefix_OBJECT_string):]
for key in object_keys
}
results = {}
for object_id_binary in object_ids_binary:
@@ -294,21 +217,21 @@ class GlobalState(object):
Returns:
A dictionary with information about the task ID in question.
TASK_STATUS_MAPPING should be used to parse the "State" field
into a human-readable string.
"""
if not self.use_raylet:
# Use the non-raylet code path.
task_table_response = self._execute_command(
task_id, "RAY.TASK_TABLE_GET", task_id.id())
if task_table_response is None:
raise Exception("There is no entry for task ID {} in the task "
"table.".format(binary_to_hex(task_id.id())))
task_table_message = ray.gcs_utils.TaskReply.GetRootAsTaskReply(
task_table_response, 0)
task_spec = task_table_message.TaskSpec()
task_spec = ray.local_scheduler.task_from_string(task_spec)
message = self._execute_command(task_id, "RAY.TABLE_LOOKUP",
ray.gcs_utils.TablePrefix.RAYLET_TASK,
"", task_id.id())
gcs_entries = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
info = []
for i in range(gcs_entries.EntriesLength()):
task_table_message = ray.gcs_utils.Task.GetRootAsTask(
gcs_entries.Entries(i), 0)
execution_spec = task_table_message.TaskExecutionSpec()
task_spec = task_table_message.TaskSpecification()
task_spec = ray.raylet.task_from_string(task_spec)
task_spec_info = {
"DriverID": binary_to_hex(task_spec.driver_id().id()),
"TaskID": binary_to_hex(task_spec.task_id().id()),
@@ -326,80 +249,19 @@ class GlobalState(object):
"RequiredResources": task_spec.required_resources()
}
execution_dependencies_message = (
ray.gcs_utils.TaskExecutionDependencies.
GetRootAsTaskExecutionDependencies(
task_table_message.ExecutionDependencies(), 0))
execution_dependencies = [
ray.ObjectID(
execution_dependencies_message.ExecutionDependencies(i))
for i in range(execution_dependencies_message.
ExecutionDependenciesLength())
]
# TODO(rkn): The return fields ExecutionDependenciesString and
# ExecutionDependencies are redundant, so we should remove
# ExecutionDependencies. However, it is currently used in
# monitor.py.
return {
"State": task_table_message.State(),
"LocalSchedulerID": binary_to_hex(
task_table_message.LocalSchedulerId()),
"ExecutionDependenciesString": task_table_message.
ExecutionDependencies(),
"ExecutionDependencies": execution_dependencies,
"SpillbackCount": task_table_message.SpillbackCount(),
info.append({
"ExecutionSpec": {
"Dependencies": [
execution_spec.Dependencies(i)
for i in range(execution_spec.DependenciesLength())
],
"LastTimestamp": execution_spec.LastTimestamp(),
"NumForwards": execution_spec.NumForwards()
},
"TaskSpec": task_spec_info
}
})
else:
# Use the raylet code path.
message = self._execute_command(
task_id, "RAY.TABLE_LOOKUP",
ray.gcs_utils.TablePrefix.RAYLET_TASK, "", task_id.id())
gcs_entries = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
info = []
for i in range(gcs_entries.EntriesLength()):
task_table_message = ray.gcs_utils.Task.GetRootAsTask(
gcs_entries.Entries(i), 0)
execution_spec = task_table_message.TaskExecutionSpec()
task_spec = task_table_message.TaskSpecification()
task_spec = ray.local_scheduler.task_from_string(task_spec)
task_spec_info = {
"DriverID": binary_to_hex(task_spec.driver_id().id()),
"TaskID": binary_to_hex(task_spec.task_id().id()),
"ParentTaskID": binary_to_hex(
task_spec.parent_task_id().id()),
"ParentCounter": task_spec.parent_counter(),
"ActorID": binary_to_hex(task_spec.actor_id().id()),
"ActorCreationID": binary_to_hex(
task_spec.actor_creation_id().id()),
"ActorCreationDummyObjectID": binary_to_hex(
task_spec.actor_creation_dummy_object_id().id()),
"ActorCounter": task_spec.actor_counter(),
"FunctionID": binary_to_hex(task_spec.function_id().id()),
"Args": task_spec.arguments(),
"ReturnObjectIDs": task_spec.returns(),
"RequiredResources": task_spec.required_resources()
}
info.append({
"ExecutionSpec": {
"Dependencies": [
execution_spec.Dependencies(i)
for i in range(execution_spec.DependenciesLength())
],
"LastTimestamp": execution_spec.LastTimestamp(),
"NumForwards": execution_spec.NumForwards()
},
"TaskSpec": task_spec_info
})
return info
return info
def task_table(self, task_id=None):
"""Fetch and parse the task table information for one or more task IDs.
@@ -416,19 +278,12 @@ class GlobalState(object):
task_id = ray.ObjectID(hex_to_binary(task_id))
return self._task_table(task_id)
else:
if not self.use_raylet:
task_table_keys = self._keys(ray.gcs_utils.TASK_PREFIX + "*")
task_ids_binary = [
key[len(ray.gcs_utils.TASK_PREFIX):]
for key in task_table_keys
]
else:
task_table_keys = self._keys(
ray.gcs_utils.TablePrefix_RAYLET_TASK_string + "*")
task_ids_binary = [
key[len(ray.gcs_utils.TablePrefix_RAYLET_TASK_string):]
for key in task_table_keys
]
task_table_keys = self._keys(
ray.gcs_utils.TablePrefix_RAYLET_TASK_string + "*")
task_ids_binary = [
key[len(ray.gcs_utils.TablePrefix_RAYLET_TASK_string):]
for key in task_table_keys
]
results = {}
for task_id_binary in task_ids_binary:
@@ -464,95 +319,54 @@ class GlobalState(object):
Information about the Ray clients in the cluster.
"""
self._check_connected()
if not self.use_raylet:
db_client_keys = self.redis_client.keys(
ray.gcs_utils.DB_CLIENT_PREFIX + "*")
node_info = {}
for key in db_client_keys:
client_info = self.redis_client.hgetall(key)
node_ip_address = decode(client_info[b"node_ip_address"])
if node_ip_address not in node_info:
node_info[node_ip_address] = []
client_info_parsed = {}
assert b"client_type" in client_info
assert b"deleted" in client_info
assert b"ray_client_id" in client_info
for field, value in client_info.items():
if field == b"node_ip_address":
pass
elif field == b"client_type":
client_info_parsed["ClientType"] = decode(value)
elif field == b"deleted":
client_info_parsed["Deleted"] = bool(
int(decode(value)))
elif field == b"ray_client_id":
client_info_parsed["DBClientID"] = binary_to_hex(value)
elif field == b"manager_address":
client_info_parsed["AuxAddress"] = decode(value)
elif field == b"local_scheduler_socket_name":
client_info_parsed["LocalSchedulerSocketName"] = (
decode(value))
elif client_info[b"client_type"] == b"local_scheduler":
# The remaining fields are resource types.
client_info_parsed[decode(field)] = float(
decode(value))
else:
client_info_parsed[decode(field)] = decode(value)
node_info[node_ip_address].append(client_info_parsed)
NIL_CLIENT_ID = ray_constants.ID_SIZE * b"\xff"
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.CLIENT, "",
NIL_CLIENT_ID)
return node_info
# Handle the case where no clients are returned. This should only
# occur potentially immediately after the cluster is started.
if message is None:
return []
else:
# This is the raylet code path.
NIL_CLIENT_ID = ray_constants.ID_SIZE * b"\xff"
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.CLIENT, "",
NIL_CLIENT_ID)
node_info = {}
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
# Handle the case where no clients are returned. This should only
# occur potentially immediately after the cluster is started.
if message is None:
return []
# Since GCS entries are append-only, we override so that
# only the latest entries are kept.
for i in range(gcs_entry.EntriesLength()):
client = (ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
gcs_entry.Entries(i), 0))
node_info = {}
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
resources = {
decode(client.ResourcesTotalLabel(i)):
client.ResourcesTotalCapacity(i)
for i in range(client.ResourcesTotalLabelLength())
}
client_id = ray.utils.binary_to_hex(client.ClientId())
# Since GCS entries are append-only, we override so that
# only the latest entries are kept.
for i in range(gcs_entry.EntriesLength()):
client = (
ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
gcs_entry.Entries(i), 0))
# If this client is being removed, then it must
# have previously been inserted, and
# it cannot have previously been removed.
if not client.IsInsertion():
assert client_id in node_info, "Client removed not found!"
assert node_info[client_id]["IsInsertion"], (
"Unexpected duplicate removal of client.")
resources = {
decode(client.ResourcesTotalLabel(i)):
client.ResourcesTotalCapacity(i)
for i in range(client.ResourcesTotalLabelLength())
}
client_id = ray.utils.binary_to_hex(client.ClientId())
# If this client is being removed, then it must
# have previously been inserted, and
# it cannot have previously been removed.
if not client.IsInsertion():
assert client_id in node_info, "Client removed not found!"
assert node_info[client_id]["IsInsertion"], (
"Unexpected duplicate removal of client.")
node_info[client_id] = {
"ClientID": client_id,
"IsInsertion": client.IsInsertion(),
"NodeManagerAddress": decode(client.NodeManagerAddress()),
"NodeManagerPort": client.NodeManagerPort(),
"ObjectManagerPort": client.ObjectManagerPort(),
"ObjectStoreSocketName": decode(
client.ObjectStoreSocketName()),
"RayletSocketName": decode(client.RayletSocketName()),
"Resources": resources
}
return list(node_info.values())
node_info[client_id] = {
"ClientID": client_id,
"IsInsertion": client.IsInsertion(),
"NodeManagerAddress": decode(client.NodeManagerAddress()),
"NodeManagerPort": client.NodeManagerPort(),
"ObjectManagerPort": client.ObjectManagerPort(),
"ObjectStoreSocketName": decode(
client.ObjectStoreSocketName()),
"RayletSocketName": decode(client.RayletSocketName()),
"Resources": resources
}
return list(node_info.values())
def log_files(self):
"""Fetch and return a dictionary of log file names to outputs.
@@ -755,10 +569,6 @@ class GlobalState(object):
return profile_events
def profile_table(self):
if not self.use_raylet:
raise Exception("This method is only supported in the raylet "
"code path.")
profile_table_keys = self._keys(
ray.gcs_utils.TablePrefix_PROFILE_string + "*")
component_identifiers_binary = [
@@ -1207,23 +1017,6 @@ class GlobalState(object):
info[key] = cur
latest_timestamp = cur
def local_schedulers(self):
    """Get a list of live local schedulers.

    Returns:
        A list with the client table entry of every client whose
        "ClientType" is "local_scheduler" and whose "Deleted" flag is
        false.

    Raises:
        Exception: If this object is using the raylet code path.
    """
    if self.use_raylet:
        raise Exception("The local_schedulers() method is deprecated.")
    # The client table is keyed by IP address; each value is the list of
    # clients registered for that address.
    return [
        entry
        for entries in self.client_table().values()
        for entry in entries
        if entry["ClientType"] == "local_scheduler" and not entry["Deleted"]
    ]
def workers(self):
"""Get a dictionary mapping worker ID to worker information."""
worker_keys = self.redis_client.keys("Worker*")
@@ -1237,8 +1030,6 @@ class GlobalState(object):
"local_scheduler_socket": (decode(
worker_info[b"local_scheduler_socket"])),
"node_ip_address": decode(worker_info[b"node_ip_address"]),
"plasma_manager_socket": decode(
worker_info[b"plasma_manager_socket"]),
"plasma_store_socket": decode(
worker_info[b"plasma_store_socket"])
}
@@ -1298,24 +1089,12 @@ class GlobalState(object):
resource in the cluster.
"""
resources = defaultdict(int)
if not self.use_raylet:
local_schedulers = self.local_schedulers()
for local_scheduler in local_schedulers:
for key, value in local_scheduler.items():
if key not in [
"ClientType", "Deleted", "DBClientID",
"AuxAddress", "LocalSchedulerSocketName"
]:
resources[key] += value
else:
clients = self.client_table()
for client in clients:
# Only count resources from live clients.
if client["IsInsertion"]:
for key, value in client["Resources"].items():
resources[key] += value
clients = self.client_table()
for client in clients:
# Only count resources from live clients.
if client["IsInsertion"]:
for key, value in client["Resources"].items():
resources[key] += value
return dict(resources)
@@ -1340,93 +1119,48 @@ class GlobalState(object):
"""
available_resources_by_id = {}
if not self.use_raylet:
subscribe_client = self.redis_client.pubsub()
subscribe_client.subscribe(
ray.gcs_utils.LOCAL_SCHEDULER_INFO_CHANNEL)
subscribe_clients = [
redis_client.pubsub(ignore_subscribe_messages=True)
for redis_client in self.redis_clients
]
for subscribe_client in subscribe_clients:
subscribe_client.subscribe(ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL)
local_scheduler_ids = {
local_scheduler["DBClientID"]
for local_scheduler in self.local_schedulers()
}
client_ids = self._live_client_ids()
while set(available_resources_by_id.keys()) != local_scheduler_ids:
while set(available_resources_by_id.keys()) != client_ids:
for subscribe_client in subscribe_clients:
# Parse client message
raw_message = subscribe_client.get_message()
if raw_message is None:
if (raw_message is None or raw_message["channel"] !=
ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL):
continue
data = raw_message["data"]
# Ignore the subscription success message from Redis.
# This is a long in python 2 and an int in python 3
if isinstance(data, numbers.Number):
continue
message = (ray.gcs_utils.LocalSchedulerInfoMessage.
GetRootAsLocalSchedulerInfoMessage(data, 0))
num_resources = message.DynamicResourcesLength()
gcs_entries = (
ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
data, 0))
heartbeat_data = gcs_entries.Entries(0)
message = (ray.gcs_utils.HeartbeatTableData.
GetRootAsHeartbeatTableData(heartbeat_data, 0))
# Calculate available resources for this client
num_resources = message.ResourcesAvailableLabelLength()
dynamic_resources = {}
for i in range(num_resources):
dyn = message.DynamicResources(i)
resource_id = decode(dyn.Key())
dynamic_resources[resource_id] = dyn.Value()
resource_id = decode(message.ResourcesAvailableLabel(i))
dynamic_resources[resource_id] = (
message.ResourcesAvailableCapacity(i))
# Update available resources for this local scheduler
client_id = binary_to_hex(message.DbClientId())
# Update available resources for this client
client_id = ray.utils.binary_to_hex(message.ClientId())
available_resources_by_id[client_id] = dynamic_resources
# Update local schedulers in cluster
local_scheduler_ids = {
local_scheduler["DBClientID"]
for local_scheduler in self.local_schedulers()
}
# Remove disconnected local schedulers
for local_scheduler_id in available_resources_by_id.keys():
if local_scheduler_id not in local_scheduler_ids:
del available_resources_by_id[local_scheduler_id]
else:
subscribe_clients = [
redis_client.pubsub(ignore_subscribe_messages=True)
for redis_client in self.redis_clients
]
for subscribe_client in subscribe_clients:
subscribe_client.subscribe(
ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL)
# Update clients in cluster
client_ids = self._live_client_ids()
while set(available_resources_by_id.keys()) != client_ids:
for subscribe_client in subscribe_clients:
# Parse client message
raw_message = subscribe_client.get_message()
if (raw_message is None or raw_message["channel"] !=
ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL):
continue
data = raw_message["data"]
gcs_entries = (
ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
data, 0))
heartbeat_data = gcs_entries.Entries(0)
message = (ray.gcs_utils.HeartbeatTableData.
GetRootAsHeartbeatTableData(heartbeat_data, 0))
# Calculate available resources for this client
num_resources = message.ResourcesAvailableLabelLength()
dynamic_resources = {}
for i in range(num_resources):
resource_id = decode(
message.ResourcesAvailableLabel(i))
dynamic_resources[resource_id] = (
message.ResourcesAvailableCapacity(i))
# Update available resources for this client
client_id = ray.utils.binary_to_hex(message.ClientId())
available_resources_by_id[client_id] = dynamic_resources
# Update clients in cluster
client_ids = self._live_client_ids()
# Remove disconnected clients
for client_id in available_resources_by_id.keys():
if client_id not in client_ids:
del available_resources_by_id[client_id]
# Remove disconnected clients
for client_id in available_resources_by_id.keys():
if client_id not in client_ids:
del available_resources_by_id[client_id]
# Calculate total available resources
total_available_resources = defaultdict(int)
@@ -1479,10 +1213,6 @@ class GlobalState(object):
A dictionary mapping job ID to a list of the error messages for
that job.
"""
if not self.use_raylet:
raise Exception("The error_messages method is only supported in "
"the raylet code path.")
if job_id is not None:
return self._error_messages(job_id)
-29
View File
@@ -4,19 +4,6 @@ from __future__ import print_function
import flatbuffers
from ray.core.generated.ResultTableReply import ResultTableReply
from ray.core.generated.SubscribeToNotificationsReply \
import SubscribeToNotificationsReply
from ray.core.generated.TaskExecutionDependencies import \
TaskExecutionDependencies
from ray.core.generated.TaskReply import TaskReply
from ray.core.generated.DriverTableMessage import DriverTableMessage
from ray.core.generated.LocalSchedulerInfoMessage import \
LocalSchedulerInfoMessage
from ray.core.generated.SubscribeToDBClientTableReply import \
SubscribeToDBClientTableReply
from ray.core.generated.TaskInfo import TaskInfo
import ray.core.generated.ErrorTableData
from ray.core.generated.GcsTableEntry import GcsTableEntry
@@ -32,29 +19,13 @@ from ray.core.generated.TablePrefix import TablePrefix
from ray.core.generated.TablePubsub import TablePubsub
__all__ = [
"SubscribeToNotificationsReply", "ResultTableReply",
"TaskExecutionDependencies", "TaskReply", "DriverTableMessage",
"LocalSchedulerInfoMessage", "SubscribeToDBClientTableReply", "TaskInfo",
"GcsTableEntry", "ClientTableData", "ErrorTableData", "HeartbeatTableData",
"DriverTableData", "ProfileTableData", "ObjectTableData", "Task",
"TablePrefix", "TablePubsub", "construct_error_message"
]
# These prefixes must be kept up-to-date with the definitions in
# ray_redis_module.cc.
DB_CLIENT_PREFIX = "CL:"
TASK_PREFIX = "TT:"
OBJECT_CHANNEL_PREFIX = "OC:"
OBJECT_INFO_PREFIX = "OI:"
OBJECT_LOCATION_PREFIX = "OL:"
FUNCTION_PREFIX = "RemoteFunction:"
# These prefixes must be kept up-to-date with the definitions in
# common/state/redis.cc
LOCAL_SCHEDULER_INFO_CHANNEL = b"local_schedulers"
PLASMA_MANAGER_HEARTBEAT_CHANNEL = b"plasma_managers"
DRIVER_DEATH_CHANNEL = b"driver_deaths"
# xray heartbeats
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")
-7
View File
@@ -1,7 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .global_scheduler_services import start_global_scheduler
__all__ = ["start_global_scheduler"]
@@ -1,61 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import subprocess
import time
def start_global_scheduler(redis_address,
                           node_ip_address,
                           use_valgrind=False,
                           use_profiler=False,
                           stdout_file=None,
                           stderr_file=None):
    """Start a global scheduler process.

    Args:
        redis_address (str): The address of the Redis instance.
        node_ip_address: The IP address of the node that this scheduler will
            run on.
        use_valgrind (bool): True if the global scheduler should be started
            inside of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the global scheduler should be started
            inside a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to. If
            no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to. If
            no redirection should happen, then this should be None.

    Returns:
        The subprocess.Popen object of the global scheduler process (a
        process handle, not an integer process ID).

    Raises:
        Exception: If both use_valgrind and use_profiler are True.
    """
    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    # The executable is located relative to this module's directory.
    global_scheduler_executable = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "../core/src/global_scheduler/global_scheduler")
    command = [
        global_scheduler_executable, "-r", redis_address, "-h", node_ip_address
    ]

    # Choose the wrapper command (if any) and how long to pause after the
    # launch; the pause is longer when the process runs under valgrind.
    if use_valgrind:
        wrapper = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
            "--error-exitcode=1"
        ]
        startup_delay = 1.0
    elif use_profiler:
        wrapper = ["valgrind", "--tool=callgrind"]
        startup_delay = 1.0
    else:
        wrapper = []
        startup_delay = 0.1

    process = subprocess.Popen(
        wrapper + command, stdout=stdout_file, stderr=stderr_file)
    time.sleep(startup_delay)
    return process
-332
View File
@@ -1,332 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import random
import signal
import sys
import time
import unittest
# The ray import must come before the pyarrow import because ray modifies the
# python path so that the right version of pyarrow is found.
import ray.global_scheduler as global_scheduler
import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
from ray.plasma.utils import create_object
from ray import services
from ray.experimental import state
import ray.ray_constants as ray_constants
import pyarrow as pa
# Whether the global scheduler is started under valgrind. The script entry
# point at the bottom of this file sets it to True when the last command line
# argument is "valgrind".
USE_VALGRIND = False
# NOTE(review): not referenced anywhere in the visible part of this file.
PLASMA_STORE_MEMORY = 1000000000
# Number of (plasma store, plasma manager, local scheduler) groups that
# setUp starts.
NUM_CLUSTER_NODES = 2
# Nil IDs: ID_SIZE bytes, each set to 0xff.
NIL_WORKER_ID = ray_constants.ID_SIZE * b"\xff"
NIL_OBJECT_ID = ray_constants.ID_SIZE * b"\xff"
NIL_ACTOR_ID = ray_constants.ID_SIZE * b"\xff"
def random_driver_id():
    """Return an ObjectID built from ID_SIZE random bytes, for a driver."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_task_id():
    """Return an ObjectID built from ID_SIZE random bytes, for a task."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_function_id():
    """Return an ObjectID built from ID_SIZE random bytes, for a function."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_object_id():
    """Return an ObjectID built from ID_SIZE random bytes, for an object."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def new_port():
    """Return a random port number between 10000 and 65535, inclusive."""
    # randrange excludes its upper bound, so 65536 keeps 65535 reachable.
    return random.randrange(10000, 65536)
class TestGlobalScheduler(unittest.TestCase):
    """Integration tests for the global scheduler.

    Every test runs against a fresh set of processes started in setUp: one
    Redis server, one global scheduler, and NUM_CLUSTER_NODES groups of
    plasma store, plasma manager and local scheduler.
    """

    def setUp(self):
        """Start Redis, the global scheduler and the per-node processes."""
        # Start one Redis server and N pairs of (plasma, local_scheduler)
        self.node_ip_address = "127.0.0.1"
        redis_address, _ = services.start_redis(
            self.node_ip_address, use_raylet=False)
        redis_port = services.get_port(redis_address)
        time.sleep(0.1)
        # Create a client for the global state store.
        self.state = state.GlobalState()
        self.state._initialize_global_state(self.node_ip_address, redis_port)

        # Start one global scheduler.
        self.p1 = global_scheduler.start_global_scheduler(
            redis_address, self.node_ip_address, use_valgrind=USE_VALGRIND)
        self.plasma_store_pids = []
        self.plasma_manager_pids = []
        self.local_scheduler_pids = []
        self.plasma_clients = []
        self.local_scheduler_clients = []

        for _ in range(NUM_CLUSTER_NODES):
            # Start the Plasma store. Plasma store name is randomly generated.
            plasma_store_name, store_process = plasma.start_plasma_store()
            self.plasma_store_pids.append(store_process)
            # Start the Plasma manager.
            # Assumption: Plasma manager name and port are randomly generated
            # by the plasma module.
            manager_info = plasma.start_plasma_manager(plasma_store_name,
                                                       redis_address)
            (plasma_manager_name, manager_process,
             plasma_manager_port) = manager_info
            self.plasma_manager_pids.append(manager_process)
            plasma_address = "{}:{}".format(self.node_ip_address,
                                            plasma_manager_port)
            plasma_client = pa.plasma.connect(plasma_store_name,
                                              plasma_manager_name, 64)
            self.plasma_clients.append(plasma_client)
            # Start the local scheduler.
            (local_scheduler_name,
             scheduler_process) = local_scheduler.start_local_scheduler(
                 plasma_store_name,
                 plasma_manager_name=plasma_manager_name,
                 plasma_address=plasma_address,
                 redis_address=redis_address,
                 static_resources={"CPU": 10})
            # Connect to the scheduler.
            local_scheduler_client = local_scheduler.LocalSchedulerClient(
                local_scheduler_name, NIL_WORKER_ID, False, random_task_id(),
                False)
            self.local_scheduler_clients.append(local_scheduler_client)
            self.local_scheduler_pids.append(scheduler_process)

    def tearDown(self):
        """Check that every process is still alive, then kill them all."""
        # Check that the processes are still alive.
        self.assertEqual(self.p1.poll(), None)
        for store_process in self.plasma_store_pids:
            self.assertEqual(store_process.poll(), None)
        for manager_process in self.plasma_manager_pids:
            self.assertEqual(manager_process.poll(), None)
        for scheduler_process in self.local_scheduler_pids:
            self.assertEqual(scheduler_process.poll(), None)

        redis_processes = services.all_processes[
            services.PROCESS_TYPE_REDIS_SERVER]
        for redis_process in redis_processes:
            self.assertEqual(redis_process.poll(), None)

        # Kill the global scheduler.
        if USE_VALGRIND:
            self.p1.send_signal(signal.SIGTERM)
            self.p1.wait()
            if self.p1.returncode != 0:
                os._exit(-1)
        else:
            self.p1.kill()
        # Kill local schedulers, plasma managers, and plasma stores.
        for scheduler_process in self.local_scheduler_pids:
            scheduler_process.kill()
        for manager_process in self.plasma_manager_pids:
            manager_process.kill()
        for store_process in self.plasma_store_pids:
            store_process.kill()
        # Kill Redis. In the event that we are using valgrind, this needs to
        # happen after we kill the global scheduler.
        while redis_processes:
            redis_process = redis_processes.pop()
            redis_process.kill()

    def get_plasma_manager_id(self):
        """Get the db_client_id with client_type equal to plasma_manager.

        Looks through the client table entries for this node and takes the
        DBClientID of the first client whose ClientType is plasma_manager.
        TODO(atumanov): write a separate function to get all plasma manager
        client IDs.

        Returns:
            The db_client_id if one is found and otherwise None.
        """
        client_list = self.state.client_table()[self.node_ip_address]
        return next((client["DBClientID"] for client in client_list
                     if client["ClientType"] == "plasma_manager"), None)

    def test_task_default_resources(self):
        """A task requires {"CPU": 1} unless resources are given."""
        task1 = local_scheduler.Task(
            random_driver_id(), random_function_id(), [random_object_id()], 0,
            random_task_id(), 0)
        self.assertEqual(task1.required_resources(), {"CPU": 1})
        task2 = local_scheduler.Task(
            random_driver_id(), random_function_id(), [random_object_id()], 0,
            random_task_id(), 0, local_scheduler.ObjectID(NIL_ACTOR_ID),
            local_scheduler.ObjectID(NIL_OBJECT_ID),
            local_scheduler.ObjectID(NIL_ACTOR_ID),
            local_scheduler.ObjectID(NIL_ACTOR_ID), 0, 0, [], {
                "CPU": 1,
                "GPU": 2
            })
        self.assertEqual(task2.required_resources(), {"CPU": 1, "GPU": 2})

    def test_redis_only_single_task(self):
        """Check the client table precondition for Redis-only scheduling."""
        # Tests global scheduler functionality by interacting with Redis and
        # checking task state transitions in Redis only. TODO(atumanov):
        # implement.

        # Check precondition for this test:
        # There should be 2n+1 db clients: the global scheduler + one local
        # scheduler and one plasma per node.
        self.assertEqual(
            len(self.state.client_table()[self.node_ip_address]),
            2 * NUM_CLUSTER_NODES + 1)
        db_client_id = self.get_plasma_manager_id()
        # Use a unittest assertion: a bare assert is skipped under -O.
        self.assertIsNotNone(db_client_id)

    @unittest.skipIf(
        os.environ.get("RAY_USE_NEW_GCS", False),
        "New GCS API doesn't have a Python API yet.")
    def test_integration_single_task(self):
        """Submit one task and wait for it to reach the queued state."""
        # There should be 2n+1 db clients: the global scheduler + one local
        # scheduler and one plasma manager per node.
        self.assertEqual(
            len(self.state.client_table()[self.node_ip_address]),
            2 * NUM_CLUSTER_NODES + 1)

        num_return_vals = [0, 1, 2, 3, 5, 10]
        # Insert the object into Redis.
        data_size = 0xf1f0
        metadata_size = 0x40
        plasma_client = self.plasma_clients[0]
        object_dep, _, _ = create_object(
            plasma_client, data_size, metadata_size, seal=True)

        # Sleep before submitting task to local scheduler.
        time.sleep(0.1)
        # Submit a task to Redis.
        task = local_scheduler.Task(
            random_driver_id(), random_function_id(),
            [local_scheduler.ObjectID(object_dep.binary())],
            num_return_vals[0], random_task_id(), 0)
        self.local_scheduler_clients[0].submit(task)
        time.sleep(0.1)
        # There should now be a task in Redis, and it should get assigned to
        # the local scheduler
        num_retries = 10
        # Stays None if the task never shows up in the task table, so the
        # check after the loop cannot hit an unbound name.
        task_status = None
        while num_retries > 0:
            task_entries = self.state.task_table()
            self.assertLessEqual(len(task_entries), 1)
            if len(task_entries) == 1:
                _, task_entry = task_entries.popitem()
                task_status = task_entry["State"]
                self.assertIn(task_status, [
                    state.TASK_STATUS_WAITING, state.TASK_STATUS_SCHEDULED,
                    state.TASK_STATUS_QUEUED
                ])
                if task_status == state.TASK_STATUS_QUEUED:
                    break
                else:
                    print(task_status)
            print("The task has not been scheduled yet, trying again.")
            num_retries -= 1
            time.sleep(1)

        if task_status != state.TASK_STATUS_QUEUED:
            # Failed to submit and schedule a single task. Fail the test and
            # let unittest run tearDown, rather than calling tearDown by hand
            # and exiting the interpreter.
            self.fail("The task was not queued before the retries ran out.")

    def integration_many_tasks_helper(self, timesync=True):
        """Submit many tasks and check that they all get scheduled.

        Args:
            timesync (bool): If True, sleep for 10ms after creating each
                object and before submitting the task that depends on it.
        """
        # There should be 2n+1 db clients: the global scheduler + one local
        # scheduler and one plasma manager per node.
        self.assertEqual(
            len(self.state.client_table()[self.node_ip_address]),
            2 * NUM_CLUSTER_NODES + 1)
        num_return_vals = [0, 1, 2, 3, 5, 10]

        # Submit a bunch of tasks to Redis.
        num_tasks = 1000
        for _ in range(num_tasks):
            # Create a new object for each task.
            data_size = np.random.randint(1 << 12)
            metadata_size = np.random.randint(1 << 9)
            plasma_client = self.plasma_clients[0]
            object_dep, _, _ = create_object(
                plasma_client, data_size, metadata_size, seal=True)
            if timesync:
                # Give 10ms for object info handler to fire (long enough to
                # yield CPU).
                time.sleep(0.010)
            task = local_scheduler.Task(
                random_driver_id(), random_function_id(),
                [local_scheduler.ObjectID(object_dep.binary())],
                num_return_vals[0], random_task_id(), 0)
            self.local_scheduler_clients[0].submit(task)
        # Check that there are the correct number of tasks in Redis and that
        # they all get assigned to the local scheduler.
        num_retries = 20
        # Start the counts at zero so the final assertion reports a normal
        # test failure if the task table never holds all of the tasks.
        num_tasks_done = 0
        num_tasks_waiting = 0
        while num_retries > 0:
            task_entries = self.state.task_table()
            self.assertLessEqual(len(task_entries), num_tasks)
            # First, check if all tasks made it to Redis.
            if len(task_entries) == num_tasks:
                task_statuses = [
                    task_entry["State"]
                    for task_entry in task_entries.values()
                ]
                self.assertTrue(
                    all(status in [
                        state.TASK_STATUS_WAITING, state.TASK_STATUS_SCHEDULED,
                        state.TASK_STATUS_QUEUED
                    ] for status in task_statuses))
                num_tasks_done = task_statuses.count(state.TASK_STATUS_QUEUED)
                num_tasks_scheduled = task_statuses.count(
                    state.TASK_STATUS_SCHEDULED)
                num_tasks_waiting = task_statuses.count(
                    state.TASK_STATUS_WAITING)
                print("tasks in Redis = {}, tasks waiting = {}, "
                      "tasks scheduled = {}, "
                      "tasks queued = {}, retries left = {}".format(
                          len(task_entries), num_tasks_waiting,
                          num_tasks_scheduled, num_tasks_done, num_retries))
                if all(status == state.TASK_STATUS_QUEUED
                       for status in task_statuses):
                    # We're done, so pass.
                    break
            num_retries -= 1
            time.sleep(0.1)

        # Tasks can either be queued or in the global scheduler due to
        # spillback.
        self.assertEqual(num_tasks_done + num_tasks_waiting, num_tasks)

    @unittest.skipIf(
        os.environ.get("RAY_USE_NEW_GCS", False),
        "New GCS API doesn't have a Python API yet.")
    def test_integration_many_tasks_handler_sync(self):
        """Run the many-tasks check with a pause after each object."""
        self.integration_many_tasks_helper(timesync=True)

    @unittest.skipIf(
        os.environ.get("RAY_USE_NEW_GCS", False),
        "New GCS API doesn't have a Python API yet.")
    def test_integration_many_tasks(self):
        """Run the many-tasks check without pausing after each object."""
        # More realistic case: should handle out of order object and task
        # notifications.
        self.integration_many_tasks_helper(timesync=False)
if __name__ == "__main__":
    # A trailing "valgrind" argument is ours: remove it so that it does not
    # reach unittest's own argument parser, and turn on valgrind mode.
    if len(sys.argv) > 1 and sys.argv[-1] == "valgrind":
        sys.argv.pop()
        USE_VALGRIND = True
        print("Using valgrind for tests")
    unittest.main(verbosity=2)
+2 -5
View File
@@ -2,7 +2,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray.local_scheduler
import ray.raylet
import ray.worker
from ray import profiling
@@ -42,7 +42,4 @@ def free(object_ids, local_only=False, worker=None):
if len(object_ids) == 0:
return
if worker.use_raylet:
worker.local_scheduler_client.free(object_ids, local_only)
else:
raise Exception("Free is not supported in legacy backend.")
worker.local_scheduler_client.free(object_ids, local_only)
@@ -1,132 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
import subprocess
import sys
import time
from ray.tempfile_services import (get_local_scheduler_socket_name,
get_temp_root)
def start_local_scheduler(plasma_store_name,
                          plasma_manager_name=None,
                          worker_path=None,
                          plasma_address=None,
                          node_ip_address="127.0.0.1",
                          redis_address=None,
                          use_valgrind=False,
                          use_profiler=False,
                          stdout_file=None,
                          stderr_file=None,
                          static_resources=None,
                          num_workers=0):
    """Start a local scheduler process.

    Args:
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        plasma_manager_name (str): The name of the plasma manager to connect
            to. This does not need to be provided, but if it is, then the
            Redis address must be provided as well.
        worker_path (str): The path of the worker script to use when the
            local scheduler starts up new workers. If given, the plasma
            store name, plasma manager name and Redis address must all be
            provided.
        plasma_address (str): The address of the plasma manager to connect
            to. This is only used by the global scheduler to figure out
            which plasma managers are connected to which local schedulers.
        node_ip_address (str): The address of the node that this local
            scheduler is running on.
        redis_address (str): The address of the Redis instance to connect
            to. If this is not provided, then the local scheduler will not
            connect to Redis.
        use_valgrind (bool): True if the local scheduler should be started
            inside of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the local scheduler should be started
            inside a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        static_resources: A dictionary specifying the local scheduler's
            resource capacities. This maps resource names (strings) to
            integers or floats. If None, a single "CPU" resource equal to
            multiprocessing.cpu_count() is used.
        num_workers (int): The number of workers that the local scheduler
            should start.

    Returns:
        A tuple of the name of the local scheduler socket and the
        subprocess.Popen object for the local scheduler process.

    Raises:
        Exception: If exactly one of plasma_manager_name and redis_address
            is provided, or if both use_valgrind and use_profiler are set.
    """
    if (plasma_manager_name is None) != (redis_address is None):
        raise Exception("If one of the plasma_manager_name and the "
                        "redis_address is provided, then both must be "
                        "provided.")
    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    local_scheduler_executable = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "../core/src/local_scheduler/local_scheduler")
    local_scheduler_name = get_local_scheduler_socket_name()
    command = [
        local_scheduler_executable, "-s", local_scheduler_name, "-p",
        plasma_store_name, "-h", node_ip_address, "-n",
        str(num_workers)
    ]
    if plasma_manager_name is not None:
        command += ["-m", plasma_manager_name]
    if worker_path is not None:
        assert plasma_store_name is not None
        assert plasma_manager_name is not None
        assert redis_address is not None
        # Command line passed via "-w" for launching new workers.
        start_worker_command = ("{} {} "
                                "--node-ip-address={} "
                                "--object-store-name={} "
                                "--object-store-manager-name={} "
                                "--local-scheduler-name={} "
                                "--redis-address={} "
                                "--temp-dir={}".format(
                                    sys.executable, worker_path,
                                    node_ip_address, plasma_store_name,
                                    plasma_manager_name, local_scheduler_name,
                                    redis_address, get_temp_root()))
        command += ["-w", start_worker_command]
    if redis_address is not None:
        command += ["-r", redis_address]
    if plasma_address is not None:
        command += ["-a", plasma_address]

    if static_resources is not None:
        for resource_quantity in static_resources.values():
            assert isinstance(resource_quantity, (int, float))
        # Flattened as "name1,quantity1,name2,quantity2,...".
        resource_argument = ",".join(
            resource_name + "," + str(resource_quantity)
            for resource_name, resource_quantity in static_resources.items())
    else:
        resource_argument = "CPU,{}".format(multiprocessing.cpu_count())
    command += ["-c", resource_argument]

    # Pick the wrapper command (if any) and how long to pause after launch.
    if use_valgrind:
        wrapper = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
            "--error-exitcode=1"
        ]
        startup_delay = 1.0
    elif use_profiler:
        wrapper = ["valgrind", "--tool=callgrind"]
        startup_delay = 1.0
    else:
        wrapper = []
        startup_delay = 0.1

    pid = subprocess.Popen(
        wrapper + command, stdout=stdout_file, stderr=stderr_file)
    time.sleep(startup_delay)
    return local_scheduler_name, pid
-206
View File
@@ -1,206 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import signal
import sys
import threading
import time
import unittest
import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
import ray.ray_constants as ray_constants
import pyarrow as pa
USE_VALGRIND = False
NIL_WORKER_ID = ray_constants.ID_SIZE * b"\xff"
def random_object_id():
    """Return a local_scheduler.ObjectID built from ID_SIZE random bytes."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_driver_id():
    """Return a random driver ID, represented as a local_scheduler.ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_task_id():
    """Return a random task ID, represented as a local_scheduler.ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
def random_function_id():
    """Return a random function ID, represented as a local_scheduler.ObjectID."""
    raw_id = np.random.bytes(ray_constants.ID_SIZE)
    return local_scheduler.ObjectID(raw_id)
class TestLocalSchedulerClient(unittest.TestCase):
    """Tests that drive a local scheduler through LocalSchedulerClient.

    Each test gets its own plasma store and local scheduler process.
    """

    @staticmethod
    def _plasma_id(object_id):
        """Convert a scheduler object ID into a pyarrow plasma ObjectID."""
        return pa.plasma.ObjectID(object_id.id())

    def setUp(self):
        # Bring up the plasma store and connect a client to it.
        store_name, self.p1 = plasma.start_plasma_store()
        self.plasma_client = pa.plasma.connect(store_name, "", 0)
        # Bring up the local scheduler on top of that store.
        scheduler_name, self.p2 = local_scheduler.start_local_scheduler(
            store_name, use_valgrind=USE_VALGRIND)
        # Open the client connection used by the tests.
        self.local_scheduler_client = local_scheduler.LocalSchedulerClient(
            scheduler_name, NIL_WORKER_ID, False, random_task_id(), False)

    def tearDown(self):
        # Neither process may have exited during the test.
        self.assertIsNone(self.p1.poll())
        self.assertIsNone(self.p2.poll())

        # Shut down the plasma store.
        self.p1.kill()
        # Shut down the local scheduler.
        if not USE_VALGRIND:
            self.p2.kill()
            return
        # Under valgrind, terminate with SIGTERM and inspect the exit code.
        self.p2.send_signal(signal.SIGTERM)
        self.p2.wait()
        if self.p2.returncode != 0:
            os._exit(-1)

    def test_submit_and_get_task(self):
        """Submit tasks with many argument shapes and read them back."""
        function_id = random_function_id()
        object_ids = [random_object_id() for _ in range(256)]
        # Every object has to exist (created and sealed) in the object store
        # so that all of the tasks below can be scheduled.
        for object_id in object_ids:
            plasma_id = self._plasma_id(object_id)
            self.plasma_client.create(plasma_id, 0)
            self.plasma_client.seal(plasma_id)

        # Argument lists to use for the tasks.
        args_list = [
            [],
            [{}],
            [()],
            1 * [1],
            10 * [1],
            100 * [1],
            1000 * [1],
            1 * ["a"],
            10 * ["a"],
            100 * ["a"],
            1000 * ["a"],
            [1, 1.3, 1 << 100, "hi", u"hi", [1, 2]],
            object_ids[:1],
            object_ids[:2],
            object_ids[:3],
            object_ids[:4],
            object_ids[:5],
            object_ids[:10],
            object_ids[:100],
            object_ids[:256],
            [1, object_ids[0]],
            [object_ids[0], "a"],
            [1, object_ids[0], "a"],
            [object_ids[0], 1, object_ids[1], "a"],
            object_ids[:3] + [1, "hi", 2.3] + object_ids[:5],
            object_ids + 100 * ["a"] + object_ids,
        ]
        return_counts = [0, 1, 2, 3, 5, 10, 100]

        # Round trip: submit one task, fetch it straight back, compare.
        for args in args_list:
            for num_return_vals in return_counts:
                task = local_scheduler.Task(random_driver_id(), function_id,
                                            args, num_return_vals,
                                            random_task_id(), 0)
                self.local_scheduler_client.submit(task)
                new_task = self.local_scheduler_client.get_task()

                self.assertEqual(task.function_id().id(),
                                 new_task.function_id().id())
                retrieved_args = new_task.arguments()
                returns = new_task.returns()
                self.assertEqual(len(args), len(retrieved_args))
                self.assertEqual(num_return_vals, len(returns))
                for index, expected in enumerate(args):
                    actual = retrieved_args[index]
                    if isinstance(expected, local_scheduler.ObjectID):
                        self.assertEqual(expected.id(), actual.id())
                    else:
                        self.assertEqual(expected, actual)

        # Batch: submit every task first ...
        for args in args_list:
            for num_return_vals in return_counts:
                task = local_scheduler.Task(random_driver_id(), function_id,
                                            args, num_return_vals,
                                            random_task_id(), 0)
                self.local_scheduler_client.submit(task)
        # ... then fetch the same number of tasks back.
        for _ in range(len(args_list) * len(return_counts)):
            self.local_scheduler_client.get_task()

    def test_scheduling_when_objects_ready(self):
        """A task is handed out once its single dependency is sealed."""
        object_id = random_object_id()
        task = local_scheduler.Task(random_driver_id(), random_function_id(),
                                    [object_id], 0, random_task_id(), 0)
        self.local_scheduler_client.submit(task)

        # Fetch the task from a background thread.
        getter = threading.Thread(
            target=self.local_scheduler_client.get_task)
        getter.start()
        # Sleep to give the thread time to call get_task.
        time.sleep(0.1)

        # Creating and sealing the dependency should trigger a scheduling
        # event.
        plasma_id = self._plasma_id(object_id)
        self.plasma_client.create(plasma_id, 0)
        self.plasma_client.seal(plasma_id)
        # The thread only finishes once the task has been scheduled.
        getter.join()

    def test_scheduling_when_objects_evicted(self):
        """A task waits until both dependencies are present at once."""
        object_id1 = random_object_id()
        object_id2 = random_object_id()
        plasma_id1 = self._plasma_id(object_id1)
        plasma_id2 = self._plasma_id(object_id2)
        task = local_scheduler.Task(random_driver_id(), random_function_id(),
                                    [object_id1, object_id2], 0,
                                    random_task_id(), 0)
        self.local_scheduler_client.submit(task)

        # Fetch the task from a background thread.
        getter = threading.Thread(
            target=self.local_scheduler_client.get_task)
        getter.start()

        # Make the first dependency available, then release our buffer.
        buf = self.plasma_client.create(plasma_id1, 1)
        self.plasma_client.seal(plasma_id1)
        del buf

        # The thread must still be waiting for a task.
        time.sleep(0.1)
        self.assertTrue(getter.is_alive())

        # Force eviction of the first dependency.
        self.plasma_client.evict(plasma.DEFAULT_PLASMA_STORE_MEMORY)

        # The thread must still be waiting for a task.
        time.sleep(0.1)
        self.assertTrue(getter.is_alive())

        # The first dependency must really be gone from the store.
        object1 = self.plasma_client.get_buffers([plasma_id1], timeout_ms=0)
        self.assertEqual(object1, [None])

        # The thread must still be waiting for a task.
        time.sleep(0.1)
        self.assertTrue(getter.is_alive())

        # Create the second dependency.
        self.plasma_client.create(plasma_id2, 1)
        self.plasma_client.seal(plasma_id2)

        # The thread must still be waiting for a task.
        time.sleep(0.1)
        self.assertTrue(getter.is_alive())

        # Re-create the first dependency; both are now available.
        self.plasma_client.create(plasma_id1, 1)
        self.plasma_client.seal(plasma_id1)

        # The thread only finishes once the task has been scheduled.
        getter.join()
if __name__ == "__main__":
    # A trailing "valgrind" argument is consumed here so that unittest's own
    # argument parser never sees it.
    if len(sys.argv) > 1 and sys.argv[-1] == "valgrind":
        arg = sys.argv.pop()
        USE_VALGRIND = True
        print("Using valgrind for tests")

    unittest.main(verbosity=2)
+11 -446
View File
@@ -3,11 +3,9 @@ from __future__ import division
from __future__ import print_function
import argparse
import binascii
import logging
import os
import time
from collections import Counter, defaultdict
import traceback
import redis
@@ -20,27 +18,6 @@ import ray.utils
import ray.ray_constants as ray_constants
from ray.services import get_ip_address, get_port
from ray.utils import binary_to_hex, binary_to_object_id, hex_to_binary
from ray.worker import NIL_ACTOR_ID
# These variables must be kept in sync with the C codebase.
# common/common.h
NIL_ID = b"\xff" * ray_constants.ID_SIZE
# common/task.h
TASK_STATUS_LOST = 32
# common/redis_module/ray_redis_module.cc
OBJECT_INFO_PREFIX = b"OI:"
OBJECT_LOCATION_PREFIX = b"OL:"
TASK_TABLE_PREFIX = b"TT:"
DB_CLIENT_PREFIX = b"CL:"
DB_CLIENT_TABLE_NAME = b"db_clients"
# local_scheduler/local_scheduler.h
LOCAL_SCHEDULER_CLIENT_TYPE = b"local_scheduler"
# plasma/plasma_manager.cc
PLASMA_MANAGER_CLIENT_TYPE = b"plasma_manager"
# Set up logging.
logger = logging.getLogger(__name__)
@@ -55,19 +32,8 @@ class Monitor(object):
Attributes:
redis: A connection to the Redis server.
use_raylet: A bool indicating whether to use the raylet code path or
not.
subscribe_client: A pubsub client for the Redis server. This is used to
receive notifications about failed components.
dead_local_schedulers: A set of the local scheduler IDs of all of the
local schedulers that were up at one point and have died since
then.
live_plasma_managers: A counter mapping live plasma manager IDs to the
number of heartbeats that have passed since we last heard from that
plasma manager. A plasma manager is live if we received a heartbeat
from it at any point, and if it has not timed out.
dead_plasma_managers: A set of the plasma manager IDs of all the plasma
managers that were up at one point and have died since then.
"""
def __init__(self,
@@ -79,26 +45,16 @@ class Monitor(object):
self.state = ray.experimental.state.GlobalState()
self.state._initialize_global_state(
redis_address, redis_port, redis_password=redis_password)
self.use_raylet = self.state.use_raylet
self.redis = redis.StrictRedis(
host=redis_address, port=redis_port, db=0, password=redis_password)
# Setup subscriptions to the primary Redis server and the Redis shards.
self.primary_subscribe_client = self.redis.pubsub(
ignore_subscribe_messages=True)
if self.use_raylet:
self.shard_subscribe_clients = []
for redis_client in self.state.redis_clients:
subscribe_client = redis_client.pubsub(
ignore_subscribe_messages=True)
self.shard_subscribe_clients.append(subscribe_client)
else:
# We don't need to subscribe to the shards in legacy Ray.
self.shard_subscribe_clients = []
# Initialize data structures to keep track of the active database
# clients.
self.dead_local_schedulers = set()
self.live_plasma_managers = Counter()
self.dead_plasma_managers = set()
self.shard_subscribe_clients = []
for redis_client in self.state.redis_clients:
subscribe_client = redis_client.pubsub(
ignore_subscribe_messages=True)
self.shard_subscribe_clients.append(subscribe_client)
# Keep a mapping from local scheduler client ID to IP address to use
# for updating the load metrics.
self.local_scheduler_id_to_ip_map = {}
@@ -152,170 +108,6 @@ class Monitor(object):
for subscribe_client in self.shard_subscribe_clients:
subscribe_client.subscribe(channel)
def cleanup_task_table(self):
    """Clean up global state for failed local schedulers.

    This marks any tasks that were scheduled on dead local schedulers as
    TASK_STATUS_LOST. A local scheduler is deemed dead if it is in
    self.dead_local_schedulers. For actor tasks, the location entries of
    the task's last return object (the dummy object) are removed from the
    object table first.
    """
    tasks = self.state.task_table()
    num_tasks_updated = 0
    for task_id, task in tasks.items():
        # Only tasks whose local scheduler has died need any work.
        if task["LocalSchedulerID"] not in self.dead_local_schedulers:
            continue

        # Remove dummy objects returned by actor tasks from any plasma
        # manager. Although the objects may still exist in that object
        # store, this deletion makes them effectively unreachable by any
        # local scheduler connected to a different store.
        # TODO(swang): Actually remove the objects from the object store,
        # so that the reconstructed actor can reuse the same object store.
        if hex_to_binary(task["TaskSpec"]["ActorID"]) != NIL_ACTOR_ID:
            dummy_object_id = task["TaskSpec"]["ReturnObjectIDs"][-1]
            obj = self.state.object_table(dummy_object_id)
            manager_ids = obj["ManagerIDs"]
            if manager_ids is not None:
                # The dummy object should exist on at most one plasma
                # manager, the manager associated with the local scheduler
                # that died.
                assert len(manager_ids) <= 1
                # Remove the dummy object from the plasma manager
                # associated with the dead local scheduler, if any.
                for manager in manager_ids:
                    ok = self.state._execute_command(
                        dummy_object_id, "RAY.OBJECT_TABLE_REMOVE",
                        dummy_object_id.id(), hex_to_binary(manager))
                    if ok != b"OK":
                        logger.warning(
                            "Failed to remove object location for "
                            "dead plasma manager.")

        # The task was scheduled on a dead local scheduler, so mark it as
        # lost.
        key = binary_to_object_id(hex_to_binary(task_id))
        ok = self.state._execute_command(
            key, "RAY.TASK_TABLE_UPDATE", hex_to_binary(task_id),
            ray.experimental.state.TASK_STATUS_LOST, NIL_ID,
            task["ExecutionDependenciesString"], task["SpillbackCount"])
        if ok != b"OK":
            logger.warning("Failed to update lost task for dead scheduler.")
        # Counted even when the update command did not return OK.
        num_tasks_updated += 1

    if num_tasks_updated > 0:
        logger.warning("Marked {} tasks as lost.".format(num_tasks_updated))
def cleanup_object_table(self):
    """Clean up global state for failed plasma managers.

    This removes dead plasma managers from any location entries in the
    object table. A plasma manager is deemed dead if it is in
    self.dead_plasma_managers.
    """
    # TODO(swang): Also kill the associated plasma store, since it's no
    # longer reachable without a plasma manager.
    objects = self.state.object_table()
    num_objects_removed = 0
    for object_id, obj in objects.items():
        manager_ids = obj["ManagerIDs"]
        if manager_ids is None:
            continue
        for manager in manager_ids:
            if manager not in self.dead_plasma_managers:
                continue
            # The object was on a dead plasma manager, so remove that
            # location entry.
            ok = self.state._execute_command(
                object_id, "RAY.OBJECT_TABLE_REMOVE", object_id.id(),
                hex_to_binary(manager))
            if ok != b"OK":
                logger.warning("Failed to remove object location for "
                               "dead plasma manager.")
            # Counted even when the remove command did not return OK.
            num_objects_removed += 1

    if num_objects_removed > 0:
        logger.warning(
            "Marked {} objects as lost.".format(num_objects_removed))
def scan_db_client_table(self):
    """Record every client already marked deleted in the client table.

    After subscribing to the client table, this must be called before any
    message is read from the subscription channel, so that deletions that
    happened before we subscribed are not missed. Deleted local schedulers
    are added to self.dead_local_schedulers and deleted plasma managers to
    self.dead_plasma_managers.
    """
    # Exit if we are using the raylet code path because client_table is
    # implemented differently. TODO(rkn): Fix this.
    if self.use_raylet:
        return

    for node_clients in self.state.client_table().values():
        for client in node_clients:
            db_client_id = client["DBClientID"]
            client_type = client["ClientType"]
            if not client["Deleted"]:
                continue
            if client_type == LOCAL_SCHEDULER_CLIENT_TYPE:
                self.dead_local_schedulers.add(db_client_id)
            elif client_type == PLASMA_MANAGER_CLIENT_TYPE:
                self.dead_plasma_managers.add(db_client_id)
def db_client_notification_handler(self, unused_channel, data):
    """Handle a notification from the db_client table from Redis.

    Notifications are parsed using the SubscribeToDBClientTableReply
    flatbuffer. Deletions are recorded in self.dead_local_schedulers or
    self.dead_plasma_managers; insertions are ignored. Cleanup of the
    associated state in the state tables should be handled by the caller.

    Args:
        unused_channel: The channel the message arrived on (ignored).
        data: The serialized SubscribeToDBClientTableReply message.
    """
    notification_object = (ray.gcs_utils.SubscribeToDBClientTableReply.
                           GetRootAsSubscribeToDBClientTableReply(data, 0))
    db_client_id = binary_to_hex(notification_object.DbClientId())
    client_type = notification_object.ClientType()
    is_insertion = notification_object.IsInsertion()

    # Insertions need no accounting.
    if is_insertion:
        return

    # The update was a deletion: record the client as dead.
    logger.warning("Removed {}, client ID {}".format(client_type,
                                                     db_client_id))
    if client_type == LOCAL_SCHEDULER_CLIENT_TYPE:
        # set.add is a no-op when the ID is already present.
        self.dead_local_schedulers.add(db_client_id)
    elif client_type == PLASMA_MANAGER_CLIENT_TYPE:
        self.dead_plasma_managers.add(db_client_id)
        # Stop tracking this plasma manager's heartbeats, since it's
        # already dead.
        # NOTE(review): db_client_id is hex-encoded here, while
        # plasma_manager_heartbeat_handler keys live_plasma_managers by the
        # raw bytes of the message prefix -- confirm this delete can ever
        # match an existing entry.
        del self.live_plasma_managers[db_client_id]
def local_scheduler_info_handler(self, unused_channel, data):
    """Handle a local scheduler heartbeat from Redis.

    Decodes the LocalSchedulerInfoMessage, collects its static and dynamic
    resources into dictionaries and forwards them to self.load_metrics for
    the IP address registered for the sending client. If no IP address is
    known for the client, a warning is logged instead.

    Args:
        unused_channel: The channel the message arrived on (ignored).
        data: The serialized LocalSchedulerInfoMessage.
    """
    message = (ray.gcs_utils.LocalSchedulerInfoMessage.
               GetRootAsLocalSchedulerInfoMessage(data, 0))

    static_resources = {}
    dynamic_resources = {}
    # The dynamic resource count is used to index both resource lists.
    for index in range(message.DynamicResourcesLength()):
        dynamic_entry = message.DynamicResources(index)
        static_entry = message.StaticResources(index)
        dynamic_resources[dynamic_entry.Key().decode(
            "utf-8")] = dynamic_entry.Value()
        static_resources[static_entry.Key().decode(
            "utf-8")] = static_entry.Value()

    # Update the load metrics for this local scheduler.
    client_id = binascii.hexlify(message.DbClientId()).decode("utf-8")
    ip = self.local_scheduler_id_to_ip_map.get(client_id)
    if not ip:
        logger.warning(
            "Warning: could not find ip for client {} in {}.".format(
                client_id, self.local_scheduler_id_to_ip_map))
        return
    self.load_metrics.update(ip, static_resources, dynamic_resources)
def xray_heartbeat_handler(self, unused_channel, data):
"""Handle an xray heartbeat message from Redis."""
@@ -342,160 +134,6 @@ class Monitor(object):
print("Warning: could not find ip for client {} in {}.".format(
client_id, self.local_scheduler_id_to_ip_map))
def plasma_manager_heartbeat_handler(self, unused_channel, data):
    """Handle a plasma manager heartbeat from Redis.

    Resets to zero the count of heartbeats missed from the plasma manager
    that sent this message.

    Args:
        unused_channel: The channel the message arrived on (ignored).
        data: The heartbeat payload; its first ray_constants.ID_SIZE
            characters are the client ID.
    """
    id_length = ray_constants.ID_SIZE
    sender_id = data[:id_length]
    self.live_plasma_managers[sender_id] = 0
def _entries_for_driver_in_shard(self, driver_id, redis_shard_index):
    """Collect IDs of control-state entries for a driver from a shard.

    Args:
        driver_id: The ID of the driver.
        redis_shard_index: The index of the Redis shard to query.

    Returns:
        Lists of IDs: (returned_object_ids, task_ids, put_objects). The
            first two are relevant to the driver and are safe to delete.
            The last contains all "put" objects in this redis shard; each
            element is an (object_id, corresponding task_id) pair.
    """
    # TODO(zongheng): consider adding save & restore functionalities.
    # Named shard_client so the local does not shadow the redis module.
    shard_client = self.state.redis_clients[redis_shard_index]
    task_table_infos = {}  # task id -> TaskInfo messages

    # Scan the task table & filter to get the list of tasks belong to this
    # driver. Use a cursor in order not to block the redis shards.
    for key in shard_client.scan_iter(match=TASK_TABLE_PREFIX + b"*"):
        entry = shard_client.hgetall(key)
        task_info = ray.gcs_utils.TaskInfo.GetRootAsTaskInfo(
            entry[b"TaskSpec"], 0)
        if driver_id != task_info.DriverId():
            # Ignore tasks that aren't from this driver.
            continue
        task_table_infos[task_info.TaskId()] = task_info

    # Get the list of objects returned by these tasks. Note these might
    # not belong to this redis shard.
    returned_object_ids = []
    for task_info in task_table_infos.values():
        returned_object_ids.extend(
            task_info.Returns(i) for i in range(task_info.ReturnsLength()))

    # Also record all the ray.put()'d objects.
    put_objects = []
    prefix_length = len(OBJECT_INFO_PREFIX)
    for key in shard_client.scan_iter(match=OBJECT_INFO_PREFIX + b"*"):
        entry = shard_client.hgetall(key)
        # Hash values are looked up with bytes keys, so compare against a
        # bytes literal; comparing with the str "0" never matches on
        # Python 3.
        if entry[b"is_put"] == b"0":
            continue
        # Strip the prefix by length: splitting on the prefix would
        # truncate an ID that happens to contain the prefix bytes.
        object_id = key[prefix_length:]
        task_id = entry[b"task"]
        put_objects.append((object_id, task_id))

    return returned_object_ids, task_table_infos.keys(), put_objects
def _clean_up_entries_from_shard(self, object_ids, task_ids, shard_index):
    """Delete a driver's task and object entries from one redis shard.

    Args:
        object_ids: Binary object IDs whose object-location (OL) and
            object-info (OI) entries should be removed if they are
            non-empty in this shard.
        task_ids: Binary task IDs whose task-table entries should be
            removed.
        shard_index: The index of the Redis shard to clean up.
    """
    # Named shard_client so the local does not shadow the redis module.
    shard_client = self.state.redis_clients[shard_index]

    # Clean up (in the future, save) entries for non-empty objects.
    object_ids_locs = set()
    object_ids_infos = set()
    for object_id in object_ids:
        # OL.
        obj_loc = shard_client.zrange(OBJECT_LOCATION_PREFIX + object_id, 0,
                                      -1)
        if obj_loc:
            object_ids_locs.add(object_id)
        # OI.
        obj_info = shard_client.hgetall(OBJECT_INFO_PREFIX + object_id)
        if obj_info:
            object_ids_infos.add(object_id)

    # Form the redis keys to delete.
    keys = [TASK_TABLE_PREFIX + k for k in task_ids]
    keys.extend([OBJECT_LOCATION_PREFIX + k for k in object_ids_locs])
    keys.extend([OBJECT_INFO_PREFIX + k for k in object_ids_infos])

    if not keys:
        return
    # Remove with best effort.
    num_deleted = shard_client.delete(*keys)
    logger.info(
        "Removed {} dead redis entries of the driver from redis shard {}.".
        format(num_deleted, shard_index))
    if num_deleted != len(keys):
        # Both placeholders need a value; omitting shard_index made this
        # format call raise IndexError instead of logging.
        logger.warning(
            "Failed to remove {} relevant redis entries"
            " from redis shard {}.".format(
                len(keys) - num_deleted, shard_index))
def _clean_up_entries_for_driver(self, driver_id):
    """Remove this driver's object/task entries from all redis shards.

    Specifically, removes control-state entries of:
        * all objects (OI and OL entries) created by `ray.put()` from the
          driver
        * all tasks belonging to the driver.
    """
    # TODO(zongheng): handle function_table, client_table, log_files --
    # these are in the metadata redis server, not in the shards.
    num_shards = len(self.state.redis_clients)

    # Gather the relevant IDs from every shard.
    # TODO(zongheng): consider parallelizing this loop.
    driver_object_ids = []
    driver_task_ids = []
    all_put_objects = []
    for shard in range(num_shards):
        shard_entries = self._entries_for_driver_in_shard(driver_id, shard)
        returned_object_ids, task_ids, put_objects = shard_entries
        driver_object_ids.extend(returned_object_ids)
        driver_task_ids.extend(task_ids)
        all_put_objects.extend(put_objects)

    # Of the put objects, keep only those created by this driver's tasks.
    driver_task_ids_set = set(driver_task_ids)
    driver_object_ids.extend(
        object_id for object_id, task_id in all_put_objects
        if task_id in driver_task_ids_set)

    def shard_of(binary_id):
        # Map a binary ID to the index of the shard that holds it.
        hashed = binary_to_object_id(binary_id).redis_shard_hash()
        return hashed % num_shards

    # Partition the IDs by destination shard.
    object_ids_per_shard = defaultdict(list)
    task_ids_per_shard = defaultdict(list)
    for object_id in driver_object_ids:
        object_ids_per_shard[shard_of(object_id)].append(object_id)
    for task_id in driver_task_ids:
        task_ids_per_shard[shard_of(task_id)].append(task_id)

    # Delete each shard's share of the entries.
    # TODO(zongheng): consider parallelizing this loop.
    for shard in range(num_shards):
        self._clean_up_entries_from_shard(object_ids_per_shard[shard],
                                          task_ids_per_shard[shard], shard)
def driver_removed_handler(self, unused_channel, data):
    """Handle a notification that a driver has been removed.

    Decodes the DriverTableMessage, logs the removed driver's ID and
    removes that driver's task and object entries from the redis shards.

    Args:
        unused_channel: The channel the message arrived on (ignored).
        data: The serialized DriverTableMessage.
    """
    table_message_cls = ray.gcs_utils.DriverTableMessage
    message = table_message_cls.GetRootAsDriverTableMessage(data, 0)
    driver_id = message.DriverId()
    driver_id_hex = binary_to_hex(driver_id)
    logger.info("Driver {} has been removed.".format(driver_id_hex))
    self._clean_up_entries_for_driver(driver_id)
def _xray_clean_up_entries_for_driver(self, driver_id):
"""Remove this driver's object/task entries from redis.
@@ -529,7 +167,7 @@ class Monitor(object):
driver_object_id_bins = set()
for object_id, object_table_object in object_table_objects.items():
assert len(object_table_object) > 0
task_id_bin = ray.local_scheduler.compute_task_id(object_id).id()
task_id_bin = ray.raylet.compute_task_id(object_id).id()
if task_id_bin in driver_task_id_bins:
driver_object_id_bins.add(object_id.id())
@@ -602,20 +240,7 @@ class Monitor(object):
# Determine the appropriate message handler.
message_handler = None
if channel == ray.gcs_utils.PLASMA_MANAGER_HEARTBEAT_CHANNEL:
# The message was a heartbeat from a plasma manager.
message_handler = self.plasma_manager_heartbeat_handler
elif channel == ray.gcs_utils.LOCAL_SCHEDULER_INFO_CHANNEL:
# The message was a heartbeat from a local scheduler
message_handler = self.local_scheduler_info_handler
elif channel == DB_CLIENT_TABLE_NAME:
# The message was a notification from the db_client table.
message_handler = self.db_client_notification_handler
elif channel == ray.gcs_utils.DRIVER_DEATH_CHANNEL:
# The message was a notification that a driver was removed.
logger.info("message-handler: driver_removed_handler")
message_handler = self.driver_removed_handler
elif channel == ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL:
if channel == ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL:
# Similar functionality as local scheduler info channel
message_handler = self.xray_heartbeat_handler
elif channel == ray.gcs_utils.XRAY_DRIVER_CHANNEL:
@@ -629,10 +254,7 @@ class Monitor(object):
message_handler(channel, data)
def update_local_scheduler_map(self):
if self.use_raylet:
local_schedulers = self.state.client_table()
else:
local_schedulers = self.state.local_schedulers()
local_schedulers = self.state.client_table()
self.local_scheduler_id_to_ip_map = {}
for local_scheduler_info in local_schedulers:
client_id = local_scheduler_info.get("DBClientID") or \
@@ -680,33 +302,11 @@ class Monitor(object):
clients and cleaning up state accordingly.
"""
# Initialize the subscription channel.
self.subscribe(DB_CLIENT_TABLE_NAME)
self.subscribe(ray.gcs_utils.LOCAL_SCHEDULER_INFO_CHANNEL)
self.subscribe(ray.gcs_utils.PLASMA_MANAGER_HEARTBEAT_CHANNEL)
self.subscribe(ray.gcs_utils.DRIVER_DEATH_CHANNEL)
self.subscribe(ray.gcs_utils.XRAY_HEARTBEAT_CHANNEL, primary=False)
self.subscribe(ray.gcs_utils.XRAY_DRIVER_CHANNEL)
# Scan the database table for dead database clients. NOTE: This must be
# called before reading any messages from the subscription channel.
# This ensures that we start in a consistent state, since we may have
# missed notifications that were sent before we connected to the
# subscription channel.
self.scan_db_client_table()
# If there were any dead clients at startup, clean up the associated
# state in the state tables.
if len(self.dead_local_schedulers) > 0:
self.cleanup_task_table()
if len(self.dead_plasma_managers) > 0:
self.cleanup_object_table()
num_plasma_managers = len(self.live_plasma_managers) + len(
self.dead_plasma_managers)
logger.debug("{} dead local schedulers, {} plasma managers total, {} "
"dead plasma managers".format(
len(self.dead_local_schedulers), num_plasma_managers,
len(self.dead_plasma_managers)))
# TODO(rkn): If there were any dead clients at startup, we should clean
# up the associated state in the state tables.
# Handle messages from the subscription channels.
while True:
@@ -720,43 +320,9 @@ class Monitor(object):
self._maybe_flush_gcs()
# Record how many dead local schedulers and plasma managers we had
# at the beginning of this round.
num_dead_local_schedulers = len(self.dead_local_schedulers)
num_dead_plasma_managers = len(self.dead_plasma_managers)
# Process a round of messages.
self.process_messages()
# If any new local schedulers or plasma managers were marked as
# dead in this round, clean up the associated state.
if len(self.dead_local_schedulers) > num_dead_local_schedulers:
self.cleanup_task_table()
if len(self.dead_plasma_managers) > num_dead_plasma_managers:
self.cleanup_object_table()
# Handle plasma managers that timed out during this round.
plasma_manager_ids = list(self.live_plasma_managers.keys())
for plasma_manager_id in plasma_manager_ids:
if ((self.live_plasma_managers[plasma_manager_id]) >=
ray._config.num_heartbeats_timeout()):
logger.warn("Timed out {}"
.format(PLASMA_MANAGER_CLIENT_TYPE))
# Remove the plasma manager from the managers whose
# heartbeats we're tracking.
del self.live_plasma_managers[plasma_manager_id]
# Remove the plasma manager from the db_client table. The
# corresponding state in the object table will be cleaned
# up once we receive the notification for this db_client
# deletion.
self.redis.execute_command("RAY.DISCONNECT",
plasma_manager_id)
# Increment the number of heartbeats that we've missed from each
# plasma manager.
for plasma_manager_id in self.live_plasma_managers:
self.live_plasma_managers[plasma_manager_id] += 1
# Wait for a heartbeat interval before processing the next round of
# messages.
time.sleep(ray._config.heartbeat_timeout_milliseconds() * 1e-3)
@@ -827,6 +393,5 @@ if __name__ == "__main__":
message = "The monitor failed with the following error:\n{}".format(
traceback_str)
ray.utils.push_error_to_driver_through_redis(
redis_client, monitor.use_raylet, ray_constants.MONITOR_DIED_ERROR,
message)
redis_client, ray_constants.MONITOR_DIED_ERROR, message)
raise e
+2 -5
View File
@@ -2,9 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.plasma.plasma import (start_plasma_store, start_plasma_manager,
DEFAULT_PLASMA_STORE_MEMORY)
from ray.plasma.plasma import start_plasma_store, DEFAULT_PLASMA_STORE_MEMORY
__all__ = [
"start_plasma_store", "start_plasma_manager", "DEFAULT_PLASMA_STORE_MEMORY"
]
__all__ = ["start_plasma_store", "DEFAULT_PLASMA_STORE_MEMORY"]
+2 -101
View File
@@ -3,17 +3,13 @@ from __future__ import division
from __future__ import print_function
import os
import random
import subprocess
import sys
import time
from ray.tempfile_services import (get_object_store_socket_name,
get_plasma_manager_socket_name)
from ray.tempfile_services import get_object_store_socket_name
__all__ = [
"start_plasma_store", "start_plasma_manager", "DEFAULT_PLASMA_STORE_MEMORY"
]
__all__ = ["start_plasma_store", "DEFAULT_PLASMA_STORE_MEMORY"]
PLASMA_WAIT_TIMEOUT = 2**30
@@ -97,98 +93,3 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY,
pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
time.sleep(0.1)
return plasma_store_name, pid
def new_port():
    """Return a random port number between 10000 and 65535, inclusive.

    The port is not checked for availability; the caller has to cope with
    the case where it is already in use.
    """
    lowest, highest = 10000, 65535
    return random.randint(lowest, highest)
def start_plasma_manager(store_name,
                         redis_address,
                         node_ip_address="127.0.0.1",
                         plasma_manager_port=None,
                         num_retries=20,
                         use_valgrind=False,
                         run_profiler=False,
                         stdout_file=None,
                         stderr_file=None):
    """Start a plasma manager, retrying on a new random port on failure.

    Args:
        store_name (str): The name of the plasma store socket.
        redis_address (str): The address of the Redis server.
        node_ip_address (str): The IP address of the node.
        plasma_manager_port (int): The port to use for the plasma manager. If
            this is not provided, a port will be generated at random.
        num_retries (int): The maximum number of start attempts. This must
            be 1 when plasma_manager_port is given.
        use_valgrind (bool): True if the plasma manager should be started
            inside of valgrind and False otherwise. This takes precedence
            over run_profiler when both are set.
        run_profiler (bool): True if the plasma manager should be started
            inside of valgrind's callgrind tool.
        stdout_file: A file handle opened for writing to redirect stdout to. If
            no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to. If
            no redirection should happen, then this should be None.

    Returns:
        A tuple of the plasma manager socket name, the subprocess.Popen
            object of the plasma manager process, and the port that the
            manager was started on.

    Raises:
        Exception: An exception is raised if a port is specified together
            with num_retries != 1, or if the manager could not be started.
    """
    plasma_manager_executable = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "../core/src/plasma/plasma_manager")
    plasma_manager_name = get_plasma_manager_socket_name()

    if plasma_manager_port is None:
        plasma_manager_port = new_port()
    elif num_retries != 1:
        raise Exception("num_retries must be 1 if port is specified.")

    # Decide once how the manager gets wrapped and how long to give it to
    # fail. The sleep is critical: if the plasma manager fails to start
    # because the port is already in use, it has to fail within that window.
    if use_valgrind:
        wrapper = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--error-exitcode=1"
        ]
        startup_delay = 1
    elif run_profiler:
        wrapper = ["valgrind", "--tool=callgrind"]
        startup_delay = 0.1
    else:
        wrapper = []
        startup_delay = 0.1

    for attempt in range(num_retries):
        if attempt > 0:
            print("Plasma manager failed to start, retrying now.")
        manager_args = [
            plasma_manager_executable,
            "-s",
            store_name,
            "-m",
            plasma_manager_name,
            "-h",
            node_ip_address,
            "-p",
            str(plasma_manager_port),
            "-r",
            redis_address,
        ]
        process = subprocess.Popen(
            wrapper + manager_args, stdout=stdout_file, stderr=stderr_file)
        time.sleep(startup_delay)
        # A process that is still running is treated as started.
        if process.poll() is None:
            return plasma_manager_name, process, plasma_manager_port
        # Generate a new port and try again.
        plasma_manager_port = new_port()

    raise Exception("Couldn't start plasma manager.")
-560
View File
@@ -1,560 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from numpy.testing import assert_equal
import os
import random
import signal
import subprocess
import sys
import threading
import time
import unittest
# The ray import must come before the pyarrow import because ray modifies the
# python path so that the right version of pyarrow is found.
import ray
from ray.plasma.utils import (random_object_id, create_object_with_id,
create_object)
import ray.ray_constants as ray_constants
from ray import services
import pyarrow as pa
import pyarrow.plasma as plasma
USE_VALGRIND = False
PLASMA_STORE_MEMORY = 1000000000
def random_name():
    """Return the decimal string of a random integer in [0, 99999999]."""
    number = random.randint(0, 99999999)
    return "{}".format(number)
def assert_get_object_equal(unit_test,
                            client1,
                            client2,
                            object_id,
                            memory_buffer=None,
                            metadata=None):
    """Assert that two clients return the same data and metadata for an ID.

    Args:
        unit_test: Object providing assertEqual, used for the length checks.
        client1: First client; must provide get_buffers and get_metadata.
        client2: Second client; must provide get_buffers and get_metadata.
        object_id: The object ID to look up on both clients.
        memory_buffer: Optional reference data buffer that the data seen by
            client1 must match.
        metadata: Optional reference metadata buffer that the metadata seen
            by client1 must match.

    Raises:
        AssertionError: Raised by numpy's assert_equal if any of the byte
            comparisons fails.
    """

    def as_uint8(buf):
        # View a buffer as an array of bytes so numpy can compare it.
        return np.frombuffer(buf, dtype="uint8")

    data1 = client1.get_buffers([object_id])[0]
    data2 = client2.get_buffers([object_id])[0]
    meta1 = client1.get_metadata([object_id])[0]
    meta2 = client2.get_metadata([object_id])[0]

    unit_test.assertEqual(len(data1), len(data2))
    unit_test.assertEqual(len(meta1), len(meta2))

    # The data and the metadata must agree between the two clients.
    assert_equal(as_uint8(data1), as_uint8(data2))
    assert_equal(as_uint8(meta1), as_uint8(meta2))

    # Compare against the reference buffers where they were supplied.
    if memory_buffer is not None:
        assert_equal(as_uint8(memory_buffer), as_uint8(data1))
    if metadata is not None:
        assert_equal(as_uint8(metadata), as_uint8(meta1))
DEFAULT_PLASMA_STORE_MEMORY = 10**9


def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY,
                       use_valgrind=False,
                       use_profiler=False,
                       stdout_file=None,
                       stderr_file=None):
    """Launch a plasma store process on a randomly named /tmp socket.

    Args:
        plasma_store_memory: Value passed to the store's "-m" option
            (presumably the store size in bytes; confirm against the
            plasma_store_server documentation).
        use_valgrind (bool): True if the plasma store should be started inside
            of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the plasma store should be started inside
            a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to. If
            no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to. If
            no redirection should happen, then this should be None.

    Return:
        A tuple of the name of the plasma store socket and the
            subprocess.Popen object of the plasma store process.

    Raises:
        Exception: If both use_valgrind and use_profiler are set.
    """
    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    plasma_store_executable = os.path.join(pa.__path__[0],
                                           "plasma_store_server")
    plasma_store_name = "/tmp/plasma_store{}".format(random_name())
    store_command = [
        plasma_store_executable, "-s", plasma_store_name, "-m",
        str(plasma_store_memory)
    ]

    # Pick the wrapper command and how long to wait for the store to come up.
    if use_valgrind:
        wrapper = [
            "valgrind", "--track-origins=yes", "--leak-check=full",
            "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
            "--error-exitcode=1"
        ]
        startup_delay = 1.0
    elif use_profiler:
        wrapper = ["valgrind", "--tool=callgrind"]
        startup_delay = 1.0
    else:
        wrapper = []
        startup_delay = 0.1

    process = subprocess.Popen(
        wrapper + store_command, stdout=stdout_file, stderr=stderr_file)
    time.sleep(startup_delay)
    return plasma_store_name, process
# Plasma client tests were moved into arrow
class TestPlasmaManager(unittest.TestCase):
    """Integration tests that run against two plasma store/manager pairs.

    setUp starts two plasma stores, one Redis server and two plasma
    managers, and connects one client to each store/manager pair.
    """

    def setUp(self):
        """Start the stores, Redis and the managers, then connect clients."""
        # Start two PlasmaStores.
        store_name1, self.p2 = start_plasma_store(use_valgrind=USE_VALGRIND)
        store_name2, self.p3 = start_plasma_store(use_valgrind=USE_VALGRIND)
        # Start a Redis server.
        redis_address, _ = services.start_redis("127.0.0.1", use_raylet=False)
        # Start two PlasmaManagers.
        manager_name1, self.p4, self.port1 = ray.plasma.start_plasma_manager(
            store_name1, redis_address, use_valgrind=USE_VALGRIND)
        manager_name2, self.p5, self.port2 = ray.plasma.start_plasma_manager(
            store_name2, redis_address, use_valgrind=USE_VALGRIND)
        # Connect two PlasmaClients.
        self.client1 = plasma.connect(store_name1, manager_name1, 64)
        self.client2 = plasma.connect(store_name2, manager_name2, 64)

        # Store the processes that will be explicitly killed during tearDown so
        # that a test case can remove ones that will be killed during the test.
        # NOTE: If this specific order is changed, valgrind will fail.
        self.processes_to_kill = [self.p4, self.p5, self.p2, self.p3]

    def tearDown(self):
        """Stop all started processes and Redis, then check none had died.

        The liveness assertion is deliberately made last: asserting first
        would skip the cleanup below whenever a process died during the
        test, leaking the remaining processes and the Redis server.
        """
        live_processes = []
        dead_processes = []
        for process in self.processes_to_kill:
            if process.poll() is None:
                live_processes.append(process)
            else:
                dead_processes.append(process)

        try:
            # Kill the Plasma store and Plasma manager processes.
            if USE_VALGRIND:
                # Give processes opportunity to finish work.
                time.sleep(1)
                for process in live_processes:
                    process.send_signal(signal.SIGTERM)
                    process.wait()
                    if process.returncode != 0:
                        print("aborting due to valgrind error")
                        os._exit(-1)
            else:
                for process in live_processes:
                    process.kill()
        finally:
            # Clean up the Redis server.
            services.cleanup()

        # Check that the processes were still alive at the end of the test.
        self.assertEqual(dead_processes, [])

    def _fetch_until_contains(self, client, fetch_ids, expected_ids):
        """Call fetch on fetch_ids until client holds all of expected_ids."""
        # TODO(rkn): Right now we must wait for the object table to be updated.
        while not all(client.contains(oid) for oid in expected_ids):
            client.fetch(fetch_ids)

    def _transfer_until_received(self, sender, receiver, port, object_id,
                                 num_attempts):
        """Transfer object_id to the manager on port until receiver has it.

        There is a race condition on the create and transfer of the object,
        so keep trying until the object appears on the receiving store.
        """
        buff = None
        for _ in range(num_attempts):
            sender.transfer("127.0.0.1", port, object_id)
            buff = receiver.get_buffers([object_id], timeout_ms=100)[0]
            if buff is not None:
                break
        self.assertIsNotNone(buff)
        del buff

    def _check_incremental_wait(self, client, object_ids, n):
        """Wait for 1, 2, ..., n objects in turn, then for all at once."""
        waiting = object_ids
        retrieved = []
        for i in range(1, n + 1):
            ready, waiting = client.wait(waiting, timeout=1000, num_returns=i)
            self.assertEqual(len(ready), i)
            retrieved += ready
        self.assertEqual(set(retrieved), set(object_ids))
        ready, waiting = client.wait(
            object_ids, timeout=1000, num_returns=len(object_ids))
        self.assertEqual(set(ready), set(object_ids))
        self.assertEqual(waiting, [])

    def test_fetch(self):
        for _ in range(10):
            # Create an object.
            object_id1, memory_buffer1, metadata1 = create_object(
                self.client1, 2000, 2000)
            self.client1.fetch([object_id1])
            self.assertTrue(self.client1.contains(object_id1))
            self.assertFalse(self.client2.contains(object_id1))
            # Fetch the object from the other plasma manager.
            self._fetch_until_contains(self.client2, [object_id1],
                                       [object_id1])
            # Compare the two buffers.
            assert_get_object_equal(
                self,
                self.client1,
                self.client2,
                object_id1,
                memory_buffer=memory_buffer1,
                metadata=metadata1)

        # Test that we can call fetch on object IDs that don't exist yet.
        object_id2 = random_object_id()
        self.client1.fetch([object_id2])
        self.assertFalse(self.client1.contains(object_id2))
        create_object_with_id(self.client2, object_id2, 2000, 2000)
        # NOTE: Checking that client1 eventually receives object_id2 (and
        # comparing its buffers with client2) is currently disabled.

        # Test calling the same fetch request a bunch of times.
        object_id3 = random_object_id()
        self.assertFalse(self.client1.contains(object_id3))
        self.assertFalse(self.client2.contains(object_id3))
        for _ in range(10):
            self.client1.fetch([object_id3])
            self.client2.fetch([object_id3])
        memory_buffer3, metadata3 = create_object_with_id(
            self.client1, object_id3, 2000, 2000)
        for _ in range(10):
            self.client1.fetch([object_id3])
            self.client2.fetch([object_id3])
        self._fetch_until_contains(self.client2, [object_id3], [object_id3])
        assert_get_object_equal(
            self,
            self.client1,
            self.client2,
            object_id3,
            memory_buffer=memory_buffer3,
            metadata=metadata3)

    def test_fetch_multiple(self):
        for _ in range(20):
            # Create two objects and a third fake one that doesn't exist.
            object_id1, memory_buffer1, metadata1 = create_object(
                self.client1, 2000, 2000)
            missing_object_id = random_object_id()
            object_id2, memory_buffer2, metadata2 = create_object(
                self.client1, 2000, 2000)
            object_ids = [object_id1, missing_object_id, object_id2]
            expected = [(object_id1, memory_buffer1, metadata1),
                        (object_id2, memory_buffer2, metadata2)]
            # Fetch the objects from the other plasma store. The second object
            # ID should timeout since it does not exist.
            self._fetch_until_contains(self.client2, object_ids,
                                       [object_id1, object_id2])
            # Compare the buffers of the objects that do exist.
            for object_id, memory_buffer, metadata in expected:
                assert_get_object_equal(
                    self,
                    self.client1,
                    self.client2,
                    object_id,
                    memory_buffer=memory_buffer,
                    metadata=metadata)
            # Fetch in the other direction. The fake object still does not
            # exist.
            self.client1.fetch(object_ids)
            for object_id, memory_buffer, metadata in expected:
                assert_get_object_equal(
                    self,
                    self.client2,
                    self.client1,
                    object_id,
                    memory_buffer=memory_buffer,
                    metadata=metadata)

        # Check that we can call fetch with duplicated object IDs.
        object_id3 = random_object_id()
        self.client1.fetch([object_id3, object_id3])
        object_id4, memory_buffer4, metadata4 = create_object(
            self.client1, 2000, 2000)
        time.sleep(0.1)
        self._fetch_until_contains(
            self.client2, [object_id3, object_id3, object_id4, object_id4],
            [object_id4])
        assert_get_object_equal(
            self,
            self.client2,
            self.client1,
            object_id4,
            memory_buffer=memory_buffer4,
            metadata=metadata4)

    def test_wait(self):
        # Test timeout.
        obj_id0 = random_object_id()
        self.client1.wait([obj_id0], timeout=100, num_returns=1)
        # If we get here, the test worked.

        # Test wait if local objects available.
        obj_id1 = random_object_id()
        self.client1.create(obj_id1, 1000)
        self.client1.seal(obj_id1)
        ready, waiting = self.client1.wait(
            [obj_id1], timeout=100, num_returns=1)
        self.assertEqual(set(ready), {obj_id1})
        self.assertEqual(waiting, [])

        # Test wait if only one object available and only one object waited
        # for.
        obj_id2 = random_object_id()
        self.client1.create(obj_id2, 1000)
        # Don't seal.
        ready, waiting = self.client1.wait(
            [obj_id2, obj_id1], timeout=100, num_returns=1)
        self.assertEqual(set(ready), {obj_id1})
        self.assertEqual(set(waiting), {obj_id2})

        # Test wait if object is sealed later.
        obj_id3 = random_object_id()

        def finish():
            self.client2.create(obj_id3, 1000)
            self.client2.seal(obj_id3)

        t = threading.Timer(0.1, finish)
        t.start()
        try:
            ready, waiting = self.client1.wait(
                [obj_id3, obj_id2, obj_id1], timeout=1000, num_returns=2)
        finally:
            # Make sure the timer thread does not outlive this test.
            t.join()
        self.assertEqual(set(ready), {obj_id1, obj_id3})
        self.assertEqual(set(waiting), {obj_id2})

        # Test if the appropriate number of objects is shown if some objects
        # are not ready.
        ready, waiting = self.client1.wait([obj_id3, obj_id2, obj_id1], 100, 3)
        self.assertEqual(set(ready), {obj_id1, obj_id3})
        self.assertEqual(set(waiting), {obj_id2})

        # Don't forget to seal obj_id2.
        self.client1.seal(obj_id2)

        # Test calling wait a bunch of times.
        object_ids = []
        # TODO(rkn): Increasing n to 100 (or larger) will cause failures. The
        # problem appears to be that the number of timers added to the manager
        # event loop slow down the manager so much that some of the
        # asynchronous Redis commands timeout triggering fatal failure
        # callbacks.
        n = 40
        for i in range(n * (n + 1) // 2):
            # Alternate between the two clients when creating objects.
            creator = self.client1 if i % 2 == 0 else self.client2
            object_id, _, _ = create_object(creator, 200, 200)
            object_ids.append(object_id)
        # Try waiting for all of the object IDs on the first client.
        self._check_incremental_wait(self.client1, object_ids, n)
        # Try waiting for all of the object IDs on the second client.
        self._check_incremental_wait(self.client2, object_ids, n)

        # Make sure that wait returns when the requested number of object IDs
        # are available and does not wait for all object IDs to be available.
        object_ids = [random_object_id() for _ in range(9)] + \
            [plasma.ObjectID(ray_constants.ID_SIZE * b'\x00')]
        object_ids_perm = object_ids[:]
        random.shuffle(object_ids_perm)
        for i in range(10):
            creator = self.client1 if i % 2 == 0 else self.client2
            create_object_with_id(creator, object_ids_perm[i], 2000, 2000)
            ready, waiting = self.client1.wait(object_ids, num_returns=(i + 1))
            self.assertEqual(set(ready), set(object_ids_perm[:(i + 1)]))
            self.assertEqual(set(waiting), set(object_ids_perm[(i + 1):]))

    def test_transfer(self):
        num_attempts = 100
        for _ in range(100):
            # Create an object.
            object_id1, memory_buffer1, metadata1 = create_object(
                self.client1, 2000, 2000)
            # Transfer the buffer to the other Plasma store.
            self._transfer_until_received(self.client1, self.client2,
                                          self.port2, object_id1,
                                          num_attempts)
            # Compare the two buffers.
            assert_get_object_equal(
                self,
                self.client1,
                self.client2,
                object_id1,
                memory_buffer=memory_buffer1,
                metadata=metadata1)
            # NOTE: Transferring the same buffer a second time and comparing
            # again is currently disabled.

            # Create an object.
            object_id2, memory_buffer2, metadata2 = create_object(
                self.client2, 20000, 20000)
            # Transfer the buffer to the other Plasma store.
            self._transfer_until_received(self.client2, self.client1,
                                          self.port1, object_id2,
                                          num_attempts)
            # Compare the two buffers.
            assert_get_object_equal(
                self,
                self.client1,
                self.client2,
                object_id2,
                memory_buffer=memory_buffer2,
                metadata=metadata2)

    def test_illegal_functionality(self):
        # NOTE: This test is disabled because it currently fails. The intent
        # is to create (but not seal) a buffer and to check that transferring
        # it before sealing raises an exception.
        pass

    def test_stresstest(self):
        start_time = time.time()
        object_ids = []
        for i in range(10000):  # TODO(pcm): increase this to 100000.
            object_id = random_object_id()
            object_ids.append(object_id)
            self.client1.create(object_id, 1)
            self.client1.seal(object_id)
        for object_id in object_ids:
            self.client1.transfer("127.0.0.1", self.port2, object_id)
        elapsed = time.time() - start_time

        print("it took", elapsed, "seconds to put and transfer the objects")
class TestPlasmaManagerRecovery(unittest.TestCase):
    """Tests for a plasma manager that starts after objects already exist.

    setUp starts one plasma store, a Redis server and one plasma manager,
    and connects a single client to them.
    """

    def setUp(self):
        """Start the store, Redis and one manager, then connect a client."""
        # Start a Plasma store.
        self.store_name, self.p2 = start_plasma_store(
            use_valgrind=USE_VALGRIND)
        # Start a Redis server.
        self.redis_address, _ = services.start_redis(
            "127.0.0.1", use_raylet=False)
        # Start a PlasmaManager.
        manager_name, self.p3, self.port1 = ray.plasma.start_plasma_manager(
            self.store_name, self.redis_address, use_valgrind=USE_VALGRIND)
        # Connect a PlasmaClient.
        self.client = plasma.connect(self.store_name, manager_name, 64)

        # Store the processes that will be explicitly killed during tearDown so
        # that a test case can remove ones that will be killed during the test.
        # NOTE: The plasma managers must be killed before the plasma store
        # since plasma store death will bring down the managers.
        self.processes_to_kill = [self.p3, self.p2]

    def tearDown(self):
        """Stop all started processes and Redis, then check none had died.

        The liveness assertion is deliberately made last: asserting first
        would skip the cleanup below whenever a process died during the
        test, leaking the remaining processes and the Redis server.
        """
        live_processes = []
        dead_processes = []
        for process in self.processes_to_kill:
            if process.poll() is None:
                live_processes.append(process)
            else:
                dead_processes.append(process)

        try:
            # Kill the Plasma store and Plasma manager processes.
            if USE_VALGRIND:
                # Give processes opportunity to finish work.
                time.sleep(1)
                for process in live_processes:
                    process.send_signal(signal.SIGTERM)
                    process.wait()
                    if process.returncode != 0:
                        print("aborting due to valgrind error")
                        os._exit(-1)
            else:
                for process in live_processes:
                    process.kill()
        finally:
            # Clean up the Redis server.
            services.cleanup()

        # Check that the processes were still alive at the end of the test.
        self.assertEqual(dead_processes, [])

    def test_delayed_start(self):
        num_objects = 10
        # Create some objects using one client.
        object_ids = [random_object_id() for _ in range(num_objects)]
        for object_id in object_ids:
            create_object_with_id(self.client, object_id, 2000, 2000)

        # Wait until the objects have been sealed in the store.
        ready, waiting = self.client.wait(object_ids, num_returns=num_objects)
        self.assertEqual(set(ready), set(object_ids))
        self.assertEqual(waiting, [])

        # Start a second plasma manager attached to the same store.
        manager_name, self.p5, self.port2 = ray.plasma.start_plasma_manager(
            self.store_name, self.redis_address, use_valgrind=USE_VALGRIND)
        # The new manager has to be killed before the older processes.
        self.processes_to_kill = [self.p5] + self.processes_to_kill

        # Check that the second manager knows about existing objects. Poll
        # without blocking until every object is reported as ready.
        client2 = plasma.connect(self.store_name, manager_name, 64)
        while True:
            ready, waiting = client2.wait(
                object_ids, num_returns=num_objects, timeout=0)
            if len(ready) == len(object_ids):
                break

        self.assertEqual(set(ready), set(object_ids))
        self.assertEqual(waiting, [])
if __name__ == "__main__":
    # A trailing "valgrind" argument switches the tests to run under
    # valgrind. Pop the argument so we don't mess with unittest's own
    # argument parser.
    if len(sys.argv) > 1 and sys.argv[-1] == "valgrind":
        arg = sys.argv.pop()
        USE_VALGRIND = True
        print("Using valgrind for tests")
    unittest.main(verbosity=2)
-53
View File
@@ -1,53 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import pyarrow.plasma as plasma
import ray.ray_constants as ray_constants
def random_object_id():
    """Return a plasma ObjectID made of ray_constants.ID_SIZE random bytes."""
    id_bytes = np.random.bytes(ray_constants.ID_SIZE)
    return plasma.ObjectID(id_bytes)
def generate_metadata(length):
    """Return a bytearray of the given length with some random bytes set.

    The buffer starts out zero-filled. For a positive length, the first
    byte, the last byte and 100 randomly drawn positions are overwritten
    with random values in [0, 255].

    Args:
        length (int): The size of the buffer to create.

    Returns:
        The generated bytearray.
    """
    metadata_buffer = bytearray(length)
    if length <= 0:
        return metadata_buffer

    metadata_buffer[0] = random.randint(0, 255)
    metadata_buffer[-1] = random.randint(0, 255)
    for _ in range(100):
        # Draw the value before the position so that the random number
        # stream is consumed in the order of a plain subscript assignment.
        value = random.randint(0, 255)
        position = random.randint(0, length - 1)
        metadata_buffer[position] = value
    return metadata_buffer
def write_to_data_buffer(buff, length):
    """Overwrite some bytes of a writable buffer with random values.

    For a positive length, the first byte, the last byte and 100 randomly
    drawn positions below length are set to random values in [0, 255]. The
    buffer is modified in place and nothing is returned.

    Args:
        buff: A writable object supporting the buffer protocol.
        length (int): The number of leading bytes eligible for the random
            writes; callers pass the size of the buffer.
    """
    byte_view = np.frombuffer(buff, dtype="uint8")
    if length <= 0:
        return

    byte_view[0] = random.randint(0, 255)
    byte_view[-1] = random.randint(0, 255)
    for _ in range(100):
        # Draw the value before the position so that the random number
        # stream is consumed in the order of a plain subscript assignment.
        value = random.randint(0, 255)
        position = random.randint(0, length - 1)
        byte_view[position] = value
def create_object_with_id(client,
                          object_id,
                          data_size,
                          metadata_size,
                          seal=True):
    """Create an object with random contents under the given object ID.

    Args:
        client: The client used to create (and optionally seal) the object.
        object_id: The ID to create the object under.
        data_size (int): The size of the data buffer to create.
        metadata_size (int): The size of the metadata to generate.
        seal (bool): True if the object should be sealed after its data
            has been written.

    Returns:
        A tuple of the data buffer returned by the client and the generated
            metadata.
    """
    object_metadata = generate_metadata(metadata_size)
    data_buffer = client.create(object_id, data_size, object_metadata)
    write_to_data_buffer(data_buffer, data_size)
    if not seal:
        return data_buffer, object_metadata
    client.seal(object_id)
    return data_buffer, object_metadata
def create_object(client, data_size, metadata_size, seal=True):
    """Create an object with random contents under a fresh random ID.

    Args:
        client: The client used to create (and optionally seal) the object.
        data_size (int): The size of the data buffer to create.
        metadata_size (int): The size of the metadata to generate.
        seal (bool): True if the object should be sealed after creation.

    Returns:
        A tuple of the new object ID, the data buffer and the metadata.
    """
    new_id = random_object_id()
    created = create_object_with_id(
        client, new_id, data_size, metadata_size, seal=seal)
    data_buffer, object_metadata = created
    return new_id, data_buffer, object_metadata
+7 -84
View File
@@ -59,17 +59,7 @@ def profile(event_type, extra_data=None, worker=None):
"""
if worker is None:
worker = ray.worker.global_worker
if not worker.use_raylet:
# Log the event if this is a worker and not a driver, since the
# driver's event log never gets flushed.
if worker.mode == ray.WORKER_MODE:
return RayLogSpanNonRaylet(
worker.profiler, event_type, contents=extra_data)
else:
return NULL_LOG_SPAN
else:
return RayLogSpanRaylet(
worker.profiler, event_type, extra_data=extra_data)
return RayLogSpanRaylet(worker.profiler, event_type, extra_data=extra_data)
class Profiler(object):
@@ -124,87 +114,20 @@ class Profiler(object):
events = self.events
self.events = []
if not self.worker.use_raylet:
event_log_key = b"event_log:" + self.worker.worker_id
event_log_value = json.dumps(events)
self.worker.local_scheduler_client.log_event(
event_log_key, event_log_value, time.time())
if self.worker.mode == ray.WORKER_MODE:
component_type = "worker"
else:
if self.worker.mode == ray.WORKER_MODE:
component_type = "worker"
else:
component_type = "driver"
component_type = "driver"
self.worker.local_scheduler_client.push_profile_events(
component_type, ray.ObjectID(self.worker.worker_id),
self.worker.node_ip_address, events)
self.worker.local_scheduler_client.push_profile_events(
component_type, ray.ObjectID(self.worker.worker_id),
self.worker.node_ip_address, events)
def add_event(self, event):
with self.lock:
self.events.append(event)
class RayLogSpanNonRaylet(object):
    """Context manager that logs the start and the end of a span of time.

    Attributes:
        profiler: The object whose add_event method receives the events.
        event_type (str): The type of the event being logged.
        contents: Additional information to log with the start of the span.
    """

    def __init__(self, profiler, event_type, contents=None):
        """Store the profiler, the event type and the optional contents."""
        self.profiler = profiler
        self.event_type = event_type
        self.contents = contents

    def _log(self, event_type, kind, contents=None):
        """Hand one event to the profiler.

        The event is passed to the profiler's add_event method as a
        (timestamp, event_type, kind, contents) tuple.

        Args:
            event_type (str): The type of the event.
            kind (int): Either LOG_POINT, LOG_SPAN_START, or LOG_SPAN_END. This
                is LOG_POINT if the event being logged happens at a single
                point in time. It is LOG_SPAN_START if we are starting to log a
                span of time, and it is LOG_SPAN_END if we are finishing
                logging a span of time.
            contents: More general data to store with the event. This must
                be a dict or None; keys and values are converted to strings.
        """
        # TODO(rkn): This code currently takes around half a microsecond. Since
        # we call it tens of times per task, this adds up. We will need to redo
        # the logging code, perhaps in C.
        if contents is None:
            contents = {}
        assert isinstance(contents, dict)
        # Make sure all of the keys and values in the dictionary are strings.
        stringified = {}
        for key, value in contents.items():
            stringified[str(key)] = str(value)
        event = (time.time(), event_type, kind, stringified)
        self.profiler.add_event(event)

    def __enter__(self):
        """Log the beginning of a span event."""
        self._log(
            event_type=self.event_type,
            contents=self.contents,
            kind=LOG_SPAN_START)

    def __exit__(self, type, value, tb):
        """Log the end of a span event. Log any exception that occurred."""
        if type is not None:
            # The span ended with an exception: record its details.
            failure = {
                "type": str(type),
                "value": value,
                "traceback": traceback.format_exc()
            }
            self._log(
                event_type=self.event_type,
                contents=failure,
                kind=LOG_SPAN_END)
            return
        self._log(event_type=self.event_type, kind=LOG_SPAN_END)
class RayLogSpanRaylet(object):
"""An object used to enable logging a span of events with a with statement.
+1 -2
View File
@@ -5,7 +5,7 @@ from __future__ import print_function
import os
from ray.local_scheduler import ObjectID
from ray.raylet import ObjectID
def env_integer(key, default):
@@ -41,7 +41,6 @@ REGISTER_ACTOR_PUSH_ERROR = "register_actor"
WORKER_CRASH_PUSH_ERROR = "worker_crash"
WORKER_DIED_PUSH_ERROR = "worker_died"
PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
HASH_MISMATCH_PUSH_ERROR = "object_hash_mismatch"
INFEASIBLE_TASK_ERROR = "infeasible_task"
REMOVED_NODE_ERROR = "node_removed"
MONITOR_DIED_ERROR = "monitor_died"
@@ -2,10 +2,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.core.src.local_scheduler.liblocal_scheduler_library_python import (
from ray.core.src.ray.raylet.liblocal_scheduler_library_python import (
Task, LocalSchedulerClient, ObjectID, check_simple_value, compute_task_id,
task_from_string, task_to_string, _config, common_error)
from .local_scheduler_services import start_local_scheduler
__all__ = [
"Task", "LocalSchedulerClient", "ObjectID", "check_simple_value",
+2 -5
View File
@@ -39,11 +39,8 @@ class TaskPool(object):
for worker, obj_id in self.completed():
plasma_id = ray.pyarrow.plasma.ObjectID(obj_id.id())
if not ray.global_state.use_raylet:
ray.worker.global_worker.plasma_client.fetch([plasma_id])
else:
(ray.worker.global_worker.local_scheduler_client.
reconstruct_objects([obj_id], True))
(ray.worker.global_worker.local_scheduler_client.
reconstruct_objects([obj_id], True))
self._fetching.append((worker, obj_id))
remaining = []
+5 -33
View File
@@ -5,7 +5,6 @@ from __future__ import print_function
import click
import json
import logging
import os
import subprocess
import ray.services as services
@@ -20,7 +19,7 @@ logger = logging.getLogger(__name__)
def check_no_existing_redis_clients(node_ip_address, redis_client):
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
# "src/ray/gcs/redis_module/ray_redis_module.cc" where it is defined.
REDIS_CLIENT_TABLE_PREFIX = "CL:"
client_keys = redis_client.keys("{}*".format(REDIS_CLIENT_TABLE_PREFIX))
# Filter to clients on the same node and do some basic checking.
@@ -167,11 +166,6 @@ def cli(logging_level, logging_format):
required=False,
type=str,
help="the file that contains the autoscaling config")
@click.option(
"--use-raylet",
default=None,
type=bool,
help="use the raylet code path, this defaults to false")
@click.option(
"--no-redirect-worker-output",
is_flag=True,
@@ -198,31 +192,15 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients, redis_password, redis_shard_ports,
object_manager_port, object_store_memory, num_workers, num_cpus,
num_gpus, resources, head, no_ui, block, plasma_directory,
huge_pages, autoscaling_config, use_raylet,
no_redirect_worker_output, no_redirect_output,
plasma_store_socket_name, raylet_socket_name, temp_dir):
huge_pages, autoscaling_config, no_redirect_worker_output,
no_redirect_output, plasma_store_socket_name, raylet_socket_name,
temp_dir):
# Convert hostnames to numerical IP address.
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
if redis_address is not None:
redis_address = services.address_to_ip(redis_address)
if use_raylet is None:
if os.environ.get("RAY_USE_XRAY") == "0":
# This environment variable is used in our testing setup.
logger.info("Detected environment variable 'RAY_USE_XRAY' with "
"value {}. This turns OFF xray.".format(
os.environ.get("RAY_USE_XRAY")))
use_raylet = False
else:
use_raylet = True
if not use_raylet and redis_password is not None:
raise Exception("Setting the 'redis-password' argument is not "
"supported in legacy Ray. To run Ray with "
"password-protected Redis ports, pass "
"the '--use-raylet' flag.")
try:
resources = json.loads(resources)
except Exception:
@@ -290,7 +268,6 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
@@ -369,7 +346,6 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
@@ -387,11 +363,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
@cli.command()
def stop():
subprocess.call(
[
"killall global_scheduler plasma_store_server plasma_manager "
"local_scheduler raylet raylet_monitor"
],
shell=True)
["killall plasma_store_server raylet raylet_monitor"], shell=True)
# Find the PID of the monitor process and kill it.
subprocess.call(
+33 -316
View File
@@ -14,32 +14,25 @@ import subprocess
import sys
import threading
import time
from collections import OrderedDict, namedtuple
from collections import OrderedDict
import redis
import pyarrow
# Ray modules
import ray.ray_constants
import ray.global_scheduler as global_scheduler
import ray.local_scheduler
import ray.plasma
from ray.tempfile_services import (
get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
get_temp_root, new_global_scheduler_log_file, new_local_scheduler_log_file,
new_log_monitor_log_file, new_monitor_log_file,
new_plasma_manager_log_file, new_plasma_store_log_file,
new_raylet_log_file, new_redis_log_file, new_webui_log_file,
new_worker_log_file, set_temp_root)
get_temp_root, new_log_monitor_log_file, new_monitor_log_file,
new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
new_webui_log_file, set_temp_root)
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_RAYLET = "raylet"
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
PROCESS_TYPE_GLOBAL_SCHEDULER = "global_scheduler"
PROCESS_TYPE_REDIS_SERVER = "redis_server"
PROCESS_TYPE_WEB_UI = "web_ui"
@@ -51,23 +44,20 @@ PROCESS_TYPE_WEB_UI = "web_ui"
all_processes = OrderedDict(
[(PROCESS_TYPE_MONITOR, []), (PROCESS_TYPE_LOG_MONITOR, []),
(PROCESS_TYPE_WORKER, []), (PROCESS_TYPE_RAYLET, []),
(PROCESS_TYPE_LOCAL_SCHEDULER, []), (PROCESS_TYPE_PLASMA_MANAGER, []),
(PROCESS_TYPE_PLASMA_STORE, []), (PROCESS_TYPE_GLOBAL_SCHEDULER, []),
(PROCESS_TYPE_REDIS_SERVER, []), (PROCESS_TYPE_WEB_UI, [])], )
(PROCESS_TYPE_PLASMA_STORE, []), (PROCESS_TYPE_REDIS_SERVER, []),
(PROCESS_TYPE_WEB_UI, [])], )
# True if processes are run in the valgrind profiler.
RUN_RAYLET_PROFILER = False
RUN_LOCAL_SCHEDULER_PROFILER = False
RUN_PLASMA_MANAGER_PROFILER = False
RUN_PLASMA_STORE_PROFILER = False
# Location of the redis server and module.
REDIS_EXECUTABLE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/common/thirdparty/redis/src/redis-server")
"core/src/ray/thirdparty/redis/src/redis-server")
REDIS_MODULE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/common/redis_module/libray_redis_module.so")
"core/src/ray/gcs/redis_module/libray_redis_module.so")
# Location of the credis server and modules.
# credis will be enabled if the environment variable RAY_USE_NEW_GCS is set.
@@ -88,14 +78,6 @@ RAYLET_MONITOR_EXECUTABLE = os.path.join(
RAYLET_EXECUTABLE = os.path.join(
os.path.abspath(os.path.dirname(__file__)), "core/src/ray/raylet/raylet")
# ObjectStoreAddress tuples contain all information necessary to connect to an
# object store. The fields are:
# - name: The socket name for the object store
# - manager_name: The socket name for the object store manager
# - manager_port: The Internet port that the object store manager listens on
ObjectStoreAddress = namedtuple("ObjectStoreAddress",
["name", "manager_name", "manager_port"])
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray configures it by default automatically
# using logging.basicConfig in its entry/init points.
@@ -136,10 +118,7 @@ def kill_process(p):
if p.poll() is not None:
# The process has already terminated.
return True
if any([
RUN_RAYLET_PROFILER, RUN_LOCAL_SCHEDULER_PROFILER,
RUN_PLASMA_MANAGER_PROFILER, RUN_PLASMA_STORE_PROFILER
]):
if any([RUN_RAYLET_PROFILER, RUN_PLASMA_STORE_PROFILER]):
# Give process signal to write profiler data.
os.kill(p.pid, signal.SIGINT)
# Wait for profiling data to be written.
@@ -430,7 +409,6 @@ def start_redis(node_ip_address,
redis_shard_ports=None,
num_redis_shards=1,
redis_max_clients=None,
use_raylet=True,
redirect_output=False,
redirect_worker_output=False,
cleanup=True,
@@ -450,7 +428,6 @@ def start_redis(node_ip_address,
shard.
redis_max_clients: If this is provided, Ray will attempt to configure
Redis with this maxclients number.
use_raylet: True if the new raylet code path should be used.
redirect_output (bool): True if output should be redirected to a file
and false otherwise.
redirect_worker_output (bool): True if worker output should be
@@ -515,12 +492,6 @@ def start_redis(node_ip_address,
port = assigned_port
redis_address = address(node_ip_address, port)
redis_client = redis.StrictRedis(
host=node_ip_address, port=port, password=password)
# Store whether we're using the raylet code path or not.
redis_client.set("UseRaylet", 1 if use_raylet else 0)
# Register the number of Redis shards in the primary shard, so that clients
# know how many redis shards to expect under RedisShards.
primary_redis_client = redis.StrictRedis(
@@ -762,40 +733,6 @@ def start_log_monitor(redis_address,
password=redis_password)
def start_global_scheduler(redis_address,
node_ip_address,
stdout_file=None,
stderr_file=None,
cleanup=True,
redis_password=None):
"""Start a global scheduler process and record its log files in Redis.

This is a thin wrapper around global_scheduler.start_global_scheduler that
additionally tracks the spawned process and registers its log files.

Args:
redis_address (str): The address of the Redis instance.
node_ip_address: The IP address of the node that this scheduler will
run on.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
no redirection should happen, then this should be None.
cleanup (bool): True if using Ray in local mode. If cleanup is true,
then this process will be killed by services.cleanup() when the
Python process that imported services exits.
redis_password (str): The password of the redis server. It is forwarded
to record_log_files_in_redis.
"""
# Launch the scheduler; the redirect handles are passed straight through.
p = global_scheduler.start_global_scheduler(
redis_address,
node_ip_address,
stdout_file=stdout_file,
stderr_file=stderr_file)
# Track the process under the global scheduler key of all_processes.
if cleanup:
all_processes[PROCESS_TYPE_GLOBAL_SCHEDULER].append(p)
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined here whether this call is nested under `if cleanup:` --
# confirm against the original file.
record_log_files_in_redis(
redis_address,
node_ip_address, [stdout_file, stderr_file],
password=redis_password)
def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
"""Start a UI process.
@@ -856,13 +793,11 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
return webui_url
def check_and_update_resources(resources, use_raylet):
def check_and_update_resources(resources):
"""Sanity check a resource dictionary and add sensible defaults.
Args:
resources: A dictionary mapping resource names to resource quantities.
use_raylet: True if we are using the raylet code path and false
otherwise.
Returns:
A new resource dictionary.
@@ -901,79 +836,13 @@ def check_and_update_resources(resources, use_raylet):
and not resource_quantity.is_integer()):
raise ValueError("Resource quantities must all be whole numbers.")
if (use_raylet and
resource_quantity > ray.ray_constants.MAX_RESOURCE_QUANTITY):
if resource_quantity > ray.ray_constants.MAX_RESOURCE_QUANTITY:
raise ValueError("Resource quantities must be at most {}.".format(
ray.ray_constants.MAX_RESOURCE_QUANTITY))
return resources
def start_local_scheduler(redis_address,
node_ip_address,
plasma_store_name,
plasma_manager_name,
worker_path,
plasma_address=None,
stdout_file=None,
stderr_file=None,
cleanup=True,
resources=None,
num_workers=0,
redis_password=None):
"""Start a local scheduler process and record its log files in Redis.

The resource dictionary is first passed through
check_and_update_resources, and the resulting dictionary is what the local
scheduler is started with.

Args:
redis_address (str): The address of the Redis instance.
node_ip_address (str): The IP address of the node that this local
scheduler is running on.
plasma_store_name (str): The name of the plasma store socket to connect
to.
plasma_manager_name (str): The name of the plasma manager socket to
connect to.
worker_path (str): The path of the script to use when the local
scheduler starts up new workers.
plasma_address: Forwarded unchanged to
ray.local_scheduler.start_local_scheduler.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
no redirection should happen, then this should be None.
cleanup (bool): True if using Ray in local mode. If cleanup is true,
then this process will be killed by services.cleanup() when the
Python process that imported services exits.
resources: A dictionary mapping the name of a resource to the available
quantity of that resource.
num_workers (int): The number of workers that the local scheduler
should start.
redis_password (str): The password of the redis server. It is forwarded
to record_log_files_in_redis.

Return:
The name of the local scheduler socket.
"""
# The second argument is the use_raylet flag; False is passed for this
# legacy (non-raylet) scheduler.
resources = check_and_update_resources(resources, False)
logger.info("Starting local scheduler with the following resources: {}."
.format(resources))
# The underlying call returns the socket name together with the process.
local_scheduler_name, p = ray.local_scheduler.start_local_scheduler(
plasma_store_name,
plasma_manager_name,
worker_path=worker_path,
node_ip_address=node_ip_address,
redis_address=redis_address,
plasma_address=plasma_address,
use_profiler=RUN_LOCAL_SCHEDULER_PROFILER,
stdout_file=stdout_file,
stderr_file=stderr_file,
static_resources=resources,
num_workers=num_workers)
# Track the process under the local scheduler key of all_processes.
if cleanup:
all_processes[PROCESS_TYPE_LOCAL_SCHEDULER].append(p)
# NOTE(review): indentation is not visible in this view, so it cannot be
# determined here whether this call is nested under `if cleanup:` --
# confirm against the original file.
record_log_files_in_redis(
redis_address,
node_ip_address, [stdout_file, stderr_file],
password=redis_password)
return local_scheduler_name
def start_raylet(redis_address,
node_ip_address,
raylet_name,
@@ -1017,7 +886,7 @@ def start_raylet(redis_address,
if use_valgrind and use_profiler:
raise Exception("Cannot use valgrind and profiler at the same time.")
static_resources = check_and_update_resources(resources, True)
static_resources = check_and_update_resources(resources)
# Limit the number of workers that can be started in parallel by the
# raylet. However, make sure it is at least 1.
@@ -1093,13 +962,10 @@ def start_plasma_store(node_ip_address,
object_manager_port=None,
store_stdout_file=None,
store_stderr_file=None,
manager_stdout_file=None,
manager_stderr_file=None,
objstore_memory=None,
cleanup=True,
plasma_directory=None,
huge_pages=False,
use_raylet=True,
plasma_store_socket_name=None,
redis_password=None):
"""This method starts an object store process.
@@ -1114,12 +980,6 @@ def start_plasma_store(node_ip_address,
to. If no redirection should happen, then this should be None.
store_stderr_file: A file handle opened for writing to redirect stderr
to. If no redirection should happen, then this should be None.
manager_stdout_file: A file handle opened for writing to redirect
stdout to. If no redirection should happen, then this should be
None.
manager_stderr_file: A file handle opened for writing to redirect
stderr to. If no redirection should happen, then this should be
None.
objstore_memory: The amount of memory (in bytes) to start the object
store with.
cleanup (bool): True if using Ray in local mode. If cleanup is true,
@@ -1129,12 +989,10 @@ def start_plasma_store(node_ip_address,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used.
redis_password (str): The password of the redis server.
Return:
A tuple of the Plasma store socket name, the Plasma manager socket
name, and the plasma manager port.
The Plasma store socket name.
"""
if objstore_memory is None:
# Compute a fraction of the system memory for the Plasma store to use.
@@ -1177,32 +1035,6 @@ def start_plasma_store(node_ip_address,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
socket_name=plasma_store_socket_name)
# Start the plasma manager.
if not use_raylet:
if object_manager_port is not None:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
plasma_manager_port=object_manager_port,
node_ip_address=node_ip_address,
num_retries=1,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
assert plasma_manager_port == object_manager_port
else:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
node_ip_address=node_ip_address,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
else:
plasma_manager_port = None
plasma_manager_name = None
if cleanup:
all_processes[PROCESS_TYPE_PLASMA_STORE].append(p1)
@@ -1210,19 +1042,12 @@ def start_plasma_store(node_ip_address,
redis_address,
node_ip_address, [store_stdout_file, store_stderr_file],
password=redis_password)
if not use_raylet:
if cleanup:
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
record_log_files_in_redis(redis_address, node_ip_address,
[manager_stdout_file, manager_stderr_file])
return ObjectStoreAddress(plasma_store_name, plasma_manager_name,
plasma_manager_port)
return plasma_store_name
def start_worker(node_ip_address,
object_store_name,
object_store_manager_name,
local_scheduler_name,
redis_address,
worker_path,
@@ -1235,7 +1060,6 @@ def start_worker(node_ip_address,
node_ip_address (str): The IP address of the node that this worker is
running on.
object_store_name (str): The name of the object store.
object_store_manager_name (str): The name of the object store manager.
local_scheduler_name (str): The name of the local scheduler.
redis_address (str): The address that the Redis server is listening on.
worker_path (str): The path of the source code which the worker process
@@ -1253,7 +1077,6 @@ def start_worker(node_ip_address,
sys.executable, "-u", worker_path,
"--node-ip-address=" + node_ip_address,
"--object-store-name=" + object_store_name,
"--object-store-manager-name=" + object_store_manager_name,
"--local-scheduler-name=" + local_scheduler_name,
"--redis-address=" + str(redis_address),
"--temp-dir=" + get_temp_root()
@@ -1349,7 +1172,6 @@ def start_ray_processes(address_info=None,
cleanup=True,
redirect_worker_output=False,
redirect_output=False,
include_global_scheduler=False,
include_log_monitor=False,
include_webui=False,
start_workers_from_local_scheduler=True,
@@ -1357,7 +1179,6 @@ def start_ray_processes(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=True,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
@@ -1398,8 +1219,6 @@ def start_ray_processes(address_info=None,
processes should be redirected to files.
redirect_output (bool): True if stdout and stderr for non-worker
processes should be redirected to files and false otherwise.
include_global_scheduler (bool): If include_global_scheduler is True,
then start a global scheduler process.
include_log_monitor (bool): If True, then start a log monitor to
monitor the log files for all processes on this node and push their
contents to Redis.
@@ -1415,7 +1234,6 @@ def start_ray_processes(address_info=None,
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
@@ -1469,7 +1287,6 @@ def start_ray_processes(address_info=None,
redis_shard_ports=redis_shard_ports,
num_redis_shards=num_redis_shards,
redis_max_clients=redis_max_clients,
use_raylet=use_raylet,
redirect_output=True,
redirect_worker_output=redirect_worker_output,
cleanup=cleanup,
@@ -1488,13 +1305,12 @@ def start_ray_processes(address_info=None,
cleanup=cleanup,
autoscaling_config=autoscaling_config,
redis_password=redis_password)
if use_raylet:
start_raylet_monitor(
redis_address,
stdout_file=monitor_stdout_file,
stderr_file=monitor_stderr_file,
cleanup=cleanup,
redis_password=redis_password)
start_raylet_monitor(
redis_address,
stdout_file=monitor_stdout_file,
stderr_file=monitor_stderr_file,
cleanup=cleanup,
redis_password=redis_password)
if redis_shards == []:
# Get redis shards from primary redis instance.
redis_ip_address, redis_port = redis_address.split(":")
@@ -1516,25 +1332,10 @@ def start_ray_processes(address_info=None,
cleanup=cleanup,
redis_password=redis_password)
# Start the global scheduler, if necessary.
if include_global_scheduler and not use_raylet:
global_scheduler_stdout_file, global_scheduler_stderr_file = (
new_global_scheduler_log_file(redirect_output))
start_global_scheduler(
redis_address,
node_ip_address,
stdout_file=global_scheduler_stdout_file,
stderr_file=global_scheduler_stderr_file,
cleanup=cleanup,
redis_password=redis_password)
# Initialize with existing services.
if "object_store_addresses" not in address_info:
address_info["object_store_addresses"] = []
object_store_addresses = address_info["object_store_addresses"]
if "local_scheduler_socket_names" not in address_info:
address_info["local_scheduler_socket_names"] = []
local_scheduler_socket_names = address_info["local_scheduler_socket_names"]
if "raylet_socket_names" not in address_info:
address_info["raylet_socket_names"] = []
raylet_socket_names = address_info["raylet_socket_names"]
@@ -1552,114 +1353,37 @@ def start_ray_processes(address_info=None,
plasma_store_stdout_file, plasma_store_stderr_file = (
new_plasma_store_log_file(i, redirect_output))
# If we use raylet, plasma manager won't be started and we don't need
# to create temp files for them.
plasma_manager_stdout_file, plasma_manager_stderr_file = (
new_plasma_manager_log_file(i, redirect_output and not use_raylet))
object_store_address = start_plasma_store(
node_ip_address,
redis_address,
object_manager_port=object_manager_ports[i],
store_stdout_file=plasma_store_stdout_file,
store_stderr_file=plasma_store_stderr_file,
manager_stdout_file=plasma_manager_stdout_file,
manager_stderr_file=plasma_manager_stderr_file,
objstore_memory=object_store_memory,
cleanup=cleanup,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
redis_password=redis_password)
object_store_addresses.append(object_store_address)
time.sleep(0.1)
if not use_raylet:
# Start any local schedulers that do not yet exist.
for i in range(
len(local_scheduler_socket_names), num_local_schedulers):
# Connect the local scheduler to the object store at the same
# index.
object_store_address = object_store_addresses[i]
plasma_address = "{}:{}".format(node_ip_address,
object_store_address.manager_port)
# Determine how many workers this local scheduler should start.
if start_workers_from_local_scheduler:
num_local_scheduler_workers = workers_per_local_scheduler[i]
workers_per_local_scheduler[i] = 0
else:
# If we're starting the workers from Python, the local
# scheduler should not start any workers.
num_local_scheduler_workers = 0
# Start the local scheduler. Note that if we do not wish to
# redirect the worker output, then we cannot redirect the local
# scheduler output.
local_scheduler_stdout_file, local_scheduler_stderr_file = (
new_local_scheduler_log_file(
i, redirect_output=redirect_worker_output))
local_scheduler_name = start_local_scheduler(
# Start any raylets that do not exist yet.
for i in range(len(raylet_socket_names), num_local_schedulers):
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
i, redirect_output=redirect_worker_output)
address_info["raylet_socket_names"].append(
start_raylet(
redis_address,
node_ip_address,
object_store_address.name,
object_store_address.manager_name,
raylet_socket_name or get_raylet_socket_name(),
object_store_addresses[i],
worker_path,
plasma_address=plasma_address,
stdout_file=local_scheduler_stdout_file,
stderr_file=local_scheduler_stderr_file,
cleanup=cleanup,
resources=resources[i],
num_workers=num_local_scheduler_workers,
redis_password=redis_password)
local_scheduler_socket_names.append(local_scheduler_name)
# Make sure that we have exactly num_local_schedulers instances of
# object stores and local schedulers.
assert len(object_store_addresses) == num_local_schedulers
assert len(local_scheduler_socket_names) == num_local_schedulers
else:
# Start any raylets that do not exist yet.
for i in range(len(raylet_socket_names), num_local_schedulers):
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
i, redirect_output=redirect_worker_output)
address_info["raylet_socket_names"].append(
start_raylet(
redis_address,
node_ip_address,
raylet_socket_name or get_raylet_socket_name(),
object_store_addresses[i].name,
worker_path,
resources=resources[i],
num_workers=workers_per_local_scheduler[i],
stdout_file=raylet_stdout_file,
stderr_file=raylet_stderr_file,
cleanup=cleanup,
redis_password=redis_password))
if not use_raylet:
# Start any workers that the local scheduler has not already started.
for i, num_local_scheduler_workers in enumerate(
workers_per_local_scheduler):
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
worker_stdout_file, worker_stderr_file = new_worker_log_file(
i, j, redirect_output)
start_worker(
node_ip_address,
object_store_address.name,
object_store_address.manager_name,
local_scheduler_name,
redis_address,
worker_path,
stdout_file=worker_stdout_file,
stderr_file=worker_stderr_file,
cleanup=cleanup)
workers_per_local_scheduler[i] -= 1
# Make sure that we've started all the workers.
assert (sum(workers_per_local_scheduler) == 0)
num_workers=workers_per_local_scheduler[i],
stdout_file=raylet_stdout_file,
stderr_file=raylet_stderr_file,
cleanup=cleanup,
redis_password=redis_password))
# Try to start the web UI.
if include_webui:
@@ -1689,7 +1413,6 @@ def start_ray_node(node_ip_address,
resources=None,
plasma_directory=None,
huge_pages=False,
use_raylet=True,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
@@ -1727,7 +1450,6 @@ def start_ray_node(node_ip_address,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
@@ -1758,7 +1480,6 @@ def start_ray_node(node_ip_address,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
@@ -1784,7 +1505,6 @@ def start_ray_head(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=True,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
@@ -1836,7 +1556,6 @@ def start_ray_head(address_info=None,
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
@@ -1861,7 +1580,6 @@ def start_ray_head(address_info=None,
cleanup=cleanup,
redirect_worker_output=redirect_worker_output,
redirect_output=redirect_output,
include_global_scheduler=True,
include_log_monitor=True,
include_webui=include_webui,
start_workers_from_local_scheduler=start_workers_from_local_scheduler,
@@ -1872,7 +1590,6 @@ def start_ray_head(address_info=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
-60
View File
@@ -117,27 +117,6 @@ def get_object_store_socket_name():
return make_inc_temp(prefix="plasma_store", directory_name=sockets_dir)
def get_plasma_manager_socket_name():
    """Return a new socket name for the plasma manager.

    The name is produced by make_inc_temp with the "plasma_manager" prefix
    inside the directory returned by get_sockets_dir_path().
    """
    socket_directory = get_sockets_dir_path()
    manager_socket_name = make_inc_temp(
        directory_name=socket_directory, prefix="plasma_manager")
    return manager_socket_name
def get_local_scheduler_socket_name(suffix=""):
    """Return a new socket name for the local scheduler.

    The name is produced by make_inc_temp with the "scheduler" prefix and the
    given suffix, inside the directory returned by get_sockets_dir_path().

    Warning: this function could be unsafe. The returned name may refer to a
    file that did not exist at some point, but someone else may create it
    before the caller does.

    Args:
        suffix: Passed through to make_inc_temp as its suffix argument.

    Returns:
        The socket name returned by make_inc_temp.
    """
    socket_directory = get_sockets_dir_path()
    return make_inc_temp(
        suffix=suffix, directory_name=socket_directory, prefix="scheduler")
def get_ipython_notebook_path(port):
"""Get a new ipython notebook path"""
@@ -211,17 +190,6 @@ def new_raylet_log_file(local_scheduler_index, redirect_output):
return raylet_stdout_file, raylet_stderr_file
def new_local_scheduler_log_file(local_scheduler_index, redirect_output):
    """Create the stdout/stderr log files for one local scheduler.

    It is only used in non-raylet versions.

    Args:
        local_scheduler_index: Index formatted into the log name
            ("local_scheduler_<index>").
        redirect_output: Passed through to new_log_files as redirect_output.

    Returns:
        A (stdout_file, stderr_file) pair as produced by new_log_files.
    """
    log_name = "local_scheduler_{}".format(local_scheduler_index)
    stdout_file, stderr_file = new_log_files(
        log_name, redirect_output=redirect_output)
    return stdout_file, stderr_file
def new_webui_log_file():
"""Create new logging files for web ui."""
ui_stdout_file, ui_stderr_file = new_log_files(
@@ -229,17 +197,6 @@ def new_webui_log_file():
return ui_stdout_file, ui_stderr_file
def new_worker_log_file(local_scheduler_index, worker_index, redirect_output):
    """Create the stdout/stderr log files for one worker of a local scheduler.

    It is only used in non-raylet versions.

    Args:
        local_scheduler_index: Index of the local scheduler; first field of
            the log name ("worker_<scheduler>_<worker>").
        worker_index: Index of the worker; second field of the log name.
        redirect_output: Passed through to new_log_files.

    Returns:
        A (stdout_file, stderr_file) pair as produced by new_log_files.
    """
    log_name = "worker_{}_{}".format(local_scheduler_index, worker_index)
    stdout_file, stderr_file = new_log_files(log_name, redirect_output)
    return stdout_file, stderr_file
def new_worker_redirected_log_file(worker_id):
"""Create new logging files for workers to redirect its output."""
worker_stdout_file, worker_stderr_file = (new_log_files(
@@ -254,16 +211,6 @@ def new_log_monitor_log_file():
return log_monitor_stdout_file, log_monitor_stderr_file
def new_global_scheduler_log_file(redirect_output):
    """Create the stdout/stderr log files for the global scheduler.

    It is only used in non-raylet versions.

    Args:
        redirect_output: Passed through to new_log_files.

    Returns:
        A (stdout_file, stderr_file) pair as produced by new_log_files.
    """
    stdout_file, stderr_file = new_log_files("global_scheduler",
                                             redirect_output)
    return stdout_file, stderr_file
def new_plasma_store_log_file(local_scheduler_index, redirect_output):
"""Create new logging files for the plasma store."""
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
@@ -271,13 +218,6 @@ def new_plasma_store_log_file(local_scheduler_index, redirect_output):
return plasma_store_stdout_file, plasma_store_stderr_file
def new_plasma_manager_log_file(local_scheduler_index, redirect_output):
    """Create the stdout/stderr log files for one plasma manager.

    Args:
        local_scheduler_index: Index formatted into the log name
            ("plasma_manager_<index>").
        redirect_output: Passed through to new_log_files.

    Returns:
        A (stdout_file, stderr_file) pair as produced by new_log_files.
    """
    log_name = "plasma_manager_{}".format(local_scheduler_index)
    stdout_file, stderr_file = new_log_files(log_name, redirect_output)
    return stdout_file, stderr_file
def new_monitor_log_file(redirect_output):
"""Create new logging files for the monitor."""
monitor_stdout_file, monitor_stderr_file = new_log_files(
+10 -8
View File
@@ -44,7 +44,6 @@ class Cluster(object):
All nodes are by default started with the following settings:
cleanup=True,
use_raylet=True,
resources={"CPU": 1},
object_store_memory=100 * (2**20) # 100 MB
@@ -55,12 +54,13 @@ class Cluster(object):
Returns:
Node object of the added Ray node.
"""
node_kwargs = dict(
cleanup=True,
use_raylet=True,
resources={"CPU": 1},
object_store_memory=100 * (2**20) # 100 MB
)
node_kwargs = {
"cleanup": True,
"resources": {
"CPU": 1
},
"object_store_memory": 100 * (2**20) # 100 MB
}
node_kwargs.update(override_kwargs)
if self.head_node is None:
@@ -179,7 +179,9 @@ class Node(object):
for process_name, process_list in self.process_dict.items():
logger.info("Killing all {}(s)".format(process_name))
for process in process_list:
process.kill()
# Kill the process if it is still alive.
if process.poll() is None:
process.kill()
for process_name, process_list in self.process_dict.items():
logger.info("Waiting all {}(s)".format(process_name))
-3
View File
@@ -28,9 +28,6 @@ class TestRedisPassword(object):
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="New GCS API doesn't support Redis authentication yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") == "0",
reason="Redis authentication is not supported in legacy Ray.")
def test_redis_password(self, password, shutdown_only):
# Workaround for https://github.com/ray-project/ray/issues/3045
@ray.remote
+3 -14
View File
@@ -35,22 +35,11 @@ def _wait_for_nodes_to_join(num_nodes, timeout=20):
client_table = ray.global_state.client_table()
num_ready_nodes = len(client_table)
if num_ready_nodes == num_nodes:
ready = True
# Check that for each node, a local scheduler and a plasma manager
# are present.
if ray.global_state.use_raylet:
# In raylet mode, this is a list of map.
# The GCS info will appear as a whole instead of part by part.
return
else:
for ip_address, clients in client_table.items():
client_types = [client["ClientType"] for client in clients]
if "local_scheduler" not in client_types:
ready = False
if "plasma_manager" not in client_types:
ready = False
if ready:
return
# In raylet mode, this is a list of map.
# The GCS info will appear as a whole instead of part by part.
return
if num_ready_nodes > num_nodes:
# Too many nodes have joined. Something must be wrong.
raise Exception("{} nodes have joined the cluster, but we were "
+3 -14
View File
@@ -213,20 +213,9 @@ class RayTrialExecutor(TrialExecutor):
assert self._committed_resources.gpu >= 0
def _update_avail_resources(self):
if ray.worker.global_worker.use_raylet:
# TODO(rliaw): Remove once raylet flag is swapped
resources = ray.global_state.cluster_resources()
num_cpus = resources["CPU"]
num_gpus = resources["GPU"]
else:
clients = ray.global_state.client_table()
local_schedulers = [
entry for client in clients.values() for entry in client
if (entry['ClientType'] == 'local_scheduler'
and not entry['Deleted'])
]
num_cpus = sum(ls['CPU'] for ls in local_schedulers)
num_gpus = sum(ls.get('GPU', 0) for ls in local_schedulers)
resources = ray.global_state.cluster_resources()
num_cpus = resources["CPU"]
num_gpus = resources["GPU"]
self._avail_resources = Resources(int(num_cpus), int(num_gpus))
self._resources_initialized = True
+6 -6
View File
@@ -107,7 +107,7 @@ class TrainableFunctionApiTest(unittest.TestCase):
return Resources(cpu=config["cpu"], gpu=config["gpu"])
def _train(self):
return dict(timesteps_this_iter=1, done=True)
return {"timesteps_this_iter": 1, "done": True}
register_trainable("B", B)
@@ -440,7 +440,7 @@ class TrainableFunctionApiTest(unittest.TestCase):
self.state = {"hi": 1}
def _train(self):
return dict(timesteps_this_iter=1, done=True)
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
return self.state
@@ -471,7 +471,7 @@ class TrainableFunctionApiTest(unittest.TestCase):
def _train(self):
self.state["iter"] += 1
return dict(timesteps_this_iter=1, done=True)
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
return self.state
@@ -604,7 +604,7 @@ class RunExperimentTest(unittest.TestCase):
class B(Trainable):
def _train(self):
return dict(timesteps_this_iter=1, done=True)
return {"timesteps_this_iter": 1, "done": True}
register_trainable("f1", train)
trials = run_experiments({
@@ -624,7 +624,7 @@ class RunExperimentTest(unittest.TestCase):
def testCheckpointAtEnd(self):
class train(Trainable):
def _train(self):
return dict(timesteps_this_iter=1, done=True)
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
return path
@@ -887,7 +887,7 @@ class TrialRunnerTest(unittest.TestCase):
self.assertEqual(trials[1].status, Trial.PENDING)
def testFractionalGpus(self):
ray.init(num_cpus=4, num_gpus=1, use_raylet=True)
ray.init(num_cpus=4, num_gpus=1)
runner = TrialRunner(BasicVariantGenerator())
kwargs = {
"resources": Resources(cpu=1, gpu=0.5),
+1 -1
View File
@@ -28,7 +28,7 @@ def pin_in_object_store(obj):
def get_pinned_object(pinned_id):
"""Retrieve a pinned object from the object store."""
from ray.local_scheduler import ObjectID
from ray.raylet import ObjectID
return _from_pinnable(
ray.get(
+10 -33
View File
@@ -15,11 +15,9 @@ import time
import uuid
import ray.gcs_utils
import ray.local_scheduler
import ray.raylet
import ray.ray_constants as ray_constants
ERROR_KEY_PREFIX = b"Error:"
def _random_string():
id_hash = hashlib.sha1()
@@ -70,22 +68,12 @@ def push_error_to_driver(worker,
"""
if driver_id is None:
driver_id = ray_constants.NIL_JOB_ID.id()
error_key = ERROR_KEY_PREFIX + driver_id + b":" + _random_string()
data = {} if data is None else data
if not worker.use_raylet:
worker.redis_client.hmset(error_key, {
"type": error_type,
"message": message,
"data": data
})
worker.redis_client.rpush("ErrorKeys", error_key)
else:
worker.local_scheduler_client.push_error(
ray.ObjectID(driver_id), error_type, message, time.time())
worker.local_scheduler_client.push_error(
ray.ObjectID(driver_id), error_type, message, time.time())
def push_error_to_driver_through_redis(redis_client,
use_raylet,
error_type,
message,
driver_id=None,
@@ -99,8 +87,6 @@ def push_error_to_driver_through_redis(redis_client,
Args:
redis_client: The redis client to use.
use_raylet: True if we are using the Raylet code path and false
otherwise.
error_type (str): The type of the error.
message (str): The message that will be printed in the background
on the driver.
@@ -111,23 +97,14 @@ def push_error_to_driver_through_redis(redis_client,
"""
if driver_id is None:
driver_id = ray_constants.NIL_JOB_ID.id()
error_key = ERROR_KEY_PREFIX + driver_id + b":" + _random_string()
data = {} if data is None else data
if not use_raylet:
redis_client.hmset(error_key, {
"type": error_type,
"message": message,
"data": data
})
redis_client.rpush("ErrorKeys", error_key)
else:
# Do everything in Python and through the Python Redis client instead
# of through the raylet.
error_data = ray.gcs_utils.construct_error_message(
driver_id, error_type, message, time.time())
redis_client.execute_command(
"RAY.TABLE_APPEND", ray.gcs_utils.TablePrefix.ERROR_INFO,
ray.gcs_utils.TablePubsub.ERROR_INFO, driver_id, error_data)
# Do everything in Python and through the Python Redis client instead
# of through the raylet.
error_data = ray.gcs_utils.construct_error_message(driver_id, error_type,
message, time.time())
redis_client.execute_command(
"RAY.TABLE_APPEND", ray.gcs_utils.TablePrefix.ERROR_INFO,
ray.gcs_utils.TablePubsub.ERROR_INFO, driver_id, error_data)
def is_cython(obj):
+89 -306
View File
@@ -27,14 +27,13 @@ import ray.serialization as serialization
import ray.services as services
import ray.signature
import ray.tempfile_services as tempfile_services
import ray.local_scheduler
import ray.raylet
import ray.plasma
import ray.ray_constants as ray_constants
from ray import import_thread
from ray import profiling
from ray.function_manager import FunctionActorManager
from ray.utils import (
binary_to_hex,
check_oversized_pickle,
is_cython,
random_string,
@@ -56,14 +55,6 @@ NIL_ACTOR_ID = NIL_ID
NIL_ACTOR_HANDLE_ID = NIL_ID
NIL_CLIENT_ID = ray_constants.ID_SIZE * b"\xff"
# This must be kept in sync with the `error_types` array in
# common/state/error_table.h.
OBJECT_HASH_MISMATCH_ERROR_TYPE = b"object_hash_mismatch"
PUT_RECONSTRUCTION_ERROR_TYPE = b"put_reconstruction"
# This must be kept in sync with the `scheduling_state` enum in common/task.h.
TASK_STATUS_RUNNING = 8
# Default resource requirements for actors when no resource requirements are
# specified.
DEFAULT_ACTOR_METHOD_CPUS_SIMPLE_CASE = 1
@@ -461,13 +452,9 @@ class Worker(object):
]
for i in range(0, len(object_ids),
ray._config.worker_fetch_request_size()):
if not self.use_raylet:
self.plasma_client.fetch(plain_object_ids[i:(
i + ray._config.worker_fetch_request_size())])
else:
self.local_scheduler_client.reconstruct_objects(
object_ids[i:(
i + ray._config.worker_fetch_request_size())], True)
self.local_scheduler_client.reconstruct_objects(
object_ids[i:(i + ray._config.worker_fetch_request_size())],
True)
# Get the objects. We initially try to get the objects immediately.
final_results = self.retrieve_and_deserialize(plain_object_ids, 0)
@@ -497,25 +484,9 @@ class Worker(object):
ray._config.worker_fetch_request_size())
for i in range(0, len(object_ids_to_fetch),
fetch_request_size):
if not self.use_raylet:
for unready_id in ray_object_ids_to_fetch[i:(
i + fetch_request_size)]:
(self.local_scheduler_client.
reconstruct_objects([unready_id], False))
# Do another fetch for objects that aren't
# available locally yet, in case they were evicted
# since the last fetch. We divide the fetch into
# smaller fetches so as to not block the manager
# for a prolonged period of time in a single call.
# This is only necessary for legacy ray since
# reconstruction and fetch are implemented by
# different processes.
self.plasma_client.fetch(object_ids_to_fetch[i:(
i + fetch_request_size)])
else:
self.local_scheduler_client.reconstruct_objects(
ray_object_ids_to_fetch[i:(
i + fetch_request_size)], False)
self.local_scheduler_client.reconstruct_objects(
ray_object_ids_to_fetch[i:(
i + fetch_request_size)], False)
results = self.retrieve_and_deserialize(
object_ids_to_fetch,
max([
@@ -608,7 +579,7 @@ class Worker(object):
for arg in args:
if isinstance(arg, ray.ObjectID):
args_for_local_scheduler.append(arg)
elif ray.local_scheduler.check_simple_value(arg):
elif ray.raylet.check_simple_value(arg):
args_for_local_scheduler.append(arg)
else:
args_for_local_scheduler.append(put(arg))
@@ -641,14 +612,13 @@ class Worker(object):
task_index = self.task_index
self.task_index += 1
# Submit the task to local scheduler.
task = ray.local_scheduler.Task(
task = ray.raylet.Task(
driver_id, ray.ObjectID(
function_id.id()), args_for_local_scheduler,
num_return_vals, self.current_task_id, task_index,
actor_creation_id, actor_creation_dummy_object_id, actor_id,
actor_handle_id, actor_counter, is_actor_checkpoint_method,
execution_dependencies, resources, placement_resources,
self.use_raylet)
actor_handle_id, actor_counter, execution_dependencies,
resources, placement_resources)
self.local_scheduler_client.submit(task)
return task.returns()
@@ -925,26 +895,13 @@ class Worker(object):
# good to know where the system is hanging.
with self.lock:
function_name = execution_info.function_name
if not self.use_raylet:
extra_data = {
"function_name": function_name,
"task_id": task.task_id().hex(),
"worker_id": binary_to_hex(self.worker_id)
}
else:
extra_data = {
"name": function_name,
"task_id": task.task_id().hex()
}
extra_data = {
"name": function_name,
"task_id": task.task_id().hex()
}
with profiling.profile("task", extra_data=extra_data, worker=self):
self._process_task(task, execution_info)
# In the non-raylet code path, push all of the log events to the global
# state store. In the raylet code path, this is done periodically in a
# background thread.
if not self.use_raylet:
self.profiler.flush_profile_data()
# Increase the task execution counter.
self.function_actor_manager.increase_task_counter(
driver_id, function_id.id())
@@ -998,13 +955,10 @@ def get_gpu_ids():
raise Exception("ray.get_gpu_ids() currently does not work in PYTHON "
"MODE.")
if not global_worker.use_raylet:
assigned_ids = global_worker.local_scheduler_client.gpu_ids()
else:
all_resource_ids = global_worker.local_scheduler_client.resource_ids()
assigned_ids = [
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
]
all_resource_ids = global_worker.local_scheduler_client.resource_ids()
assigned_ids = [
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
]
# If the user had already set CUDA_VISIBLE_DEVICES, then respect that (in
# the sense that only GPU IDs that appear in CUDA_VISIBLE_DEVICES should be
# returned).
@@ -1019,17 +973,11 @@ def get_gpu_ids():
def get_resource_ids():
"""Get the IDs of the resources that are available to the worker.
This function is only supported in the raylet code path.
Returns:
A dictionary mapping the name of a resource to a list of pairs, where
each pair consists of the ID of a resource and the fraction of that
resource reserved for this worker.
"""
if not global_worker.use_raylet:
raise Exception("ray.get_resource_ids() is only supported in the "
"raylet code path.")
if _mode() == LOCAL_MODE:
raise Exception(
"ray.get_resource_ids() currently does not work in PYTHON "
@@ -1112,22 +1060,8 @@ def error_applies_to_driver(error_key, worker=global_worker):
def error_info(worker=global_worker):
"""Return information about failed tasks."""
worker.check_connected()
if worker.use_raylet:
return (global_state.error_messages(job_id=worker.task_driver_id) +
global_state.error_messages(job_id=ray_constants.NIL_JOB_ID))
error_keys = worker.redis_client.lrange("ErrorKeys", 0, -1)
errors = []
for error_key in error_keys:
if error_applies_to_driver(error_key, worker=worker):
error_contents = worker.redis_client.hgetall(error_key)
error_contents = {
"type": ray.utils.decode(error_contents[b"type"]),
"message": ray.utils.decode(error_contents[b"message"]),
"data": ray.utils.decode(error_contents[b"data"])
}
errors.append(error_contents)
return errors
return (global_state.error_messages(job_id=worker.task_driver_id) +
global_state.error_messages(job_id=ray_constants.NIL_JOB_ID))
def _initialize_serialization(driver_id, worker=global_worker):
@@ -1223,7 +1157,6 @@ def _initialize_serialization(driver_id, worker=global_worker):
def get_address_info_from_redis_helper(redis_address,
node_ip_address,
use_raylet=True,
redis_password=None):
redis_ip_address, redis_port = redis_address.split(":")
# For this command to work, some other client (on the same machine as
@@ -1231,118 +1164,50 @@ def get_address_info_from_redis_helper(redis_address,
redis_client = redis.StrictRedis(
host=redis_ip_address, port=int(redis_port), password=redis_password)
if not use_raylet:
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
client_keys = redis_client.keys("{}*".format(
ray.gcs_utils.DB_CLIENT_PREFIX))
# Filter to live clients on the same node and do some basic checking.
plasma_managers = []
local_schedulers = []
for key in client_keys:
info = redis_client.hgetall(key)
# Ignore clients that were deleted.
deleted = info[b"deleted"]
deleted = bool(int(deleted))
if deleted:
continue
assert b"ray_client_id" in info
assert b"node_ip_address" in info
assert b"client_type" in info
client_node_ip_address = ray.utils.decode(info[b"node_ip_address"])
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1"
and redis_ip_address == ray.services.get_node_ip_address())):
if ray.utils.decode(info[b"client_type"]) == "plasma_manager":
plasma_managers.append(info)
elif (ray.utils.decode(
info[b"client_type"]) == "local_scheduler"):
local_schedulers.append(info)
# Make sure that we got at least one plasma manager and local
# scheduler.
assert len(plasma_managers) >= 1
assert len(local_schedulers) >= 1
# Build the address information.
object_store_addresses = []
for manager in plasma_managers:
address = ray.utils.decode(manager[b"manager_address"])
port = services.get_port(address)
object_store_addresses.append(
services.ObjectStoreAddress(
name=ray.utils.decode(manager[b"store_socket_name"]),
manager_name=ray.utils.decode(
manager[b"manager_socket_name"]),
manager_port=port))
scheduler_names = [
ray.utils.decode(scheduler[b"local_scheduler_socket_name"])
for scheduler in local_schedulers
]
client_info = {
"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"local_scheduler_socket_names": scheduler_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)
}
return client_info
# Handle the raylet case.
else:
# In the raylet code path, all client data is stored in a zset at the
# key for the nil client.
client_key = b"CLIENT" + NIL_CLIENT_ID
clients = redis_client.zrange(client_key, 0, -1)
raylets = []
for client_message in clients:
client = ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
client_message, 0)
client_node_ip_address = ray.utils.decode(
client.NodeManagerAddress())
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1"
and redis_ip_address == ray.services.get_node_ip_address())):
raylets.append(client)
# Make sure that at least one raylet has started locally.
# This handles a race condition where Redis has started but
# the raylet has not connected.
if len(raylets) == 0:
raise Exception(
"Redis has started but no raylets have registered yet.")
object_store_addresses = [
services.ObjectStoreAddress(
name=ray.utils.decode(raylet.ObjectStoreSocketName()),
manager_name=None,
manager_port=None) for raylet in raylets
]
raylet_socket_names = [
ray.utils.decode(raylet.RayletSocketName()) for raylet in raylets
]
return {
"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"raylet_socket_names": raylet_socket_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)
}
# In the raylet code path, all client data is stored in a zset at the
# key for the nil client.
client_key = b"CLIENT" + NIL_CLIENT_ID
clients = redis_client.zrange(client_key, 0, -1)
raylets = []
for client_message in clients:
client = ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
client_message, 0)
client_node_ip_address = ray.utils.decode(client.NodeManagerAddress())
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1"
and redis_ip_address == ray.services.get_node_ip_address())):
raylets.append(client)
# Make sure that at least one raylet has started locally.
# This handles a race condition where Redis has started but
# the raylet has not connected.
if len(raylets) == 0:
raise Exception(
"Redis has started but no raylets have registered yet.")
object_store_addresses = [
ray.utils.decode(raylet.ObjectStoreSocketName()) for raylet in raylets
]
raylet_socket_names = [
ray.utils.decode(raylet.RayletSocketName()) for raylet in raylets
]
return {
"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"raylet_socket_names": raylet_socket_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)
}
def get_address_info_from_redis(redis_address,
node_ip_address,
num_retries=5,
use_raylet=True,
redis_password=None):
counter = 0
while True:
try:
return get_address_info_from_redis_helper(
redis_address,
node_ip_address,
use_raylet=use_raylet,
redis_password=redis_password)
redis_address, node_ip_address, redis_password=redis_password)
except Exception:
if counter == num_retries:
raise
@@ -1414,7 +1279,6 @@ def _init(address_info=None,
plasma_directory=None,
huge_pages=False,
include_webui=True,
use_raylet=None,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
@@ -1474,7 +1338,6 @@ def _init(address_info=None,
Store with hugetlbfs support. Requires plasma_directory.
include_webui: Boolean flag indicating whether to start the web
UI, which is a Jupyter notebook.
use_raylet: True if the new raylet code path should be used.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
@@ -1497,16 +1360,6 @@ def _init(address_info=None,
else:
driver_mode = SCRIPT_MODE
if use_raylet is None:
if os.environ.get("RAY_USE_XRAY") == "0":
# This environment variable is used in our testing setup.
logger.info("Detected environment variable 'RAY_USE_XRAY' with "
"value {}. This turns OFF xray.".format(
os.environ.get("RAY_USE_XRAY")))
use_raylet = False
else:
use_raylet = True
# Get addresses of existing services.
if address_info is None:
address_info = {}
@@ -1561,7 +1414,6 @@ def _init(address_info=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
include_webui=include_webui,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
@@ -1610,10 +1462,7 @@ def _init(address_info=None,
node_ip_address = services.get_node_ip_address(redis_address)
# Get the address info of the processes to connect to from Redis.
address_info = get_address_info_from_redis(
redis_address,
node_ip_address,
use_raylet=use_raylet,
redis_password=redis_password)
redis_address, node_ip_address, redis_password=redis_password)
# Connect this driver to Redis, the object store, and the local scheduler.
# Choose the first object store and local scheduler if there are multiple.
@@ -1625,18 +1474,11 @@ def _init(address_info=None,
driver_address_info = {
"node_ip_address": node_ip_address,
"redis_address": address_info["redis_address"],
"store_socket_name": (
address_info["object_store_addresses"][0].name),
"store_socket_name": address_info["object_store_addresses"][0],
"webui_url": address_info["webui_url"]
}
if not use_raylet:
driver_address_info["manager_socket_name"] = (
address_info["object_store_addresses"][0].manager_name)
driver_address_info["local_scheduler_socket_name"] = (
address_info["local_scheduler_socket_names"][0])
else:
driver_address_info["raylet_socket_name"] = (
address_info["raylet_socket_names"][0])
driver_address_info["raylet_socket_name"] = (
address_info["raylet_socket_names"][0])
# We only pass `temp_dir` to a worker (WORKER_MODE).
# It can't be a worker here.
@@ -1645,7 +1487,6 @@ def _init(address_info=None,
object_id_seed=object_id_seed,
mode=driver_mode,
worker=global_worker,
use_raylet=use_raylet,
redis_password=redis_password)
return address_info
@@ -1669,7 +1510,6 @@ def init(redis_address=None,
plasma_directory=None,
huge_pages=False,
include_webui=True,
use_raylet=None,
configure_logging=True,
logging_level=logging.INFO,
logging_format=ray_constants.LOGGER_FORMAT,
@@ -1736,7 +1576,6 @@ def init(redis_address=None,
Store with hugetlbfs support. Requires plasma_directory.
include_webui: Boolean flag indicating whether to start the web
UI, which is a Jupyter notebook.
use_raylet: True if the new raylet code path should be used.
configure_logging: True if logging should be configured here.
Otherwise, users may want to configure it on their own.
logging_level: Logging level; the default is logging.INFO.
@@ -1767,22 +1606,6 @@ def init(redis_address=None,
else:
raise Exception("Perhaps you called ray.init twice by accident?")
if use_raylet is None:
if os.environ.get("RAY_USE_XRAY") == "0":
# This environment variable is used in our testing setup.
logger.info("Detected environment variable 'RAY_USE_XRAY' with "
"value {}. This turns OFF xray.".format(
os.environ.get("RAY_USE_XRAY")))
use_raylet = False
else:
use_raylet = True
if not use_raylet and redis_password is not None:
raise Exception("Setting the 'redis_password' argument is not "
"supported in legacy Ray. To run Ray with "
"password-protected Redis ports, set "
"'use_raylet=True'.")
# Convert hostnames to numerical IP address.
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
@@ -1809,7 +1632,6 @@ def init(redis_address=None,
huge_pages=huge_pages,
include_webui=include_webui,
object_store_memory=object_store_memory,
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
@@ -1887,9 +1709,6 @@ def print_error_messages_raylet(worker):
This runs in a separate thread on the driver and prints error messages in
the background.
"""
if not worker.use_raylet:
raise Exception("This function is specific to the raylet code path.")
worker.error_message_pubsub_client = worker.redis_client.pubsub(
ignore_subscribe_messages=True)
# Exports that are published after the call to
@@ -2004,7 +1823,6 @@ def connect(info,
object_id_seed=None,
mode=WORKER_MODE,
worker=global_worker,
use_raylet=True,
redis_password=None):
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
@@ -2015,7 +1833,6 @@ def connect(info,
deterministic.
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, and
LOCAL_MODE.
use_raylet: True if the new raylet code path should be used.
redis_password (str): Prevents external clients without the password
from connecting to Redis if provided.
"""
@@ -2038,7 +1855,6 @@ def connect(info,
worker.actor_id = NIL_ACTOR_ID
worker.connected = True
worker.set_mode(mode)
worker.use_raylet = use_raylet
# If running Ray in LOCAL_MODE, there is no need to call create_worker
# or to start the worker service.
@@ -2067,7 +1883,6 @@ def connect(info,
traceback_str = traceback.format_exc()
ray.utils.push_error_to_driver_through_redis(
worker.redis_client,
worker.use_raylet,
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
traceback_str,
driver_id=None)
@@ -2108,7 +1923,6 @@ def connect(info,
"driver_id": worker.worker_id,
"start_time": time.time(),
"plasma_store_socket": info["store_socket_name"],
"plasma_manager_socket": info.get("manager_socket_name"),
"local_scheduler_socket": info.get("local_scheduler_socket_name"),
"raylet_socket": info.get("raylet_socket_name")
}
@@ -2123,7 +1937,6 @@ def connect(info,
worker_dict = {
"node_ip_address": worker.node_ip_address,
"plasma_store_socket": info["store_socket_name"],
"plasma_manager_socket": info["manager_socket_name"],
"local_scheduler_socket": info["local_scheduler_socket_name"]
}
if redirect_worker_output:
@@ -2135,18 +1948,10 @@ def connect(info,
raise Exception("This code should be unreachable.")
# Create an object store client.
if not worker.use_raylet:
worker.plasma_client = thread_safe_client(
plasma.connect(info["store_socket_name"],
info["manager_socket_name"], 64))
else:
worker.plasma_client = thread_safe_client(
plasma.connect(info["store_socket_name"], "", 64))
worker.plasma_client = thread_safe_client(
plasma.connect(info["store_socket_name"], "", 64))
if not worker.use_raylet:
local_scheduler_socket = info["local_scheduler_socket_name"]
else:
local_scheduler_socket = info["raylet_socket_name"]
local_scheduler_socket = info["raylet_socket_name"]
# If this is a driver, set the current task ID, the task driver ID, and set
# the task index to 0.
@@ -2177,28 +1982,22 @@ def connect(info,
# rerun the driver.
nil_actor_counter = 0
driver_task = ray.local_scheduler.Task(
worker.task_driver_id, ray.ObjectID(NIL_FUNCTION_ID), [], 0,
worker.current_task_id, worker.task_index,
ray.ObjectID(NIL_ACTOR_ID), ray.ObjectID(NIL_ACTOR_ID),
ray.ObjectID(NIL_ACTOR_ID), ray.ObjectID(NIL_ACTOR_ID),
nil_actor_counter, False, [], {"CPU": 0}, {}, worker.use_raylet)
driver_task = ray.raylet.Task(worker.task_driver_id,
ray.ObjectID(NIL_FUNCTION_ID), [], 0,
worker.current_task_id,
worker.task_index,
ray.ObjectID(NIL_ACTOR_ID),
ray.ObjectID(NIL_ACTOR_ID),
ray.ObjectID(NIL_ACTOR_ID),
ray.ObjectID(NIL_ACTOR_ID),
nil_actor_counter, [], {"CPU": 0}, {})
# Add the driver task to the task table.
if not worker.use_raylet:
global_state._execute_command(
driver_task.task_id(), "RAY.TASK_TABLE_ADD",
driver_task.task_id().id(), TASK_STATUS_RUNNING,
NIL_LOCAL_SCHEDULER_ID,
driver_task.execution_dependencies_string(), 0,
ray.local_scheduler.task_to_string(driver_task))
else:
global_state._execute_command(
driver_task.task_id(), "RAY.TABLE_ADD",
ray.gcs_utils.TablePrefix.RAYLET_TASK,
ray.gcs_utils.TablePubsub.RAYLET_TASK,
driver_task.task_id().id(),
driver_task._serialized_raylet_task())
global_state._execute_command(driver_task.task_id(), "RAY.TABLE_ADD",
ray.gcs_utils.TablePrefix.RAYLET_TASK,
ray.gcs_utils.TablePubsub.RAYLET_TASK,
driver_task.task_id().id(),
driver_task._serialized_raylet_task())
# Set the driver's current task ID to the task ID assigned to the
# driver task.
@@ -2207,9 +2006,9 @@ def connect(info,
# A non-driver worker begins without an assigned task.
worker.current_task_id = ray.ObjectID(NIL_ID)
worker.local_scheduler_client = ray.local_scheduler.LocalSchedulerClient(
worker.local_scheduler_client = ray.raylet.LocalSchedulerClient(
local_scheduler_socket, worker.worker_id, is_worker,
worker.current_task_id, worker.use_raylet)
worker.current_task_id)
# Start the import thread
import_thread.ImportThread(worker, mode).start()
@@ -2221,16 +2020,10 @@ def connect(info,
# temporarily using this implementation which constantly queries the
# scheduler for new error messages.
if mode == SCRIPT_MODE:
if not worker.use_raylet:
t = threading.Thread(
target=print_error_messages,
name="ray_print_error_messages",
args=(worker, ))
else:
t = threading.Thread(
target=print_error_messages_raylet,
name="ray_print_error_messages",
args=(worker, ))
t = threading.Thread(
target=print_error_messages_raylet,
name="ray_print_error_messages",
args=(worker, ))
# Making the thread a daemon causes it to exit when the main thread
# exits.
t.daemon = True
@@ -2238,7 +2031,7 @@ def connect(info,
# If we are using the raylet code path and we are not in local mode, start
# a background thread to periodically flush profiling data to the GCS.
if mode != LOCAL_MODE and worker.use_raylet:
if mode != LOCAL_MODE:
worker.profiler.start_flush_thread()
if mode == SCRIPT_MODE:
@@ -2395,6 +2188,9 @@ def register_custom_serializer(cls,
# worker and not across workers.
class_id = random_string()
# Make sure class_id is a string.
class_id = ray.utils.binary_to_hex(class_id)
if driver_id is None:
driver_id_bytes = worker.task_driver_id.id()
else:
@@ -2481,7 +2277,7 @@ def put(value, worker=global_worker):
# In LOCAL_MODE, ray.put is the identity operation.
return value
object_id = worker.local_scheduler_client.compute_put_id(
worker.current_task_id, worker.put_index, worker.use_raylet)
worker.current_task_id, worker.put_index)
worker.put_object(object_id, value)
worker.put_index += 1
return object_id
@@ -2554,21 +2350,8 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker):
raise Exception("num_returns cannot be greater than the number "
"of objects provided to ray.wait.")
timeout = timeout if timeout is not None else 2**30
if worker.use_raylet:
ready_ids, remaining_ids = worker.local_scheduler_client.wait(
object_ids, num_returns, timeout, False)
else:
object_id_strs = [
plasma.ObjectID(object_id.id()) for object_id in object_ids
]
ready_ids, remaining_ids = worker.plasma_client.wait(
object_id_strs, timeout, num_returns)
ready_ids = [
ray.ObjectID(object_id.binary()) for object_id in ready_ids
]
remaining_ids = [
ray.ObjectID(object_id.binary()) for object_id in remaining_ids
]
ready_ids, remaining_ids = worker.local_scheduler_client.wait(
object_ids, num_returns, timeout, False)
return ready_ids, remaining_ids
+1 -4
View File
@@ -88,10 +88,7 @@ if __name__ == "__main__":
tempfile_services.set_temp_root(args.temp_dir)
ray.worker.connect(
info,
mode=ray.WORKER_MODE,
use_raylet=(args.raylet_name is not None),
redis_password=args.redis_password)
info, mode=ray.WORKER_MODE, redis_password=args.redis_password)
error_explanation = """
This error is unexpected and should not have happened. Somehow a worker
+3 -6
View File
@@ -19,13 +19,10 @@ import setuptools.command.build_ext as _build_ext
# NOTE: The lists below must be kept in sync with ray/CMakeLists.txt.
ray_files = [
"ray/core/src/common/thirdparty/redis/src/redis-server",
"ray/core/src/common/redis_module/libray_redis_module.so",
"ray/core/src/ray/thirdparty/redis/src/redis-server",
"ray/core/src/ray/gcs/redis_module/libray_redis_module.so",
"ray/core/src/plasma/plasma_store_server",
"ray/core/src/plasma/plasma_manager",
"ray/core/src/local_scheduler/local_scheduler",
"ray/core/src/local_scheduler/liblocal_scheduler_library_python.so",
"ray/core/src/global_scheduler/global_scheduler",
"ray/core/src/ray/raylet/liblocal_scheduler_library_python.so",
"ray/core/src/ray/raylet/raylet_monitor", "ray/core/src/ray/raylet/raylet",
"ray/WebUI.ipynb"
]