mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 05:01:48 +08:00
Remove task context from python worker (#5987)
Removes duplicated state between the python and C++ workers. Also cleans up the serialization codepaths a bit.
This commit is contained in:
+101
-129
@@ -75,6 +75,7 @@ from ray.includes.ray_config cimport RayConfig
|
||||
|
||||
import ray
|
||||
import ray.experimental.signal as ray_signal
|
||||
import ray.memory_monitor as memory_monitor
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray import profiling
|
||||
from ray.exceptions import (
|
||||
@@ -178,13 +179,6 @@ cdef c_vector[CObjectID] ObjectIDsToVector(object_ids):
|
||||
return result
|
||||
|
||||
|
||||
def compute_put_id(TaskID task_id, int64_t put_index):
|
||||
if put_index < 1 or put_index > <int64_t>CObjectID.MaxObjectIndex():
|
||||
raise ValueError("The range of 'put_index' should be [1, %d]"
|
||||
% CObjectID.MaxObjectIndex())
|
||||
return ObjectID(CObjectID.ForPut(task_id.native(), put_index, 0).Binary())
|
||||
|
||||
|
||||
def compute_task_id(ObjectID object_id):
|
||||
return TaskID(object_id.native().TaskId().Binary())
|
||||
|
||||
@@ -460,26 +454,6 @@ cdef deserialize_args(
|
||||
|
||||
return ray.signature.recover_args(args)
|
||||
|
||||
cdef _check_worker_state(worker, CTaskType task_type, JobID job_id):
|
||||
assert worker.current_task_id.is_nil()
|
||||
assert worker.task_context.task_index == 0
|
||||
assert worker.task_context.put_index == 1
|
||||
|
||||
# If this worker is not an actor, check that `current_job_id`
|
||||
# was reset when the worker finished the previous task.
|
||||
if <int>task_type in [<int>TASK_TYPE_NORMAL_TASK,
|
||||
<int>TASK_TYPE_ACTOR_CREATION_TASK]:
|
||||
assert worker.current_job_id.is_nil()
|
||||
# Set the driver ID of the current running task. This is
|
||||
# needed so that if the task throws an exception, we propagate
|
||||
# the error message to the correct driver.
|
||||
worker.current_job_id = job_id
|
||||
else:
|
||||
# If this worker is an actor, current_job_id wasn't reset.
|
||||
# Check that current task's driver ID equals the previous
|
||||
# one.
|
||||
assert worker.current_job_id == job_id
|
||||
|
||||
|
||||
cdef _store_task_outputs(worker, return_ids, outputs):
|
||||
for i in range(len(return_ids)):
|
||||
@@ -494,14 +468,12 @@ cdef _store_task_outputs(worker, return_ids, outputs):
|
||||
"from a remote function, but the corresponding "
|
||||
"ObjectID does not exist in the local object store.")
|
||||
else:
|
||||
worker.put_object(return_id, output)
|
||||
worker.put_object(output, object_id=return_id)
|
||||
|
||||
|
||||
cdef execute_task(
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const CJobID &c_job_id,
|
||||
const CActorID &c_actor_id,
|
||||
const unordered_map[c_string, double] &c_resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
@@ -510,14 +482,10 @@ cdef execute_task(
|
||||
|
||||
worker = ray.worker.global_worker
|
||||
|
||||
actor_id = ActorID(c_actor_id.Binary())
|
||||
job_id = JobID(c_job_id.Binary())
|
||||
actor_id = worker.core_worker.get_actor_id()
|
||||
job_id = worker.core_worker.get_current_job_id()
|
||||
task_id = worker.core_worker.get_current_task_id()
|
||||
|
||||
# Check that the worker is in the expected state to execute the task.
|
||||
_check_worker_state(worker, task_type, job_id)
|
||||
worker.task_context.current_task_id = task_id
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
|
||||
|
||||
@@ -525,7 +493,6 @@ cdef execute_task(
|
||||
ray_function.GetFunctionDescriptor())
|
||||
|
||||
if <int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK:
|
||||
worker.actor_id = actor_id
|
||||
actor_class = worker.function_actor_manager.load_actor_class(
|
||||
job_id, function_descriptor)
|
||||
worker.actors[actor_id] = actor_class.__new__(actor_class)
|
||||
@@ -556,7 +523,7 @@ cdef execute_task(
|
||||
ray_constants.from_memory_units(
|
||||
dereference(c_resources.find(b"memory")).second))
|
||||
if c_resources.find(b"object_store_memory") != c_resources.end():
|
||||
worker._set_object_store_client_options(
|
||||
worker.core_worker.set_object_store_client_options(
|
||||
worker_name,
|
||||
int(ray_constants.from_memory_units(
|
||||
dereference(
|
||||
@@ -613,19 +580,10 @@ cdef execute_task(
|
||||
# Send signal with the error.
|
||||
ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
|
||||
|
||||
# Reset the state fields so the next task can run.
|
||||
worker.task_context.current_task_id = TaskID.nil()
|
||||
worker.core_worker.set_current_task_id(TaskID.nil())
|
||||
worker.task_context.task_index = 0
|
||||
worker.task_context.put_index = 1
|
||||
|
||||
# Don't need to reset `current_job_id` if the worker is an
|
||||
# actor. Because the following tasks should all have the
|
||||
# same driver id.
|
||||
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
|
||||
worker.current_job_id = JobID.nil()
|
||||
worker.core_worker.set_current_job_id(JobID.nil())
|
||||
|
||||
# Reset signal counters so that the next task can get
|
||||
# all past signals.
|
||||
ray_signal.reset()
|
||||
@@ -646,8 +604,6 @@ cdef execute_task(
|
||||
cdef CRayStatus task_execution_handler(
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const CJobID &c_job_id,
|
||||
const CActorID &c_actor_id,
|
||||
const unordered_map[c_string, double] &c_resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
@@ -658,8 +614,7 @@ cdef CRayStatus task_execution_handler(
|
||||
try:
|
||||
# The call to execute_task should never raise an exception. If it
|
||||
# does, that indicates that there was an unexpected internal error.
|
||||
execute_task(task_type, ray_function, c_job_id,
|
||||
c_actor_id, c_resources, c_args,
|
||||
execute_task(task_type, ray_function, c_resources, c_args,
|
||||
c_arg_reference_ids, c_return_ids, returns)
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + (
|
||||
@@ -715,22 +670,11 @@ cdef class CoreWorker:
|
||||
def get_current_task_id(self):
|
||||
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
|
||||
|
||||
def set_current_task_id(self, TaskID task_id):
|
||||
cdef:
|
||||
CTaskID c_task_id = task_id.native()
|
||||
|
||||
with nogil:
|
||||
self.core_worker.get().SetCurrentTaskId(c_task_id)
|
||||
|
||||
def get_current_job_id(self):
|
||||
return JobID(self.core_worker.get().GetCurrentJobId().Binary())
|
||||
|
||||
def set_current_job_id(self, JobID job_id):
|
||||
cdef:
|
||||
CJobID c_job_id = job_id.native()
|
||||
|
||||
with nogil:
|
||||
self.core_worker.get().SetCurrentJobId(c_job_id)
|
||||
def get_actor_id(self):
|
||||
return ActorID(self.core_worker.get().GetActorId().Binary())
|
||||
|
||||
def get_objects(self, object_ids, TaskID current_task_id,
|
||||
int64_t timeout_ms=-1):
|
||||
@@ -756,87 +700,108 @@ cdef class CoreWorker:
|
||||
|
||||
return has_object
|
||||
|
||||
def put_serialized_object(self, serialized_object, ObjectID object_id,
|
||||
int memcopy_threads=6):
|
||||
cdef:
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata
|
||||
CObjectID c_object_id = object_id.native()
|
||||
size_t data_size
|
||||
|
||||
data_size = serialized_object.total_bytes
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Create(
|
||||
metadata, data_size, c_object_id, &data))
|
||||
cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata,
|
||||
size_t data_size, ObjectID object_id,
|
||||
CObjectID *c_object_id, shared_ptr[CBuffer] *data):
|
||||
delay = ray_constants.DEFAULT_PUT_OBJECT_DELAY
|
||||
for attempt in reversed(
|
||||
range(ray_constants.DEFAULT_PUT_OBJECT_RETRIES)):
|
||||
try:
|
||||
if object_id is None:
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Create(
|
||||
metadata, data_size, c_object_id, data))
|
||||
else:
|
||||
c_object_id[0] = object_id.native()
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Create(
|
||||
metadata, data_size, c_object_id[0], data))
|
||||
break
|
||||
except ObjectStoreFullError as e:
|
||||
if attempt:
|
||||
logger.warning("Waiting {} seconds for space to free up "
|
||||
"in the object store.".format(delay))
|
||||
time.sleep(delay)
|
||||
delay *= 2
|
||||
else:
|
||||
self.dump_object_store_memory_usage()
|
||||
raise e
|
||||
|
||||
# If data is nullptr, that means the ObjectID already existed,
|
||||
# which we ignore.
|
||||
# TODO(edoakes): this is hacky, we should return the error instead
|
||||
# and deal with it here.
|
||||
if not data:
|
||||
return
|
||||
return data.get() == NULL
|
||||
|
||||
stream = pyarrow.FixedSizeBufferWriter(
|
||||
pyarrow.py_buffer(Buffer.make(data)))
|
||||
stream.set_memcopy_threads(memcopy_threads)
|
||||
serialized_object.write_to(stream)
|
||||
def put_serialized_object(self, serialized_object, ObjectID object_id=None,
|
||||
int memcopy_threads=6):
|
||||
cdef:
|
||||
CObjectID c_object_id
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Seal(c_object_id))
|
||||
object_already_exists = self._create_put_buffer(
|
||||
metadata, serialized_object.total_bytes,
|
||||
object_id, &c_object_id, &data)
|
||||
if not object_already_exists:
|
||||
stream = pyarrow.FixedSizeBufferWriter(
|
||||
pyarrow.py_buffer(Buffer.make(data)))
|
||||
stream.set_memcopy_threads(memcopy_threads)
|
||||
serialized_object.write_to(stream)
|
||||
|
||||
def put_raw_buffer(self, c_string value, ObjectID object_id,
|
||||
with nogil:
|
||||
check_status(
|
||||
self.core_worker.get().Objects().Seal(c_object_id))
|
||||
|
||||
return ObjectID(c_object_id.Binary())
|
||||
|
||||
def put_raw_buffer(self, c_string value, ObjectID object_id=None,
|
||||
int memcopy_threads=6):
|
||||
cdef:
|
||||
c_string metadata_str = RAW_BUFFER_METADATA
|
||||
CObjectID c_object_id = object_id.native()
|
||||
CObjectID c_object_id
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata = dynamic_pointer_cast[
|
||||
CBuffer, LocalMemoryBuffer](
|
||||
make_shared[LocalMemoryBuffer](
|
||||
<uint8_t*>(metadata_str.data()), metadata_str.size()))
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Create(
|
||||
metadata, value.size(), c_object_id, &data))
|
||||
object_already_exists = self._create_put_buffer(
|
||||
metadata, value.size(), object_id, &c_object_id, &data)
|
||||
if not object_already_exists:
|
||||
stream = pyarrow.FixedSizeBufferWriter(
|
||||
pyarrow.py_buffer(Buffer.make(data)))
|
||||
stream.set_memcopy_threads(memcopy_threads)
|
||||
stream.write(pyarrow.py_buffer(value))
|
||||
|
||||
stream = pyarrow.FixedSizeBufferWriter(
|
||||
pyarrow.py_buffer(Buffer.make(data)))
|
||||
stream.set_memcopy_threads(memcopy_threads)
|
||||
stream.write(pyarrow.py_buffer(value))
|
||||
with nogil:
|
||||
check_status(
|
||||
self.core_worker.get().Objects().Seal(c_object_id))
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Seal(c_object_id))
|
||||
return ObjectID(c_object_id.Binary())
|
||||
|
||||
def put_pickle5_buffers(self, ObjectID object_id, c_string inband,
|
||||
Pickle5Writer writer,
|
||||
int memcopy_threads):
|
||||
def put_pickle5_buffers(self, c_string inband,
|
||||
Pickle5Writer writer, ObjectID object_id=None,
|
||||
int memcopy_threads=6):
|
||||
cdef:
|
||||
shared_ptr[CBuffer] data
|
||||
CObjectID c_object_id
|
||||
c_string metadata_str = PICKLE5_BUFFER_METADATA
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata = dynamic_pointer_cast[
|
||||
CBuffer, LocalMemoryBuffer](
|
||||
make_shared[LocalMemoryBuffer](
|
||||
<uint8_t*>(metadata_str.data()), metadata_str.size()))
|
||||
CObjectID c_object_id = object_id.native()
|
||||
size_t data_size
|
||||
|
||||
data_size = writer.get_total_bytes(inband)
|
||||
object_already_exists = self._create_put_buffer(
|
||||
metadata, writer.get_total_bytes(inband),
|
||||
object_id, &c_object_id, &data)
|
||||
if not object_already_exists:
|
||||
writer.write_to(inband, data, memcopy_threads)
|
||||
with nogil:
|
||||
check_status(
|
||||
self.core_worker.get().Objects().Seal(c_object_id))
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Create(
|
||||
metadata, data_size, c_object_id, &data))
|
||||
|
||||
# If data is nullptr, that means the ObjectID already existed,
|
||||
# which we ignore.
|
||||
# TODO(edoakes): this is hacky, we should return the error instead
|
||||
# and deal with it here.
|
||||
if not data:
|
||||
return
|
||||
|
||||
writer.write_to(inband, data, memcopy_threads)
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Seal(c_object_id))
|
||||
return ObjectID(c_object_id.Binary())
|
||||
|
||||
def wait(self, object_ids, int num_returns, int64_t timeout_ms,
|
||||
TaskID current_task_id):
|
||||
@@ -860,7 +825,7 @@ cdef class CoreWorker:
|
||||
else:
|
||||
not_ready.append(object_id)
|
||||
|
||||
return (ready, not_ready)
|
||||
return ready, not_ready
|
||||
|
||||
def free_objects(self, object_ids, c_bool local_only,
|
||||
c_bool delete_creating_tasks):
|
||||
@@ -871,20 +836,27 @@ cdef class CoreWorker:
|
||||
check_status(self.core_worker.get().Objects().Delete(
|
||||
free_ids, local_only, delete_creating_tasks))
|
||||
|
||||
def set_object_store_client_options(self, c_string client_name,
|
||||
def set_object_store_client_options(self, client_name,
|
||||
int64_t limit_bytes):
|
||||
with nogil:
|
||||
try:
|
||||
logger.debug("Setting plasma memory limit to {} for {}".format(
|
||||
limit_bytes, client_name))
|
||||
check_status(self.core_worker.get().Objects().SetClientOptions(
|
||||
client_name, limit_bytes))
|
||||
client_name.encode("ascii"), limit_bytes))
|
||||
except RayError as e:
|
||||
self.dump_object_store_memory_usage()
|
||||
raise memory_monitor.RayOutOfMemoryError(
|
||||
"Failed to set object_store_memory={} for {}. The "
|
||||
"plasma store may have insufficient memory remaining "
|
||||
"to satisfy this limit (30% of object store memory is "
|
||||
"permanently reserved for shared usage). The current "
|
||||
"object store memory status is:\n\n{}".format(
|
||||
limit_bytes, client_name, e))
|
||||
|
||||
def object_store_memory_usage_string(self):
|
||||
cdef:
|
||||
c_string message
|
||||
|
||||
with nogil:
|
||||
message = self.core_worker.get().Objects().MemoryUsageString()
|
||||
|
||||
return message.decode("utf-8")
|
||||
def dump_object_store_memory_usage(self):
|
||||
message = self.core_worker.get().Objects().MemoryUsageString()
|
||||
logger.warning("Local object store memory usage:\n{}\n".format(
|
||||
message.decode("utf-8")))
|
||||
|
||||
def submit_task(self,
|
||||
function_descriptor,
|
||||
|
||||
@@ -3,7 +3,7 @@ from collections import defaultdict, deque
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.experimental.serve.utils import get_custom_object_id, logger
|
||||
from ray.experimental.serve.utils import logger
|
||||
|
||||
|
||||
class Query:
|
||||
@@ -17,7 +17,7 @@ class Query:
|
||||
self.request_context = request_context
|
||||
|
||||
if result_object_id is None:
|
||||
self.result_object_id = get_custom_object_id()
|
||||
self.result_object_id = ray.ObjectID.from_random()
|
||||
else:
|
||||
self.result_object_id = result_object_id
|
||||
|
||||
@@ -25,7 +25,7 @@ class Query:
|
||||
class WorkIntent:
|
||||
def __init__(self, work_object_id=None):
|
||||
if work_object_id is None:
|
||||
self.work_object_id = get_custom_object_id()
|
||||
self.work_object_id = ray.ObjectID.from_random()
|
||||
else:
|
||||
self.work_object_id = work_object_id
|
||||
|
||||
@@ -160,7 +160,7 @@ class CentralizedQueues:
|
||||
work_queue.popleft(),
|
||||
)
|
||||
ray.worker.global_worker.put_object(
|
||||
work.work_object_id, request)
|
||||
request, work.work_object_id)
|
||||
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -123,12 +123,12 @@ class RayServeMixin:
|
||||
start_timestamp = time.time()
|
||||
try:
|
||||
result = self.__call__(*args, **kwargs)
|
||||
ray.worker.global_worker.put_object(result_object_id, result)
|
||||
ray.worker.global_worker.put_object(result, result_object_id)
|
||||
except Exception as e:
|
||||
wrapped_exception = wrap_to_ray_error(e)
|
||||
self._serve_metric_error_counter += 1
|
||||
ray.worker.global_worker.put_object(result_object_id,
|
||||
wrapped_exception)
|
||||
ray.worker.global_worker.put_object(wrapped_exception,
|
||||
result_object_id)
|
||||
self._serve_metric_latency_list.append(time.time() - start_timestamp)
|
||||
|
||||
serve_context.web = False
|
||||
|
||||
@@ -12,7 +12,7 @@ def test_single_prod_cons_queue(serve_instance):
|
||||
assert got_work.request_args == 1
|
||||
assert got_work.request_kwargs == "kwargs"
|
||||
|
||||
ray.worker.global_worker.put_object(got_work.result_object_id, 2)
|
||||
ray.worker.global_worker.put_object(2, got_work.result_object_id)
|
||||
assert ray.get(ray.ObjectID(result_object_id)) == 2
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_alter_backend(serve_instance):
|
||||
work_object_id = q.dequeue_request("backend-1")
|
||||
got_work = ray.get(ray.ObjectID(work_object_id))
|
||||
assert got_work.request_args == 1
|
||||
ray.worker.global_worker.put_object(got_work.result_object_id, 2)
|
||||
ray.worker.global_worker.put_object(2, got_work.result_object_id)
|
||||
assert ray.get(ray.ObjectID(result_object_id)) == 2
|
||||
|
||||
q.set_traffic("svc", {"backend-2": 1})
|
||||
@@ -32,7 +32,7 @@ def test_alter_backend(serve_instance):
|
||||
work_object_id = q.dequeue_request("backend-2")
|
||||
got_work = ray.get(ray.ObjectID(work_object_id))
|
||||
assert got_work.request_args == 1
|
||||
ray.worker.global_worker.put_object(got_work.result_object_id, 2)
|
||||
ray.worker.global_worker.put_object(2, got_work.result_object_id)
|
||||
assert ray.get(ray.ObjectID(result_object_id)) == 2
|
||||
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@ import logging
|
||||
|
||||
from pygments import formatters, highlight, lexers
|
||||
|
||||
import ray
|
||||
|
||||
|
||||
def _get_logger():
|
||||
logger = logging.getLogger("ray.serve")
|
||||
@@ -32,15 +30,6 @@ class BytesEncoder(json.JSONEncoder):
|
||||
return super().default(o)
|
||||
|
||||
|
||||
def get_custom_object_id():
|
||||
"""Use ray worker API to get computed ObjectID"""
|
||||
worker = ray.worker.global_worker
|
||||
object_id = ray._raylet.compute_put_id(worker.current_task_id,
|
||||
worker.task_context.put_index)
|
||||
worker.task_context.put_index += 1
|
||||
return object_id
|
||||
|
||||
|
||||
def pformat_color_json(d):
|
||||
"""Use pygments to pretty format and colroize dictionary"""
|
||||
formatted_json = json.dumps(d, sort_keys=True, indent=4)
|
||||
|
||||
@@ -141,8 +141,8 @@ class BatchedQueue(object):
|
||||
if not self.write_buffer:
|
||||
return
|
||||
batch_id = self._batch_id(self.write_batch_offset)
|
||||
ray.worker.global_worker.put_object(
|
||||
ray.ObjectID(batch_id), self.write_buffer)
|
||||
ray.worker.global_worker.put_object(self.write_buffer,
|
||||
ray.ObjectID(batch_id))
|
||||
logger.debug("[writer] Flush batch {} offset {} size {}".format(
|
||||
self.write_batch_offset, self.write_item_offset,
|
||||
len(self.write_buffer)))
|
||||
|
||||
@@ -58,6 +58,9 @@ cdef extern from "ray/core_worker/object_interface.h" nogil:
|
||||
CRayStatus SetClientOptions(c_string client_name, int64_t limit)
|
||||
CRayStatus Put(const CRayObject &object, CObjectID *object_id)
|
||||
CRayStatus Put(const CRayObject &object, const CObjectID &object_id)
|
||||
CRayStatus Create(const shared_ptr[CBuffer] &metadata,
|
||||
const size_t data_size, CObjectID *object_id,
|
||||
shared_ptr[CBuffer] *data)
|
||||
CRayStatus Create(const shared_ptr[CBuffer] &metadata,
|
||||
const size_t data_size, const CObjectID &object_id,
|
||||
shared_ptr[CBuffer] *data)
|
||||
@@ -81,8 +84,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CRayStatus (
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const CJobID &job_id,
|
||||
const CActorID &actor_id,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
const c_vector[CObjectID] &arg_reference_ids,
|
||||
@@ -113,12 +114,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
# TODO(edoakes): remove this once the raylet client is no longer used
|
||||
# directly.
|
||||
CRayletClient &GetRayletClient()
|
||||
# TODO(edoakes): remove these once the Python core worker uses the task
|
||||
# interfaces
|
||||
CJobID GetCurrentJobId()
|
||||
void SetCurrentJobId(const CJobID &job_id)
|
||||
CTaskID GetCurrentTaskId()
|
||||
void SetCurrentTaskId(const CTaskID &task_id)
|
||||
const CActorID &GetActorId()
|
||||
CTaskID GetCallerId()
|
||||
const ResourceMappingType &GetResourceIDs() const
|
||||
|
||||
@@ -2281,7 +2281,7 @@ def test_actor_reconstruction_without_task(ray_start_regular):
|
||||
# put a new object in plasma store.
|
||||
global_worker = ray.worker.global_worker
|
||||
if not global_worker.core_worker.object_exists(obj_id):
|
||||
global_worker.put_object(obj_id, 1)
|
||||
global_worker.put_object(1, obj_id)
|
||||
break
|
||||
|
||||
def get_pid(self):
|
||||
|
||||
@@ -59,7 +59,7 @@ def test_dying_worker_get(ray_start_2_cpus):
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.put_object(x_id, 1)
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -102,7 +102,7 @@ ray.get(ray.ObjectID(ray.utils.hex_to_binary("{}")))
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.put_object(x_id, 1)
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -142,7 +142,7 @@ def test_dying_worker_wait(ray_start_2_cpus):
|
||||
time.sleep(0.1)
|
||||
|
||||
# Create the object.
|
||||
ray.worker.global_worker.put_object(x_id, 1)
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -185,7 +185,7 @@ ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# wait can return.
|
||||
ray.worker.global_worker.put_object(x_id, 1)
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
|
||||
@@ -331,7 +331,7 @@ def test_dynamic_res_concurrent_res_increment(ray_start_cluster):
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(running_oid))
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@@ -351,7 +351,7 @@ def test_dynamic_res_concurrent_res_increment(ray_start_cluster):
|
||||
ray.get(set_res.remote(res_name, updated_capacity, target_node_id))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(WAIT_OBJECT_ID_STR))
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
@@ -410,7 +410,7 @@ def test_dynamic_res_concurrent_res_decrement(ray_start_cluster):
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(running_oid))
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@@ -430,7 +430,7 @@ def test_dynamic_res_concurrent_res_decrement(ray_start_cluster):
|
||||
ray.get(set_res.remote(res_name, updated_capacity, target_node_id))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(WAIT_OBJECT_ID_STR))
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
@@ -492,7 +492,7 @@ def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(running_oid))
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@@ -512,7 +512,7 @@ def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
|
||||
ray.get(delete_res.remote(res_name, target_node_id))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.worker.global_worker.put_object(1, ray.ObjectID(WAIT_OBJECT_ID_STR))
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
|
||||
@@ -76,7 +76,8 @@ class TestMemoryLimits(unittest.TestCase):
|
||||
print("Raised exception", type(e), e)
|
||||
raise e
|
||||
finally:
|
||||
print(ray.worker.global_worker.dump_object_store_memory_usage())
|
||||
print(ray.worker.global_worker.core_worker.
|
||||
dump_object_store_memory_usage())
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import subprocess
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.utils import _random_string
|
||||
from ray.tests.utils import (
|
||||
RayTestTimeoutException,
|
||||
run_string_as_driver,
|
||||
@@ -556,14 +555,12 @@ print("success")
|
||||
# Create some drivers and let them exit and make sure everything is
|
||||
# still alive.
|
||||
for _ in range(3):
|
||||
nonexistent_id_bytes = _random_string()
|
||||
nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
|
||||
nonexistent_id = ray.ObjectID.from_random()
|
||||
driver_script = driver_script_template.format(address,
|
||||
nonexistent_id_hex)
|
||||
nonexistent_id.hex())
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Simulate the nonexistent dependency becoming available.
|
||||
ray.worker.global_worker.put_object(
|
||||
ray.ObjectID(nonexistent_id_bytes), None)
|
||||
ray.worker.global_worker.put_object(None, nonexistent_id)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
|
||||
@@ -583,14 +580,12 @@ print("success")
|
||||
# Create some drivers and let them exit and make sure everything is
|
||||
# still alive.
|
||||
for _ in range(3):
|
||||
nonexistent_id_bytes = _random_string()
|
||||
nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
|
||||
nonexistent_id = ray.ObjectID.from_random()
|
||||
driver_script = driver_script_template.format(address,
|
||||
nonexistent_id_hex)
|
||||
nonexistent_id.hex())
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Simulate the nonexistent dependency becoming available.
|
||||
ray.worker.global_worker.put_object(
|
||||
ray.ObjectID(nonexistent_id_bytes), None)
|
||||
ray.worker.global_worker.put_object(None, nonexistent_id)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
|
||||
|
||||
+116
-206
@@ -42,7 +42,6 @@ from ray import (
|
||||
ActorID,
|
||||
JobID,
|
||||
ObjectID,
|
||||
TaskID,
|
||||
)
|
||||
from ray import import_thread
|
||||
from ray import profiling
|
||||
@@ -141,10 +140,6 @@ class Worker(object):
|
||||
# TODO: clean up the SerializationContext once the job finished.
|
||||
self.serialization_context_map = {}
|
||||
self.function_actor_manager = FunctionActorManager(self)
|
||||
# Identity of the job that this worker is processing.
|
||||
# It is a JobID.
|
||||
self.current_job_id = JobID.nil()
|
||||
self._task_context = threading.local()
|
||||
# This event is checked regularly by all of the threads so that they
|
||||
# know when to exit.
|
||||
self.threads_stopped = threading.Event()
|
||||
@@ -175,46 +170,20 @@ class Worker(object):
|
||||
return self.node.use_pickle
|
||||
|
||||
@property
|
||||
def task_context(self):
|
||||
"""A thread-local that contains the following attributes.
|
||||
def current_job_id(self):
|
||||
if hasattr(self, "core_worker"):
|
||||
return self.core_worker.get_current_job_id()
|
||||
return JobID.nil()
|
||||
|
||||
current_task_id: For the main thread, this field is the ID of this
|
||||
worker's current running task; for other threads, this field is a
|
||||
fake random ID.
|
||||
task_index: The number of tasks that have been submitted from the
|
||||
current task.
|
||||
put_index: The number of objects that have been put from the current
|
||||
task.
|
||||
"""
|
||||
if not hasattr(self._task_context, "initialized"):
|
||||
# Initialize task_context for the current thread.
|
||||
if ray.utils.is_main_thread():
|
||||
# If this is running on the main thread, initialize it to
|
||||
# NIL. The actual value will set when the worker receives
|
||||
# a task from raylet backend.
|
||||
self._task_context.current_task_id = TaskID.nil()
|
||||
else:
|
||||
# If this is running on a separate thread, then the mapping
|
||||
# to the current task ID may not be correct. Generate a
|
||||
# random task ID so that the backend can differentiate
|
||||
# between different threads.
|
||||
self._task_context.current_task_id = TaskID.for_fake_task()
|
||||
if getattr(self, "_multithreading_warned", False) is not True:
|
||||
logger.warning(
|
||||
"Calling ray.get or ray.wait in a separate thread "
|
||||
"may lead to deadlock if the main thread blocks on "
|
||||
"this thread and there are not enough resources to "
|
||||
"execute more tasks")
|
||||
self._multithreading_warned = True
|
||||
|
||||
self._task_context.task_index = 0
|
||||
self._task_context.put_index = 1
|
||||
self._task_context.initialized = True
|
||||
return self._task_context
|
||||
@property
|
||||
def actor_id(self):
|
||||
if hasattr(self, "core_worker"):
|
||||
return self.core_worker.get_actor_id()
|
||||
return ActorID.nil()
|
||||
|
||||
@property
|
||||
def current_task_id(self):
|
||||
return self.task_context.current_task_id
|
||||
return self.core_worker.get_current_task_id()
|
||||
|
||||
@property
|
||||
def current_session_and_job(self):
|
||||
@@ -283,19 +252,111 @@ class Worker(object):
|
||||
"""
|
||||
self.mode = mode
|
||||
|
||||
def store_and_register(self, object_id, value, depth=100):
|
||||
def put_object(self, value, object_id=None):
|
||||
"""Put value in the local object store with object id `objectid`.
|
||||
|
||||
This assumes that the value for `objectid` has not yet been placed in
|
||||
the local object store. If the plasma store is full, the worker will
|
||||
automatically retry up to DEFAULT_PUT_OBJECT_RETRIES times. Each
|
||||
retry will delay for an exponentially doubling amount of time,
|
||||
starting with DEFAULT_PUT_OBJECT_DELAY. After this, exception
|
||||
will be raised.
|
||||
|
||||
Args:
|
||||
value: The value to put in the object store.
|
||||
object_id (object_id.ObjectID): The object ID of the value to be
|
||||
put. If None, one will be generated.
|
||||
|
||||
Returns:
|
||||
object_id.ObjectID: The object ID the object was put under.
|
||||
|
||||
Raises:
|
||||
ray.exceptions.ObjectStoreFullError: This is raised if the attempt
|
||||
to store the object fails because the object store is full even
|
||||
after multiple retries.
|
||||
"""
|
||||
# Make sure that the value is not an object ID.
|
||||
if isinstance(value, ObjectID):
|
||||
raise TypeError(
|
||||
"Calling 'put' on an ray.ObjectID is not allowed "
|
||||
"(similarly, returning an ray.ObjectID from a remote "
|
||||
"function is not allowed). If you really want to "
|
||||
"do this, you can wrap the ray.ObjectID in a list and "
|
||||
"call 'put' on it (or return it).")
|
||||
|
||||
if isinstance(value, bytes):
|
||||
# If the object is a byte array, skip serializing it and
|
||||
# use a special metadata to indicate it's raw binary. So
|
||||
# that this object can also be read by Java.
|
||||
return self.core_worker.put_raw_buffer(
|
||||
value,
|
||||
object_id=object_id,
|
||||
memcopy_threads=self.memcopy_threads)
|
||||
|
||||
if self.use_pickle:
|
||||
return self._serialize_and_put_pickle5(value, object_id=object_id)
|
||||
else:
|
||||
return self._serialize_and_put_pyarrow(value, object_id=object_id)
|
||||
|
||||
def _serialize_and_put_pickle5(self, value, object_id=None):
|
||||
"""Serialize an object using pickle5 and store it in the object store.
|
||||
|
||||
Args:
|
||||
value: The value to put in the object store.
|
||||
object_id: The ID of the object to store. If none, one will be
|
||||
generated.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if the attempt to store the
|
||||
object fails. This can happen if the object store is full.
|
||||
"""
|
||||
writer = Pickle5Writer()
|
||||
if ray.cloudpickle.FAST_CLOUDPICKLE_USED:
|
||||
inband = pickle.dumps(
|
||||
value, protocol=5, buffer_callback=writer.buffer_callback)
|
||||
else:
|
||||
inband = pickle.dumps(value)
|
||||
return self.core_worker.put_pickle5_buffers(
|
||||
inband,
|
||||
writer,
|
||||
object_id=object_id,
|
||||
memcopy_threads=self.memcopy_threads)
|
||||
|
||||
def _serialize_and_put_pyarrow(self, value, object_id=None):
|
||||
"""Wraps `store_and_register` with cases for existence and pickling.
|
||||
|
||||
Args:
|
||||
object_id (object_id.ObjectID): The object ID of the value to be
|
||||
put.
|
||||
value: The value to put in the object store.
|
||||
"""
|
||||
try:
|
||||
serialized_value = self._serialize_with_pyarrow(value)
|
||||
except TypeError:
|
||||
# TypeError can happen because one of the members of the object
|
||||
# may not be serializable for cloudpickle. So we need
|
||||
# these extra fallbacks here to start from the beginning.
|
||||
# Hopefully the object could have a `__reduce__` method.
|
||||
_register_custom_serializer(type(value), use_pickle=True)
|
||||
logger.warning("WARNING: Serializing the class {} failed, "
|
||||
"falling back to cloudpickle.".format(type(value)))
|
||||
serialized_value = self._serialize_with_pyarrow(value)
|
||||
|
||||
return self.core_worker.put_serialized_object(
|
||||
serialized_value,
|
||||
object_id=object_id,
|
||||
memcopy_threads=self.memcopy_threads)
|
||||
|
||||
def _serialize_with_pyarrow(self, value, depth=100):
|
||||
"""Store an object and attempt to register its class if needed.
|
||||
|
||||
Args:
|
||||
object_id: The ID of the object to store.
|
||||
value: The value to put in the object store.
|
||||
depth: The maximum number of classes to recursively register.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if the attempt to store the
|
||||
object fails. This can happen if there is already an object
|
||||
with the same ID in the object store or if the object store is
|
||||
full.
|
||||
Exception: An exception is raised if the attempt to serialize the
|
||||
object fails.
|
||||
"""
|
||||
counter = 0
|
||||
while True:
|
||||
@@ -306,20 +367,9 @@ class Worker(object):
|
||||
"type {}.".format(type(value)))
|
||||
counter += 1
|
||||
try:
|
||||
if isinstance(value, bytes):
|
||||
# If the object is a byte array, skip serializing it and
|
||||
# use a special metadata to indicate it's raw binary. So
|
||||
# that this object can also be read by Java.
|
||||
self.core_worker.put_raw_buffer(
|
||||
value, object_id, memcopy_threads=self.memcopy_threads)
|
||||
else:
|
||||
serialization_context = self.get_serialization_context(
|
||||
self.current_job_id)
|
||||
self.core_worker.put_serialized_object(
|
||||
pyarrow.serialize(value, serialization_context),
|
||||
object_id,
|
||||
memcopy_threads=self.memcopy_threads)
|
||||
break
|
||||
serialization_context = self.get_serialization_context(
|
||||
self.current_job_id)
|
||||
return pyarrow.serialize(value, serialization_context)
|
||||
except pyarrow.SerializationCallbackError as e:
|
||||
cls_type = type(e.example_object)
|
||||
try:
|
||||
@@ -352,121 +402,6 @@ class Worker(object):
|
||||
"locally.".format(cls_type))
|
||||
logger.warning(warning_message)
|
||||
|
||||
def put_object(self, object_id, value):
|
||||
"""Put value in the local object store with object id `objectid`.
|
||||
|
||||
This assumes that the value for `objectid` has not yet been placed in
|
||||
the local object store. If the plasma store is full, the worker will
|
||||
automatically retry up to DEFAULT_PUT_OBJECT_RETRIES times. Each
|
||||
retry will delay for an exponentially doubling amount of time,
|
||||
starting with DEFAULT_PUT_OBJECT_DELAY. After this, exception
|
||||
will be raised.
|
||||
|
||||
Args:
|
||||
object_id (object_id.ObjectID): The object ID of the value to be
|
||||
put.
|
||||
value: The value to put in the object store.
|
||||
|
||||
Raises:
|
||||
ray.exceptions.ObjectStoreFullError: This is raised if the attempt
|
||||
to store the object fails because the object store is full even
|
||||
after multiple retries.
|
||||
"""
|
||||
# Make sure that the value is not an object ID.
|
||||
if isinstance(value, ObjectID):
|
||||
raise TypeError(
|
||||
"Calling 'put' on an ray.ObjectID is not allowed "
|
||||
"(similarly, returning an ray.ObjectID from a remote "
|
||||
"function is not allowed). If you really want to "
|
||||
"do this, you can wrap the ray.ObjectID in a list and "
|
||||
"call 'put' on it (or return it).")
|
||||
|
||||
delay = ray_constants.DEFAULT_PUT_OBJECT_DELAY
|
||||
for attempt in reversed(
|
||||
range(ray_constants.DEFAULT_PUT_OBJECT_RETRIES)):
|
||||
try:
|
||||
if self.use_pickle:
|
||||
self.store_with_plasma(object_id, value)
|
||||
else:
|
||||
self._try_store_and_register(object_id, value)
|
||||
break
|
||||
except ObjectStoreFullError as e:
|
||||
if attempt:
|
||||
logger.warning("Waiting {} seconds for space to free up "
|
||||
"in the object store.".format(delay))
|
||||
time.sleep(delay)
|
||||
delay *= 2
|
||||
else:
|
||||
self.dump_object_store_memory_usage()
|
||||
raise e
|
||||
|
||||
def dump_object_store_memory_usage(self):
|
||||
"""Prints object store debug string to stdout."""
|
||||
logger.warning("Local object store memory usage:\n{}\n".format(
|
||||
self.core_worker.object_store_memory_usage_string()))
|
||||
|
||||
def store_with_plasma(self, object_id, value):
|
||||
"""Serialize and store an object.
|
||||
|
||||
Args:
|
||||
object_id: The ID of the object to store.
|
||||
value: The value to put in the object store.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if the attempt to store the
|
||||
object fails. This can happen if there is already an object
|
||||
with the same ID in the object store or if the object store is
|
||||
full.
|
||||
"""
|
||||
try:
|
||||
if isinstance(value, bytes):
|
||||
# If the object is a byte array, skip serializing it and
|
||||
# use a special metadata to indicate it's raw binary. So
|
||||
# that this object can also be read by Java.
|
||||
self.core_worker.put_raw_buffer(
|
||||
value, object_id, memcopy_threads=self.memcopy_threads)
|
||||
else:
|
||||
writer = Pickle5Writer()
|
||||
if ray.cloudpickle.FAST_CLOUDPICKLE_USED:
|
||||
inband = pickle.dumps(
|
||||
value,
|
||||
protocol=5,
|
||||
buffer_callback=writer.buffer_callback)
|
||||
else:
|
||||
inband = pickle.dumps(value)
|
||||
self.core_worker.put_pickle5_buffers(object_id, inband, writer,
|
||||
self.memcopy_threads)
|
||||
except pyarrow.plasma.PlasmaObjectExists:
|
||||
# The object already exists in the object store, so there is no
|
||||
# need to add it again. TODO(rkn): We need to compare hashes
|
||||
# and make sure that the objects are in fact the same. We also
|
||||
# should return an error code to caller instead of printing a
|
||||
# message.
|
||||
logger.info("The object with ID {} already exists "
|
||||
"in the object store.".format(object_id))
|
||||
|
||||
def _try_store_and_register(self, object_id, value):
|
||||
"""Wraps `store_and_register` with cases for existence and pickling.
|
||||
|
||||
Args:
|
||||
object_id (object_id.ObjectID): The object ID of the value to be
|
||||
put.
|
||||
value: The value to put in the object store.
|
||||
"""
|
||||
try:
|
||||
self.store_and_register(object_id, value)
|
||||
except TypeError:
|
||||
# TypeError can happen because one of the members of the object
|
||||
# may not be serializable for cloudpickle. So we need
|
||||
# these extra fallbacks here to start from the beginning.
|
||||
# Hopefully the object could have a `__reduce__` method.
|
||||
_register_custom_serializer(type(value), use_pickle=True)
|
||||
warning_message = ("WARNING: Serializing the class {} failed, "
|
||||
"falling back to cloudpickle.".format(
|
||||
type(value)))
|
||||
logger.warning(warning_message)
|
||||
self.store_and_register(object_id, value)
|
||||
|
||||
def deserialize_objects(self,
|
||||
data_metadata_pairs,
|
||||
object_ids,
|
||||
@@ -674,22 +609,6 @@ class Worker(object):
|
||||
|
||||
return ray.signature.recover_args(arguments)
|
||||
|
||||
def _set_object_store_client_options(self, name, object_store_memory):
|
||||
try:
|
||||
logger.debug("Setting plasma memory limit to {} for {}".format(
|
||||
object_store_memory, name))
|
||||
self.core_worker.set_object_store_client_options(
|
||||
name.encode("ascii"), object_store_memory)
|
||||
except RayError as e:
|
||||
self.dump_object_store_memory_usage()
|
||||
raise memory_monitor.RayOutOfMemoryError(
|
||||
"Failed to set object_store_memory={} for {}. The "
|
||||
"plasma store may have insufficient memory remaining "
|
||||
"to satisfy this limit (30% of object store memory is "
|
||||
"permanently reserved for shared usage). The current "
|
||||
"object store memory status is:\n\n{}".format(
|
||||
object_store_memory, name, e))
|
||||
|
||||
def main_loop(self):
|
||||
"""The main loop a worker runs to receive and execute tasks."""
|
||||
|
||||
@@ -1461,11 +1380,9 @@ def connect(node,
|
||||
|
||||
if not isinstance(job_id, JobID):
|
||||
raise TypeError("The type of given job id must be JobID.")
|
||||
worker.current_job_id = job_id
|
||||
|
||||
# All workers start out as non-actors. A worker can be turned into an actor
|
||||
# after it is created.
|
||||
worker.actor_id = ActorID.nil()
|
||||
worker.node = node
|
||||
worker.set_mode(mode)
|
||||
|
||||
@@ -1560,24 +1477,22 @@ def connect(node,
|
||||
(mode == SCRIPT_MODE),
|
||||
node.plasma_store_socket_name,
|
||||
node.raylet_socket_name,
|
||||
worker.current_job_id,
|
||||
job_id,
|
||||
gcs_options,
|
||||
node.get_logs_dir_path(),
|
||||
node.node_ip_address,
|
||||
)
|
||||
worker.task_context.current_task_id = (
|
||||
worker.core_worker.get_current_task_id())
|
||||
worker.raylet_client = ray._raylet.RayletClient(worker.core_worker)
|
||||
|
||||
if driver_object_store_memory is not None:
|
||||
worker._set_object_store_client_options(
|
||||
worker.core_worker.set_object_store_client_options(
|
||||
"ray_driver_{}".format(os.getpid()), driver_object_store_memory)
|
||||
|
||||
# Put something in the plasma store so that subsequent plasma store
|
||||
# accesses will be faster. Currently the first access is always slow, and
|
||||
# we don't want the user to experience this.
|
||||
temporary_object_id = ray.ObjectID(np.random.bytes(20))
|
||||
worker.put_object(temporary_object_id, 1)
|
||||
worker.put_object(1, object_id=temporary_object_id)
|
||||
ray.internal.free([temporary_object_id])
|
||||
|
||||
# Start the import thread
|
||||
@@ -1944,7 +1859,7 @@ def get(object_ids):
|
||||
if isinstance(value, RayError):
|
||||
last_task_error_raise_time = time.time()
|
||||
if isinstance(value, ray.exceptions.UnreconstructableError):
|
||||
worker.dump_object_store_memory_usage()
|
||||
worker.core_worker.dump_object_store_memory_usage()
|
||||
if isinstance(value, RayTaskError):
|
||||
raise value.as_instanceof_cause()
|
||||
else:
|
||||
@@ -1981,12 +1896,8 @@ def put(value, weakref=False):
|
||||
if worker.mode == LOCAL_MODE:
|
||||
object_id = worker.local_mode_manager.put_object(value)
|
||||
else:
|
||||
object_id = ray._raylet.compute_put_id(
|
||||
worker.current_task_id,
|
||||
worker.task_context.put_index,
|
||||
)
|
||||
try:
|
||||
worker.put_object(object_id, value)
|
||||
object_id = worker.put_object(value)
|
||||
except ObjectStoreFullError:
|
||||
logger.info(
|
||||
"Put failed since the value was either too large or the "
|
||||
@@ -1995,7 +1906,6 @@ def put(value, weakref=False):
|
||||
"ray.put(value, weakref=True) to allow object data to "
|
||||
"be evicted early.")
|
||||
raise
|
||||
worker.task_context.put_index += 1
|
||||
# Pin the object buffer with the returned id. This avoids put returns
|
||||
# from getting evicted out from under the id.
|
||||
# TODO(edoakes): we should be able to avoid this extra IPC by holding
|
||||
|
||||
Reference in New Issue
Block a user