from __future__ import absolute_import from __future__ import division from __future__ import print_function import binascii import hashlib import numpy as np import os import sys import time import uuid import ray.gcs_utils import ray.local_scheduler import ray.ray_constants as ray_constants ERROR_KEY_PREFIX = b"Error:" DRIVER_ID_LENGTH = 20 def _random_string(): id_hash = hashlib.sha1() id_hash.update(uuid.uuid4().bytes) id_bytes = id_hash.digest() assert len(id_bytes) == 20 return id_bytes def format_error_message(exception_message, task_exception=False): """Improve the formatting of an exception thrown by a remote function. This method takes a traceback from an exception and makes it nicer by removing a few uninformative lines and adding some space to indent the remaining lines nicely. Args: exception_message (str): A message generated by traceback.format_exc(). Returns: A string of the formatted exception message. """ lines = exception_message.split("\n") if task_exception: # For errors that occur inside of tasks, remove lines 1 and 2 which are # always the same, they just contain information about the worker code. lines = lines[0:1] + lines[3:] pass return "\n".join(lines) def push_error_to_driver(worker, error_type, message, driver_id=None, data=None): """Push an error message to the driver to be printed in the background. Args: worker: The worker to use. error_type (str): The type of the error. message (str): The message that will be printed in the background on the driver. driver_id: The ID of the driver to push the error message to. If this is None, then the message will be pushed to all drivers. data: This should be a dictionary mapping strings to strings. It will be serialized with json and stored in Redis. """ if driver_id is None: driver_id = ray_constants.NIL_JOB_ID.id() error_key = ERROR_KEY_PREFIX + driver_id + b":" + _random_string() data = {} if data is None else data if not worker.use_raylet: worker.redis_client.hmset(error_key, { "type": error_type, "message": message, "data": data }) worker.redis_client.rpush("ErrorKeys", error_key) else: worker.local_scheduler_client.push_error( ray.ObjectID(driver_id), error_type, message, time.time()) def push_error_to_driver_through_redis(redis_client, use_raylet, error_type, message, driver_id=None, data=None): """Push an error message to the driver to be printed in the background. Normally the push_error_to_driver function should be used. However, in some instances, the local scheduler client is not available, e.g., because the error happens in Python before the driver or worker has connected to the backend processes. Args: redis_client: The redis client to use. use_raylet: True if we are using the Raylet code path and false otherwise. error_type (str): The type of the error. message (str): The message that will be printed in the background on the driver. driver_id: The ID of the driver to push the error message to. If this is None, then the message will be pushed to all drivers. data: This should be a dictionary mapping strings to strings. It will be serialized with json and stored in Redis. """ if driver_id is None: driver_id = ray_constants.NIL_JOB_ID.id() error_key = ERROR_KEY_PREFIX + driver_id + b":" + _random_string() data = {} if data is None else data if not use_raylet: redis_client.hmset(error_key, { "type": error_type, "message": message, "data": data }) redis_client.rpush("ErrorKeys", error_key) else: # Do everything in Python and through the Python Redis client instead # of through the raylet. error_data = ray.gcs_utils.construct_error_message( error_type, message, time.time()) redis_client.execute_command( "RAY.TABLE_APPEND", ray.gcs_utils.TablePrefix.ERROR_INFO, ray.gcs_utils.TablePubsub.ERROR_INFO, driver_id, error_data) def is_cython(obj): """Check if an object is a Cython function or method""" # TODO(suo): We could split these into two functions, one for Cython # functions and another for Cython methods. # TODO(suo): There doesn't appear to be a Cython function 'type' we can # check against via isinstance. Please correct me if I'm wrong. def check_cython(x): return type(x).__name__ == "cython_function_or_method" # Check if function or method, respectively return check_cython(obj) or \ (hasattr(obj, "__func__") and check_cython(obj.__func__)) def random_string(): """Generate a random string to use as an ID. Note that users may seed numpy, which could cause this function to generate duplicate IDs. Therefore, we need to seed numpy ourselves, but we can't interfere with the state of the user's random number generator, so we extract the state of the random number generator and reset it after we are done. TODO(rkn): If we want to later guarantee that these are generated in a deterministic manner, then we will need to make some changes here. Returns: A random byte string of length 20. """ # Get the state of the numpy random number generator. numpy_state = np.random.get_state() # Try to use true randomness. np.random.seed(None) # Generate the random ID. random_id = np.random.bytes(20) # Reset the state of the numpy random number generator. np.random.set_state(numpy_state) return random_id def decode(byte_str): """Make this unicode in Python 3, otherwise leave it as bytes.""" if not isinstance(byte_str, bytes): raise ValueError("The argument must be a bytes object.") if sys.version_info >= (3, 0): return byte_str.decode("ascii") else: return byte_str def binary_to_object_id(binary_object_id): return ray.ObjectID(binary_object_id) def binary_to_hex(identifier): hex_identifier = binascii.hexlify(identifier) if sys.version_info >= (3, 0): hex_identifier = hex_identifier.decode() return hex_identifier def hex_to_binary(hex_identifier): return binascii.unhexlify(hex_identifier) def get_cuda_visible_devices(): """Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable. Returns: if CUDA_VISIBLE_DEVICES is set, this returns a list of integers with the IDs of the GPUs. If it is not set, this returns None. """ gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None) if gpu_ids_str is None: return None if gpu_ids_str == "": return [] return [int(i) for i in gpu_ids_str.split(",")] def set_cuda_visible_devices(gpu_ids): """Set the CUDA_VISIBLE_DEVICES environment variable. Args: gpu_ids: This is a list of integers representing GPU IDs. """ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids]) def resources_from_resource_arguments(default_num_cpus, default_num_gpus, default_resources, runtime_num_cpus, runtime_num_gpus, runtime_resources): """Determine a task's resource requirements. Args: default_num_cpus: The default number of CPUs required by this function or actor method. default_num_gpus: The default number of GPUs required by this function or actor method. default_resources: The default custom resources required by this function or actor method. runtime_num_cpus: The number of CPUs requested when the task was invoked. runtime_num_gpus: The number of GPUs requested when the task was invoked. runtime_resources: The custom resources requested when the task was invoked. Returns: A dictionary of the resource requirements for the task. """ if runtime_resources is not None: resources = runtime_resources.copy() elif default_resources is not None: resources = default_resources.copy() else: resources = {} if "CPU" in resources or "GPU" in resources: raise ValueError("The resources dictionary must not " "contain the key 'CPU' or 'GPU'") assert default_num_cpus is not None resources["CPU"] = (default_num_cpus if runtime_num_cpus is None else runtime_num_cpus) if runtime_num_gpus is not None: resources["GPU"] = runtime_num_gpus elif default_num_gpus is not None: resources["GPU"] = default_num_gpus return resources def merge_dicts(d1, d2): """Merge two dicts and return a new dict that's their union.""" d = d1.copy() d.update(d2) return d def check_oversized_pickle(pickled, name, obj_type, worker): """Send a warning message if the pickled object is too large. Args: pickled: the pickled object. name: name of the pickled object. obj_type: type of the pickled object, can be 'function', 'remote function', 'actor', or 'object'. worker: the worker used to send warning message. """ length = len(pickled) if length <= ray_constants.PICKLE_OBJECT_WARNING_SIZE: return warning_message = ( "Warning: The {} {} has size {} when pickled. " "It will be stored in Redis, which could cause memory issues. " "This may mean that its definition uses a large array or other object." ).format(obj_type, name, length) push_error_to_driver( worker, ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, warning_message, driver_id=worker.task_driver_id.id())