mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 02:00:12 +08:00
Global scheduler skeleton (#45)
* Initial scheduler commit * global scheduler * add global scheduler * Implement global scheduler skeleton. * Formatting. * Allow local scheduler to be started without a connection to redis so that we can test it without a global scheduler. * Fail if there are no local schedulers when the global scheduler receives a task. * Initialize uninitialized value and formatting fix. * Generalize local scheduler table to db client table. * Remove code duplication in local scheduler and add flag for whether a task came from the global scheduler or not. * Queue task specs in the local scheduler instead of tasks. * Simple global scheduler tests, including valgrind. * Factor out functions for starting processes. * Fixes.
This commit is contained in:
committed by
Stephanie Wang
parent
08707f9408
commit
d77b685a90
@@ -0,0 +1 @@
|
||||
from lib.python.global_scheduler_services import *
|
||||
@@ -1 +1 @@
|
||||
from libphoton import *
|
||||
from photon import *
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
from libphoton import *
|
||||
from photon_services import *
|
||||
+47
-27
@@ -4,14 +4,16 @@ import psutil
|
||||
import os
|
||||
import random
|
||||
import signal
|
||||
import sys
|
||||
import subprocess
|
||||
import string
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Ray modules
|
||||
import config
|
||||
import photon
|
||||
import plasma
|
||||
import global_scheduler
|
||||
|
||||
# all_processes is a list of the scheduler, object store, and worker processes
|
||||
# that have been started by this services module if Ray is being used in local
|
||||
@@ -96,14 +98,33 @@ def start_redis(num_retries=20, cleanup=True):
|
||||
counter += 1
|
||||
raise Exception("Couldn't start Redis.")
|
||||
|
||||
def start_global_scheduler(redis_address, cleanup=True):
|
||||
"""Start a global scheduler process.
|
||||
|
||||
Args:
|
||||
redis_address (str): The address of the Redis instance.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
"""
|
||||
p = global_scheduler.start_global_scheduler(redis_address)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_local_scheduler(redis_address, plasma_store_name, cleanup=True):
|
||||
local_scheduler_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../photon/build/photon_scheduler")
|
||||
if RUN_PHOTON_PROFILER:
|
||||
local_scheduler_prefix = ["valgrind", "--tool=callgrind", local_scheduler_filepath]
|
||||
else:
|
||||
local_scheduler_prefix = [local_scheduler_filepath]
|
||||
local_scheduler_name = "/tmp/scheduler{}".format(random_name())
|
||||
p = subprocess.Popen(local_scheduler_prefix + ["-s", local_scheduler_name, "-r", redis_address, "-p", plasma_store_name])
|
||||
"""Start a local scheduler process.
|
||||
|
||||
Args:
|
||||
redis_address (str): The address of the Redis instance.
|
||||
plasma_store_name (str): The name of the plasma store socket to connect to.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
|
||||
Return:
|
||||
The name of the local scheduler socket.
|
||||
"""
|
||||
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, redis_address=redis_address, use_profiler=RUN_PHOTON_PROFILER)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
return local_scheduler_name
|
||||
@@ -113,29 +134,27 @@ def start_objstore(node_ip_address, redis_address, cleanup=True):
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The ip address of the node running the object store.
|
||||
redis_address (str): The address of the Redis instance to connect to.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
|
||||
Return:
|
||||
A tuple of the Plasma store socket name, the Plasma manager socket name, and
|
||||
the plasma manager port.
|
||||
"""
|
||||
# Let the object store use a fraction of the system memory.
|
||||
# Compute a fraction of the system memory for the Plasma store to use.
|
||||
system_memory = psutil.virtual_memory().total
|
||||
plasma_store_memory = int(system_memory * 0.75)
|
||||
plasma_store_filepath = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../plasma/build/plasma_store")
|
||||
if RUN_PLASMA_STORE_PROFILER:
|
||||
plasma_store_prefix = ["valgrind", "--tool=callgrind", plasma_store_filepath]
|
||||
else:
|
||||
plasma_store_prefix = [plasma_store_filepath]
|
||||
store_name = "/tmp/ray_plasma_store{}".format(random_name())
|
||||
p1 = subprocess.Popen(plasma_store_prefix + ["-s", store_name, "-m", str(plasma_store_memory)])
|
||||
|
||||
manager_name = "/tmp/ray_plasma_manager{}".format(random_name())
|
||||
p2, manager_port = plasma.start_plasma_manager(store_name, manager_name, redis_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER)
|
||||
|
||||
# Start the Plasma store.
|
||||
plasma_store_name, p1 = plasma.start_plasma_store(plasma_store_memory=plasma_store_memory, use_profiler=RUN_PLASMA_STORE_PROFILER)
|
||||
# Start the plasma manager.
|
||||
plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(plasma_store_name, redis_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER)
|
||||
if cleanup:
|
||||
all_processes.append(p1)
|
||||
all_processes.append(p2)
|
||||
|
||||
return store_name, manager_name, manager_port
|
||||
return plasma_store_name, plasma_manager_name, plasma_manager_port
|
||||
|
||||
def start_worker(address_info, worker_path, cleanup=True):
|
||||
"""This method starts a worker process.
|
||||
@@ -186,8 +205,8 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None
|
||||
worker.
|
||||
|
||||
Returns:
|
||||
This returns a tuple of three things. The first element is a tuple of the
|
||||
Redis hostname and port. The second
|
||||
This returns a dictionary of the address information for the processes that
|
||||
were started.
|
||||
"""
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")
|
||||
@@ -195,12 +214,14 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None
|
||||
redis_port = start_redis(cleanup=True)
|
||||
redis_address = address(node_ip_address, redis_port)
|
||||
time.sleep(0.1)
|
||||
# Start the global scheduler.
|
||||
start_global_scheduler(redis_address, cleanup=True)
|
||||
# Start Plasma.
|
||||
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
|
||||
# Start the local scheduler.
|
||||
time.sleep(0.1)
|
||||
# Start the local scheduler.
|
||||
local_scheduler_name = start_local_scheduler(redis_address, object_store_name, cleanup=True)
|
||||
time.sleep(0.2)
|
||||
time.sleep(0.1)
|
||||
# Aggregate the address information together.
|
||||
address_info = {"node_ip_address": node_ip_address,
|
||||
"redis_port": redis_port,
|
||||
@@ -210,7 +231,6 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None
|
||||
# Start the workers.
|
||||
for _ in range(num_workers):
|
||||
start_worker(address_info, worker_path, cleanup=True)
|
||||
time.sleep(0.3)
|
||||
# Return the addresses of the relevant processes.
|
||||
start_webui(redis_port)
|
||||
return address_info
|
||||
|
||||
@@ -643,8 +643,7 @@ def init(start_ray_local=False, num_workers=None, driver_mode=SCRIPT_MODE):
|
||||
# corresponing call to disconnect will happen in the call to cleanup() when
|
||||
# the Python script exits.
|
||||
connect(address_info, driver_mode, worker=global_worker)
|
||||
if driver_mode != PYTHON_MODE:
|
||||
return "{}:{}".format(address_info["node_ip_address"], address_info["redis_port"])
|
||||
return address_info
|
||||
|
||||
def cleanup(worker=global_worker):
|
||||
"""Disconnect the driver, and terminate any processes started in init.
|
||||
|
||||
+2
-1
@@ -26,7 +26,8 @@ setup(name="ray",
|
||||
"build/plasma_manager",
|
||||
"lib/python/libplasma.so"],
|
||||
"photon": ["build/photon_scheduler",
|
||||
"libphoton.so"]},
|
||||
"photon/libphoton.so"],
|
||||
"global_scheduler": ["build/global_scheduler"]},
|
||||
data_files=datafiles,
|
||||
cmdclass={"install": install},
|
||||
install_requires=["numpy",
|
||||
|
||||
Reference in New Issue
Block a user