[Stats] metrics agent exporter (#9361)

This commit is contained in:
SangBin Cho
2020-07-14 09:49:16 -07:00
committed by GitHub
parent 5b192842b5
commit f6eb47fc1f
18 changed files with 247 additions and 61 deletions
+3
View File
@@ -108,6 +108,7 @@ class Node:
include_log_monitor=True,
resources={},
temp_dir=ray.utils.get_ray_temp_dir(),
metrics_agent_port=self._get_unused_port()[0],
worker_path=os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"workers/default_worker.py"))
@@ -554,6 +555,7 @@ class Node:
open_log(reporter_err_name))
process_info = ray.services.start_reporter(
self.redis_address,
self._ray_params.metrics_agent_port,
stdout_file=stdout_file,
stderr_file=stderr_file,
redis_password=self._ray_params.redis_password,
@@ -661,6 +663,7 @@ class Node:
self._ray_params.max_worker_port,
self._ray_params.object_manager_port,
self._ray_params.redis_password,
self._ray_params.metrics_agent_port,
use_valgrind=use_valgrind,
use_profiler=use_profiler,
stdout_file=stdout_file,
+3
View File
@@ -87,6 +87,7 @@ class RayParams:
Java worker.
java_worker_options (list): The command options for Java worker.
load_code_from_local: Whether load code from local file or from GCS.
metrics_agent_port (int): The port to bind the metrics agent to.
_internal_config (str): JSON configuration for overriding
RayConfig defaults. For testing purposes ONLY.
lru_evict (bool): Enable LRU eviction if space is needed.
@@ -132,6 +133,7 @@ class RayParams:
java_worker_options=None,
load_code_from_local=False,
_internal_config=None,
metrics_agent_port=None,
lru_evict=False):
self.object_ref_seed = object_ref_seed
self.redis_address = redis_address
@@ -169,6 +171,7 @@ class RayParams:
self.include_java = include_java
self.java_worker_options = java_worker_options
self.load_code_from_local = load_code_from_local
self.metrics_agent_port = metrics_agent_port
self._internal_config = _internal_config
self._lru_evict = lru_evict
self._check_usage()
+14 -3
View File
@@ -55,6 +55,10 @@ class ReporterServer(reporter_pb2_grpc.ReporterServiceServicer):
return reporter_pb2.GetProfilingStatsReply(
profiling_stats=profiling_stats, stdout=stdout, stderr=stderr)
def ReportMetrics(self, request, context):
    """Answer a ReportMetrics RPC with an empty reply.

    Neither ``request`` nor ``context`` is inspected yet; the handler
    only builds a ``ReportMetricsReply`` with no fields set and returns it.

    Args:
        request: The incoming ReportMetrics request message (unused).
        context: The RPC context supplied by the server (unused).

    Returns:
        A ``reporter_pb2.ReportMetricsReply`` constructed without arguments.
    """
    # TODO(sang): Process metrics here.
    reply = reporter_pb2.ReportMetricsReply()
    return reply
def recursive_asdict(o):
if isinstance(o, tuple) and hasattr(o, "_asdict"):
@@ -94,11 +98,12 @@ class Reporter:
redis_client: A client used to communicate with the Redis server.
"""
def __init__(self, redis_address, redis_password=None):
def __init__(self, redis_address, port, redis_password=None):
"""Initialize the reporter object."""
self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
self.ip = ray.services.get_node_ip_address()
self.hostname = platform.node()
self.port = port
_ = psutil.cpu_percent() # For initialization
@@ -225,7 +230,7 @@ class Reporter:
server = grpc.server(thread_pool, options=(("grpc.so_reuseport", 0), ))
reporter_pb2_grpc.add_ReporterServiceServicer_to_server(
ReporterServer(), server)
port = server.add_insecure_port("[::]:0")
port = server.add_insecure_port("[::]:{}".format(self.port))
server.start()
self.redis_client.set("REPORTER_PORT:{}".format(self.ip), port)
"""Run the reporter."""
@@ -248,6 +253,11 @@ if __name__ == "__main__":
required=True,
type=str,
help="The address to use for Redis.")
parser.add_argument(
"--port",
required=True,
type=int,
help="The port to bind the reporter process.")
parser.add_argument(
"--redis-password",
required=False,
@@ -270,7 +280,8 @@ if __name__ == "__main__":
args = parser.parse_args()
ray.utils.setup_logger(args.logging_level, args.logging_format)
reporter = Reporter(args.redis_address, redis_password=args.redis_password)
reporter = Reporter(
args.redis_address, args.port, redis_password=args.redis_password)
try:
reporter.run()
+7 -4
View File
@@ -1065,6 +1065,7 @@ def start_log_monitor(redis_address,
def start_reporter(redis_address,
port,
stdout_file=None,
stderr_file=None,
redis_password=None,
@@ -1073,6 +1074,7 @@ def start_reporter(redis_address,
Args:
redis_address (str): The address of the Redis instance.
port (int): The port to bind the reporter process to.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
@@ -1085,10 +1087,8 @@ def start_reporter(redis_address,
reporter_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "reporter.py")
command = [
sys.executable,
"-u",
reporter_filepath,
"--redis-address={}".format(redis_address),
sys.executable, "-u", reporter_filepath,
"--redis-address={}".format(redis_address), "--port={}".format(port)
]
if redis_password:
command += ["--redis-password", redis_password]
@@ -1249,6 +1249,7 @@ def start_raylet(redis_address,
max_worker_port=None,
object_manager_port=None,
redis_password=None,
metrics_agent_port=None,
use_valgrind=False,
use_profiler=False,
stdout_file=None,
@@ -1284,6 +1285,7 @@ def start_raylet(redis_address,
max_worker_port (int): The highest port number that workers will bind
on. If set, min_worker_port must also be set.
redis_password: The password to use when connecting to Redis.
metrics_agent_port (int): The port that the metrics agent is bound to.
use_valgrind (bool): True if the raylet should be started inside
of valgrind. If this is True, use_profiler must be False.
use_profiler (bool): True if the raylet should be started inside
@@ -1390,6 +1392,7 @@ def start_raylet(redis_address,
"--redis_password={}".format(redis_password or ""),
"--temp_dir={}".format(temp_dir),
"--session_dir={}".format(session_dir),
"--metrics-agent-port={}".format(metrics_agent_port),
]
if config.get("plasma_store_as_thread"):
# command related to the plasma store