From f93bb008bb40f63661718bb506ff44c295f730b4 Mon Sep 17 00:00:00 2001 From: mehrdadn Date: Mon, 8 Jun 2020 21:29:46 -0700 Subject: [PATCH] Change os.uname()[1] and socket.gethostname() to the portable and faster platform.node_ip() (#8839) Co-authored-by: Mehrdad --- doc/kubernetes/example.py | 4 ++-- doc/source/deploy-on-kubernetes.rst | 4 ++-- doc/yarn/example.py | 4 ++-- python/ray/dashboard/dashboard.py | 4 ++-- python/ray/log_monitor.py | 4 ++-- python/ray/memory_monitor.py | 3 ++- python/ray/reporter.py | 6 +++--- python/ray/tune/trainable.py | 3 ++- python/ray/tune/trial.py | 3 ++- rllib/evaluation/rollout_worker.py | 3 ++- rllib/execution/replay_buffer.py | 4 ++-- rllib/execution/tree_agg.py | 4 ++-- rllib/optimizers/aso_tree_aggregator.py | 4 ++-- rllib/optimizers/async_replay_optimizer.py | 4 ++-- rllib/utils/actors.py | 4 ++-- 15 files changed, 31 insertions(+), 27 deletions(-) diff --git a/doc/kubernetes/example.py b/doc/kubernetes/example.py index a6b0dd059..b1ea3e23d 100644 --- a/doc/kubernetes/example.py +++ b/doc/kubernetes/example.py @@ -7,10 +7,10 @@ import ray @ray.remote def gethostname(x): + import platform import time - import socket time.sleep(0.01) - return x + (socket.gethostname(), ) + return x + (platform.node(), ) def wait_for_nodes(expected): diff --git a/doc/source/deploy-on-kubernetes.rst b/doc/source/deploy-on-kubernetes.rst index ecc1022ab..e8355fcef 100644 --- a/doc/source/deploy-on-kubernetes.rst +++ b/doc/source/deploy-on-kubernetes.rst @@ -211,7 +211,7 @@ program: .. code-block:: python from collections import Counter - import socket + import platform import time import ray @@ -220,7 +220,7 @@ program: @ray.remote def f(x): time.sleep(0.01) - return x + (socket.gethostname(), ) + return x + (platform.node(), ) # Check that objects can be transferred from each node to each other node. %time Counter(ray.get([f.remote(f.remote(())) for _ in range(100)])) diff --git a/doc/yarn/example.py b/doc/yarn/example.py index 56e96bf32..51d6d1433 100644 --- a/doc/yarn/example.py +++ b/doc/yarn/example.py @@ -6,10 +6,10 @@ import ray @ray.remote def gethostname(x): + import platform import time - import socket time.sleep(0.01) - return x + (socket.gethostname(), ) + return x + (platform.node(), ) def wait_for_nodes(expected): diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py index d2a84d5b3..6e5aa2f8d 100644 --- a/python/ray/dashboard/dashboard.py +++ b/python/ray/dashboard/dashboard.py @@ -12,7 +12,7 @@ import errno import json import logging import os -import socket +import platform import threading import time import traceback @@ -965,7 +965,7 @@ if __name__ == "__main__": args.redis_address, password=args.redis_password) traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The dashboard on node {} failed with the following " - "error:\n{}".format(socket.gethostname(), traceback_str)) + "error:\n{}".format(platform.node(), traceback_str)) ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.DASHBOARD_DIED_ERROR, message) if isinstance(e, OSError) and e.errno == errno.ENOENT: diff --git a/python/ray/log_monitor.py b/python/ray/log_monitor.py index 9f75f5bdb..699733104 100644 --- a/python/ray/log_monitor.py +++ b/python/ray/log_monitor.py @@ -4,8 +4,8 @@ import glob import json import logging import os +import platform import shutil -import socket import time import traceback @@ -302,7 +302,7 @@ if __name__ == "__main__": args.redis_address, password=args.redis_password) traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The log monitor on node {} failed with the following " - "error:\n{}".format(socket.gethostname(), traceback_str)) + "error:\n{}".format(platform.node(), traceback_str)) ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.LOG_MONITOR_DIED_ERROR, message) raise e diff --git a/python/ray/memory_monitor.py b/python/ray/memory_monitor.py index b9c780209..c460b22d5 100644 --- a/python/ray/memory_monitor.py +++ b/python/ray/memory_monitor.py @@ -1,5 +1,6 @@ import logging import os +import platform import sys import time @@ -47,7 +48,7 @@ class RayOutOfMemoryError(Exception): " ".join(cmdline)[:100].strip()) return ("More than {}% of the memory on ".format(int( 100 * threshold)) + "node {} is used ({} / {} GB). ".format( - os.uname()[1], round(used_gb, 2), round(total_gb, 2)) + + platform.node(), round(used_gb, 2), round(total_gb, 2)) + "The top 10 memory consumers are:\n\n{}".format(proc_str) + "\n\nIn addition, up to {} GiB of shared memory is ".format( round(get_shared(psutil.virtual_memory()) / (1024**3), 2)) diff --git a/python/ray/reporter.py b/python/ray/reporter.py index eea3d79fb..4fc396318 100644 --- a/python/ray/reporter.py +++ b/python/ray/reporter.py @@ -6,7 +6,7 @@ import traceback import time import datetime import grpc -import socket +import platform import subprocess import sys from concurrent import futures @@ -92,7 +92,7 @@ class Reporter: """Initialize the reporter object.""" self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False)) self.ip = ray.services.get_node_ip_address() - self.hostname = socket.gethostname() + self.hostname = platform.node() _ = psutil.cpu_percent() # For initialization @@ -252,7 +252,7 @@ if __name__ == "__main__": args.redis_address, password=args.redis_password) traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The reporter on node {} failed with the following " - "error:\n{}".format(socket.gethostname(), traceback_str)) + "error:\n{}".format(platform.node(), traceback_str)) ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.REPORTER_DIED_ERROR, message) raise e diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py index b55c1a5fe..330c5adae 100644 --- a/python/ray/tune/trainable.py +++ b/python/ray/tune/trainable.py @@ -6,6 +6,7 @@ import logging import glob import os import pickle +import platform import pandas as pd from six import string_types import shutil @@ -308,7 +309,7 @@ class Trainable: time_this_iter_s=time_this_iter, time_total_s=self._time_total, pid=os.getpid(), - hostname=os.uname()[1], + hostname=platform.node(), node_ip=self._local_ip, config=self.config, time_since_restore=self._time_since_restore, diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index ca273c155..4231fc3f6 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -3,6 +3,7 @@ from collections import deque import copy from datetime import datetime import logging +import platform import shutil import uuid import time @@ -42,7 +43,7 @@ class Location: def __str__(self): if not self.pid: return "" - elif self.hostname == os.uname()[1]: + elif self.hostname == platform.node(): return "pid={}".format(self.pid) else: return "{}:{}".format(self.hostname, self.pid) diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 66d6374fc..69acba5d2 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -3,6 +3,7 @@ import numpy as np import gym import logging import pickle +import platform import os import ray @@ -891,7 +892,7 @@ class RolloutWorker(ParallelIteratorWorker): def get_host(self): """Returns the hostname of the process running this evaluator.""" - return os.uname()[1] + return platform.node() @DeveloperAPI def apply(self, func, *args): diff --git a/rllib/execution/replay_buffer.py b/rllib/execution/replay_buffer.py index 455a3c525..a1ecab230 100644 --- a/rllib/execution/replay_buffer.py +++ b/rllib/execution/replay_buffer.py @@ -1,7 +1,7 @@ import numpy as np import random -import os import collections +import platform import sys import ray @@ -343,7 +343,7 @@ class LocalReplayBuffer(ParallelIteratorWorker): return _local_replay_buffer def get_host(self): - return os.uname()[1] + return platform.node() def add_batch(self, batch): # Make a copy so the replay buffer doesn't pin plasma memory. diff --git a/rllib/execution/tree_agg.py b/rllib/execution/tree_agg.py index af5a22524..bb8442242 100644 --- a/rllib/execution/tree_agg.py +++ b/rllib/execution/tree_agg.py @@ -1,5 +1,5 @@ import logging -import os +import platform from typing import List import ray @@ -58,7 +58,7 @@ class Aggregator(ParallelIteratorWorker): super().__init__(generator, repeat=False) def get_host(self): - return os.uname()[1] + return platform.node() def set_weights(self, weights, global_vars): self.weights = weights diff --git a/rllib/optimizers/aso_tree_aggregator.py b/rllib/optimizers/aso_tree_aggregator.py index 95d3af54c..a3061172e 100644 --- a/rllib/optimizers/aso_tree_aggregator.py +++ b/rllib/optimizers/aso_tree_aggregator.py @@ -2,7 +2,7 @@ import collections import logging -import os +import platform import time import ray @@ -172,4 +172,4 @@ class AggregationWorker(AggregationWorkerBase): return result def get_host(self): - return os.uname()[1] + return platform.node() diff --git a/rllib/optimizers/async_replay_optimizer.py b/rllib/optimizers/async_replay_optimizer.py index 4106d31f7..b98c9c53c 100644 --- a/rllib/optimizers/async_replay_optimizer.py +++ b/rllib/optimizers/async_replay_optimizer.py @@ -5,7 +5,7 @@ https://arxiv.org/abs/1803.00933""" import collections import logging import numpy as np -import os +import platform import random from six.moves import queue import threading @@ -340,7 +340,7 @@ class LocalReplayBuffer(ParallelIteratorWorker): return _local_replay_buffer def get_host(self): - return os.uname()[1] + return platform.node() def add_batch(self, batch): # Make a copy so the replay buffer doesn't pin plasma memory. diff --git a/rllib/utils/actors.py b/rllib/utils/actors.py index 073267be3..4849cc8dd 100644 --- a/rllib/utils/actors.py +++ b/rllib/utils/actors.py @@ -1,5 +1,5 @@ import logging -import os +import platform import ray from collections import deque @@ -73,7 +73,7 @@ def drop_colocated(actors): def split_colocated(actors): - localhost = os.uname()[1] + localhost = platform.node() hosts = ray.get([a.get_host.remote() for a in actors]) local = [] non_local = []