Change os.uname()[1] and socket.gethostname() to the portable and faster platform.node() (#8839)

Co-authored-by: Mehrdad <noreply@github.com>
This commit is contained in:
mehrdadn
2020-06-08 21:29:46 -07:00
committed by GitHub
parent d2ef29f0d2
commit f93bb008bb
15 changed files with 31 additions and 27 deletions
+2 -2
View File
@@ -7,10 +7,10 @@ import ray
@ray.remote
def gethostname(x):
import platform
import time
import socket
time.sleep(0.01)
return x + (socket.gethostname(), )
return x + (platform.node(), )
def wait_for_nodes(expected):
+2 -2
View File
@@ -211,7 +211,7 @@ program:
.. code-block:: python
from collections import Counter
import socket
import platform
import time
import ray
@@ -220,7 +220,7 @@ program:
@ray.remote
def f(x):
time.sleep(0.01)
return x + (socket.gethostname(), )
return x + (platform.node(), )
# Check that objects can be transferred from each node to each other node.
%time Counter(ray.get([f.remote(f.remote(())) for _ in range(100)]))
+2 -2
View File
@@ -6,10 +6,10 @@ import ray
@ray.remote
def gethostname(x):
import platform
import time
import socket
time.sleep(0.01)
return x + (socket.gethostname(), )
return x + (platform.node(), )
def wait_for_nodes(expected):
+2 -2
View File
@@ -12,7 +12,7 @@ import errno
import json
import logging
import os
import socket
import platform
import threading
import time
import traceback
@@ -965,7 +965,7 @@ if __name__ == "__main__":
args.redis_address, password=args.redis_password)
traceback_str = ray.utils.format_error_message(traceback.format_exc())
message = ("The dashboard on node {} failed with the following "
"error:\n{}".format(socket.gethostname(), traceback_str))
"error:\n{}".format(platform.node(), traceback_str))
ray.utils.push_error_to_driver_through_redis(
redis_client, ray_constants.DASHBOARD_DIED_ERROR, message)
if isinstance(e, OSError) and e.errno == errno.ENOENT:
+2 -2
View File
@@ -4,8 +4,8 @@ import glob
import json
import logging
import os
import platform
import shutil
import socket
import time
import traceback
@@ -302,7 +302,7 @@ if __name__ == "__main__":
args.redis_address, password=args.redis_password)
traceback_str = ray.utils.format_error_message(traceback.format_exc())
message = ("The log monitor on node {} failed with the following "
"error:\n{}".format(socket.gethostname(), traceback_str))
"error:\n{}".format(platform.node(), traceback_str))
ray.utils.push_error_to_driver_through_redis(
redis_client, ray_constants.LOG_MONITOR_DIED_ERROR, message)
raise e
+2 -1
View File
@@ -1,5 +1,6 @@
import logging
import os
import platform
import sys
import time
@@ -47,7 +48,7 @@ class RayOutOfMemoryError(Exception):
" ".join(cmdline)[:100].strip())
return ("More than {}% of the memory on ".format(int(
100 * threshold)) + "node {} is used ({} / {} GB). ".format(
os.uname()[1], round(used_gb, 2), round(total_gb, 2)) +
platform.node(), round(used_gb, 2), round(total_gb, 2)) +
"The top 10 memory consumers are:\n\n{}".format(proc_str) +
"\n\nIn addition, up to {} GiB of shared memory is ".format(
round(get_shared(psutil.virtual_memory()) / (1024**3), 2))
+3 -3
View File
@@ -6,7 +6,7 @@ import traceback
import time
import datetime
import grpc
import socket
import platform
import subprocess
import sys
from concurrent import futures
@@ -92,7 +92,7 @@ class Reporter:
"""Initialize the reporter object."""
self.cpu_counts = (psutil.cpu_count(), psutil.cpu_count(logical=False))
self.ip = ray.services.get_node_ip_address()
self.hostname = socket.gethostname()
self.hostname = platform.node()
_ = psutil.cpu_percent() # For initialization
@@ -252,7 +252,7 @@ if __name__ == "__main__":
args.redis_address, password=args.redis_password)
traceback_str = ray.utils.format_error_message(traceback.format_exc())
message = ("The reporter on node {} failed with the following "
"error:\n{}".format(socket.gethostname(), traceback_str))
"error:\n{}".format(platform.node(), traceback_str))
ray.utils.push_error_to_driver_through_redis(
redis_client, ray_constants.REPORTER_DIED_ERROR, message)
raise e
+2 -1
View File
@@ -6,6 +6,7 @@ import logging
import glob
import os
import pickle
import platform
import pandas as pd
from six import string_types
import shutil
@@ -308,7 +309,7 @@ class Trainable:
time_this_iter_s=time_this_iter,
time_total_s=self._time_total,
pid=os.getpid(),
hostname=os.uname()[1],
hostname=platform.node(),
node_ip=self._local_ip,
config=self.config,
time_since_restore=self._time_since_restore,
+2 -1
View File
@@ -3,6 +3,7 @@ from collections import deque
import copy
from datetime import datetime
import logging
import platform
import shutil
import uuid
import time
@@ -42,7 +43,7 @@ class Location:
def __str__(self):
if not self.pid:
return ""
elif self.hostname == os.uname()[1]:
elif self.hostname == platform.node():
return "pid={}".format(self.pid)
else:
return "{}:{}".format(self.hostname, self.pid)
+2 -1
View File
@@ -3,6 +3,7 @@ import numpy as np
import gym
import logging
import pickle
import platform
import os
import ray
@@ -891,7 +892,7 @@ class RolloutWorker(ParallelIteratorWorker):
def get_host(self):
"""Returns the hostname of the process running this evaluator."""
return os.uname()[1]
return platform.node()
@DeveloperAPI
def apply(self, func, *args):
+2 -2
View File
@@ -1,7 +1,7 @@
import numpy as np
import random
import os
import collections
import platform
import sys
import ray
@@ -343,7 +343,7 @@ class LocalReplayBuffer(ParallelIteratorWorker):
return _local_replay_buffer
def get_host(self):
return os.uname()[1]
return platform.node()
def add_batch(self, batch):
# Make a copy so the replay buffer doesn't pin plasma memory.
+2 -2
View File
@@ -1,5 +1,5 @@
import logging
import os
import platform
from typing import List
import ray
@@ -58,7 +58,7 @@ class Aggregator(ParallelIteratorWorker):
super().__init__(generator, repeat=False)
def get_host(self):
return os.uname()[1]
return platform.node()
def set_weights(self, weights, global_vars):
self.weights = weights
+2 -2
View File
@@ -2,7 +2,7 @@
import collections
import logging
import os
import platform
import time
import ray
@@ -172,4 +172,4 @@ class AggregationWorker(AggregationWorkerBase):
return result
def get_host(self):
return os.uname()[1]
return platform.node()
+2 -2
View File
@@ -5,7 +5,7 @@ https://arxiv.org/abs/1803.00933"""
import collections
import logging
import numpy as np
import os
import platform
import random
from six.moves import queue
import threading
@@ -340,7 +340,7 @@ class LocalReplayBuffer(ParallelIteratorWorker):
return _local_replay_buffer
def get_host(self):
return os.uname()[1]
return platform.node()
def add_batch(self, batch):
# Make a copy so the replay buffer doesn't pin plasma memory.
+2 -2
View File
@@ -1,5 +1,5 @@
import logging
import os
import platform
import ray
from collections import deque
@@ -73,7 +73,7 @@ def drop_colocated(actors):
def split_colocated(actors):
localhost = os.uname()[1]
localhost = platform.node()
hosts = ray.get([a.get_host.remote() for a in actors])
local = []
non_local = []