[rllib] Use 64-byte aligned memory when concatenating arrays (#4408)

2026-06-28 02:01:24 +08:00 · 2019-03-25 23:56:51 -07:00
parent c68eea6134
commit 8ee240f40e
4 changed files with 59 additions and 3 deletions
@@ -38,6 +38,10 @@ def collect_episodes(local_evaluator,
    collected, _ = ray.wait(
        pending, num_returns=len(pending), timeout=timeout_seconds * 1.0)
    num_metric_batches_dropped = len(pending) - len(collected)
+    if pending and len(collected) == 0:
+        raise ValueError(
+            "Timed out waiting for metrics from workers. You can configure "
+            "this timeout with `collect_metrics_timeout`.")

    metric_lists = ray.get(collected)
    metric_lists.append(local_evaluator.get_metrics())
@@ -7,6 +7,7 @@ import collections
 import numpy as np

 from ray.rllib.utils.annotations import PublicAPI
+from ray.rllib.utils.memory import concat_aligned

 # Defaults policy id for single agent environments
 DEFAULT_POLICY_ID = "default"
@@ -104,7 +105,7 @@ class SampleBatch(object):
        out = {}
        samples = [s for s in samples if s.count > 0]
        for k in samples[0].keys():
-            out[k] = np.concatenate([s[k] for s in samples])
+            out[k] = concat_aligned([s[k] for s in samples])
        return SampleBatch(out)

    @PublicAPI
@@ -121,7 +122,7 @@ class SampleBatch(object):
        assert self.keys() == other.keys(), "must have same columns"
        out = {}
        for k in self.keys():
-            out[k] = np.concatenate([self[k], other[k]])
+            out[k] = concat_aligned([self[k], other[k]])
        return SampleBatch(out)

    @PublicAPI
@@ -0,0 +1,51 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def aligned_array(size, dtype, align=64):
+    """Returns an uninitialized 1-D array whose data is `align`-byte aligned.
+
+    The returned array can be efficiently copied into GPU memory by TensorFlow.
+
+    Arguments:
+        size (int): Number of elements in the returned array.
+        dtype: Element type. Anything accepted by `np.dtype()` works, e.g.
+            `np.dtype("float32")`, `np.float32` or `"float32"`.
+        align (int): Required alignment of the data pointer, in bytes.
+
+    Returns:
+        np.ndarray: Array of length `size` and the given dtype. It is a view
+            into a slightly larger uint8 buffer, so its contents are
+            arbitrary until written.
+
+    Raises:
+        ValueError: If `align` is not positive.
+    """
+
+    if align <= 0:
+        raise ValueError(
+            "align must be a positive integer, got {}".format(align))
+
+    # Normalize so dtype-like values such as np.float32 or "float32" are
+    # accepted, not only np.dtype instances.
+    dtype = np.dtype(dtype)
+    n = size * dtype.itemsize
+
+    # Over-allocate by (align - 1) bytes so that an aligned start address
+    # always exists inside the buffer, then slice from that address.
+    empty = np.empty(n + (align - 1), dtype=np.uint8)
+    data_align = empty.ctypes.data % align
+    offset = 0 if data_align == 0 else (align - data_align)
+    output = empty[offset:offset + n].view(dtype)
+
+    # Internal sanity checks on the slicing arithmetic above.
+    assert len(output) == size, len(output)
+    assert output.ctypes.data % align == 0, output.ctypes.data
+    return output
+
+
+def concat_aligned(items):
+    """Concatenate arrays along axis 0, 64-byte aligning the output.
+
+    Only float32, float64 and uint8 ndarrays are aligned, and only when every
+    item is an ndarray with the same dtype and matching trailing dimensions.
+    Anything else is concatenated as normal with np.concatenate().
+
+    This should be used instead of np.concatenate() to improve performance
+    when the output array is likely to be fed into TensorFlow.
+
+    Arguments:
+        items (list): Arrays to concatenate along their first axis.
+
+    Returns:
+        `[]` if `items` is empty, the single item itself (not copied) if
+        there is exactly one, otherwise a new concatenated array.
+    """
+
+    if len(items) == 0:
+        return []
+    elif len(items) == 1:
+        # we assume the input is aligned. In any case, it doesn't help
+        # performance to force align it since that incurs a needless copy.
+        return items[0]
+
+    first = items[0]
+    # Writing through `out=` casts every input to the output dtype, so a
+    # mixed float32/float64 list would be narrowed to the first item's
+    # dtype. Require homogeneous, shape-compatible ndarrays here and let
+    # np.concatenate() handle (or reject) everything else.
+    can_align = (
+        isinstance(first, np.ndarray) and first.ndim > 0
+        and first.dtype in [np.float32, np.float64, np.uint8]
+        and all(
+            isinstance(s, np.ndarray) and s.dtype == first.dtype
+            and s.ndim > 0 and s.shape[1:] == first.shape[1:]
+            for s in items))
+    if not can_align:
+        return np.concatenate(items)
+
+    # Allocate one flat aligned buffer, give it the final shape, and let
+    # numpy copy the pieces straight into it.
+    flat = aligned_array(sum(s.size for s in items), first.dtype)
+    batch_dim = sum(s.shape[0] for s in items)
+    new_shape = (batch_dim, ) + first.shape[1:]
+    output = flat.reshape(new_shape)
+    assert output.ctypes.data % 64 == 0, output.ctypes.data
+    np.concatenate(items, out=output)
+    return output