From f6b84cb2f76188d0e77bc293d8662379609edd0e Mon Sep 17 00:00:00 2001
From: Sven Mika <sven@anyscale.io>
Date: Fri, 20 Nov 2020 08:59:43 +0100
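Subject: [PATCH] [RLlib] Fix offline logp vs prob bug in OffPolicyEstimator
 class. (#12158)

The affected OffPolicyEstimator method returned the computed
log-likelihoods as-is, although probabilities are expected. Convert the
log-likelihoods to numpy first, then apply np.exp() so that probabilities
are returned instead.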

---
 rllib/offline/off_policy_estimator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rllib/offline/off_policy_estimator.py b/rllib/offline/off_policy_estimator.py
index ac2a04ed6..43aa2cc20 100644
--- a/rllib/offline/off_policy_estimator.py
+++ b/rllib/offline/off_policy_estimator.py
@@ -70,7 +70,8 @@ class OffPolicyEstimator:
             state_batches=[batch[k] for k in state_keys],
             prev_action_batch=batch.data.get(SampleBatch.PREV_ACTIONS),
             prev_reward_batch=batch.data.get(SampleBatch.PREV_REWARDS))
-        return convert_to_numpy(log_likelihoods)
+        log_likelihoods = convert_to_numpy(log_likelihoods)
+        return np.exp(log_likelihoods)
 
     @DeveloperAPI
     def process(self, batch: SampleBatchType):