diff --git a/__pycache__/model.cpython-38.pyc b/__pycache__/model.cpython-38.pyc new file mode 100644 index 0000000..96b075c Binary files /dev/null and b/__pycache__/model.cpython-38.pyc differ diff --git a/__pycache__/replay_memory.cpython-38.pyc b/__pycache__/replay_memory.cpython-38.pyc new file mode 100644 index 0000000..4b28683 Binary files /dev/null and b/__pycache__/replay_memory.cpython-38.pyc differ diff --git a/__pycache__/sac.cpython-38.pyc b/__pycache__/sac.cpython-38.pyc new file mode 100644 index 0000000..659207e Binary files /dev/null and b/__pycache__/sac.cpython-38.pyc differ diff --git a/__pycache__/utils.cpython-38.pyc b/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000..1bf2124 Binary files /dev/null and b/__pycache__/utils.cpython-38.pyc differ diff --git a/main.py b/main.py index a3047c5..562e3d9 100644 --- a/main.py +++ b/main.py @@ -9,7 +9,7 @@ from tensorboardX import SummaryWriter from replay_memory import ReplayMemory parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args') -parser.add_argument('--env-name', default="HalfCheetah-v2", +parser.add_argument('--env-name', default="HalfCheetahBulletEnv-v0", help='Mujoco Gym environment (default: HalfCheetah-v2)') parser.add_argument('--policy', default="Gaussian", help='Policy Type: Gaussian | Deterministic (default: Gaussian)') diff --git a/runs/2020-06-06_00-16-03_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382763.pranjal-Lenovo-ideapad-730S-13IWL b/runs/2020-06-06_00-16-03_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382763.pranjal-Lenovo-ideapad-730S-13IWL new file mode 100644 index 0000000..e69de29 diff --git a/runs/2020-06-06_00-16-31_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382791.pranjal-Lenovo-ideapad-730S-13IWL b/runs/2020-06-06_00-16-31_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382791.pranjal-Lenovo-ideapad-730S-13IWL new file mode 100644 index 0000000..e69de29 diff --git a/runs/2020-06-06_00-17-41_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382861.pranjal-Lenovo-ideapad-730S-13IWL b/runs/2020-06-06_00-17-41_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382861.pranjal-Lenovo-ideapad-730S-13IWL new file mode 100644 index 0000000..c331439 Binary files /dev/null and b/runs/2020-06-06_00-17-41_SAC_HalfCheetahBulletEnv-v0_Gaussian_/events.out.tfevents.1591382861.pranjal-Lenovo-ideapad-730S-13IWL differ diff --git a/sac.py b/sac.py index 0ff0b45..aef23f0 100644 --- a/sac.py +++ b/sac.py @@ -67,6 +67,14 @@ class SAC(object): qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] + qf_loss = qf1_loss + qf2_loss + + self.critic_optim.zero_grad() + qf_loss.backward() + self.critic_optim.step() + + for c_param in self.critic.parameters(): + c_param.requires_grad = False pi, log_pi, _ = self.policy.sample(state_batch) @@ -75,18 +83,13 @@ class SAC(object): policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] - self.critic_optim.zero_grad() - qf1_loss.backward() - self.critic_optim.step() - - self.critic_optim.zero_grad() - qf2_loss.backward() - self.critic_optim.step() - self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() + for c_param in self.critic.parameters(): + c_param.requires_grad = True + if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()