# Source code for parl.algorithms.paddle.a2c

#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import parl
import paddle
import paddle.nn.functional as F
from paddle.distribution import Categorical
from parl.utils.utils import check_model_method
import numpy as np

__all__ = ['A2C']

class A2C(parl.Algorithm):
    """Advantage Actor-Critic (A2C) algorithm on top of a PARL model.

    The wrapped model must expose ``policy``, ``value`` and
    ``policy_and_value`` methods. ``learn`` performs one optimization step
    on the combined policy-gradient, value-function and entropy loss.
    """

    def __init__(self, model, vf_loss_coeff=None):
        """Create the algorithm and its Adam optimizer.

        Args:
            model (parl.Model): forward network of policy and value.
            vf_loss_coeff (float): coefficient of the value function loss.
                Must be supplied; the ``None`` default is rejected.

        Raises:
            TypeError: if ``vf_loss_coeff`` is not an ``int`` or ``float``.
        """
        # The model has to provide every method this algorithm calls.
        check_model_method(model, 'value', self.__class__.__name__)
        check_model_method(model, 'policy', self.__class__.__name__)
        check_model_method(model, 'policy_and_value', self.__class__.__name__)

        # Explicit check instead of `assert`, so it is not stripped under
        # `python -O` and a missing coefficient fails here rather than deep
        # inside `learn`.
        if not isinstance(vf_loss_coeff, (int, float)):
            raise TypeError(
                'vf_loss_coeff must be an int or float, got {}'.format(
                    type(vf_loss_coeff).__name__))

        self.model = model
        self.vf_loss_coeff = vf_loss_coeff

        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=40.0)
        # The learning rate given here is only an initial value: `learn`
        # calls `set_lr` with the caller-provided rate before every step.
        self.optimizer = paddle.optimizer.Adam(
            learning_rate=0.001,
            parameters=self.model.parameters(),
            grad_clip=clip)

    def learn(self, obs, actions, advantages, target_values, learning_rate,
              entropy_coeff):
        """Run one gradient step on the A2C loss.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                E.g. [B, C, H, W] in atari.
            actions: An int64 tensor of shape [B].
            advantages: A float32 tensor of shape [B].
            target_values: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.

        Returns:
            tuple: ``(total_loss, pi_loss, vf_loss, entropy)``, each the
            sum-reduced tensor computed for this batch.
        """
        # shape: [B, act_dim]
        logits = self.model.policy(obs)

        act_dim = logits.shape[-1]
        actions_onehot = F.one_hot(actions, act_dim)
        # Select the log-probability of the taken action:
        # [B, act_dim] --> [B]
        actions_log_probs = paddle.sum(
            F.log_softmax(logits) * actions_onehot, axis=-1)

        # The policy gradient loss
        pi_loss = -1.0 * paddle.sum(actions_log_probs * advantages)

        # The value function loss
        # NOTE(review): assumes model.value returns shape [B], matching
        # target_values -- confirm against the model implementation.
        values = self.model.value(obs)
        delta = values - target_values
        vf_loss = 0.5 * paddle.sum(paddle.square(delta))

        # The entropy loss (we want to maximize entropy, so
        # entropy_coeff < 0). Categorical is used here only to compute the
        # entropy of the policy distribution.
        policy_distribution = Categorical(logits)
        policy_entropy = policy_distribution.entropy()
        entropy = paddle.sum(policy_entropy)

        total_loss = (
            pi_loss + vf_loss * self.vf_loss_coeff + entropy * entropy_coeff)

        # Apply the caller-scheduled learning rate, then update the model.
        self.optimizer.set_lr(learning_rate)
        total_loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()

        return total_loss, pi_loss, vf_loss, entropy

    def prob_and_value(self, obs):
        """Return action probabilities and state values for ``obs``.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                E.g. [B, C, H, W] in atari.

        Returns:
            tuple: ``(probs, values)`` where ``probs`` is the softmax of the
            policy logits and ``values`` is the value output of
            ``model.policy_and_value``.
        """
        logits, values = self.model.policy_and_value(obs)
        probs = F.softmax(logits)
        return probs, values

    def predict(self, obs):
        """Return the greedy (highest-probability) action for ``obs``.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                E.g. [B, C, H, W] in atari.

        Returns:
            Tensor of action indices, the argmax of the policy
            probabilities along axis 1.
        """
        logits = self.model.policy(obs)
        probs = F.softmax(logits)
        predict_actions = paddle.argmax(probs, 1)
        return predict_actions

    def value(self, obs):
        """Return the model's value estimate for ``obs``.

        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                E.g. [B, C, H, W] in atari.

        Returns:
            The output of ``model.value(obs)``.
        """
        values = self.model.value(obs)
        return values