123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- from gym.spaces import Box
- from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
- from ray.rllib.models.tf.tf_modelv2 import TFModelV2
- from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as \
- TorchFullyConnectedNetwork
- from ray.rllib.models.torch.misc import SlimFC
- from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
- from ray.rllib.utils.framework import try_import_tf, try_import_torch
# Resolve the deep-learning frameworks lazily via RLlib's try_import_*
# helpers so this example can at least be imported when one of the two
# frameworks is not installed.
tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()
# __sphinx_doc_model_api_1_begin__
class DuelingQModel(TFModelV2):  # or: TorchModelV2
    """A simple, hard-coded dueling head model."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # Passing num_outputs=None into the super constructor prevents an
        # action/logits output layer from being built automatically.
        # (An alternative would be num_outputs=[last layer size of
        # config[model][fcnet_hiddens]] plus no_last_linear=True, but then
        # users of this class would have to be told that num_outputs is NOT
        # the size of the Q-output layer, which is more confusing.)
        super(DuelingQModel, self).__init__(obs_space, action_space, None,
                                            model_config, name)
        # After the super call, self.num_outputs holds the size of the
        # wrapped model's last layer; that feeds both dueling heads (see
        # the torch SlimFC variants in the comments below).

        # Advantage head: one output per action.
        self.A = tf.keras.layers.Dense(num_outputs)
        # torch:
        # self.A = SlimFC(
        #     in_size=self.num_outputs, out_size=num_outputs)

        # Value head: a single state-value output.
        self.V = tf.keras.layers.Dense(1)
        # torch:
        # self.V = SlimFC(in_size=self.num_outputs, out_size=1)

    def get_q_values(self, underlying_output):
        """Combine value- and advantage heads into dueling Q-values."""
        state_value = self.V(underlying_output)
        advantages = self.A(underlying_output)
        # Center the advantages by subtracting their mean over the action
        # axis; keepdims=True retains the axis so broadcasting works.
        centered_adv = advantages - tf.reduce_mean(
            advantages, 1, keepdims=True)
        # Dueling aggregation: Q = V + (A - mean(A)).
        return state_value + centered_adv
# __sphinx_doc_model_api_1_end__
class TorchDuelingQModel(TorchModelV2):
    """A simple, hard-coded dueling head model."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # nn.Module must be initialized before the TorchModelV2 super call.
        nn.Module.__init__(self)
        # Passing num_outputs=None into the super constructor prevents an
        # action/logits output layer from being built automatically.
        # (An alternative would be num_outputs=[last layer size of
        # config[model][fcnet_hiddens]] plus no_last_linear=True, but then
        # users of this class would have to be told that num_outputs is NOT
        # the size of the Q-output layer, which is more confusing.)
        super(TorchDuelingQModel, self).__init__(obs_space, action_space,
                                                 None, model_config, name)
        # After the super call, self.num_outputs holds the size of the
        # wrapped model's last layer; that feeds both dueling heads.

        # Advantage head: one output per action.
        self.A = SlimFC(in_size=self.num_outputs, out_size=num_outputs)
        # Value head: a single state-value output.
        self.V = SlimFC(in_size=self.num_outputs, out_size=1)

    def get_q_values(self, underlying_output):
        """Combine value- and advantage heads into dueling Q-values."""
        state_value = self.V(underlying_output)
        advantages = self.A(underlying_output)
        # Center the advantages by subtracting their mean over the action
        # axis; keepdim=True retains the axis so broadcasting works.
        centered_adv = advantages - torch.mean(advantages, dim=1,
                                               keepdim=True)
        # Dueling aggregation: Q = V + (A - mean(A)).
        return state_value + centered_adv
class ContActionQModel(TFModelV2):
    """A simple, q-value-from-cont-action model (for e.g. SAC type algos)."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # Passing num_outputs=None into the super constructor prevents an
        # action/logits output layer from being built automatically.
        # (An alternative would be num_outputs=[last layer size of
        # config[model][fcnet_hiddens]] plus no_last_linear=True, but then
        # users of this class would have to be told that num_outputs is NOT
        # the size of the Q-output layer, which is more confusing.)
        super(ContActionQModel, self).__init__(obs_space, action_space, None,
                                               model_config, name)
        # After the super call, self.num_outputs holds the size of the
        # wrapped model's last layer; the Q-head consumes that
        # concatenated with the (flat) action vector.
        concat_size = self.num_outputs + action_space.shape[0]
        combined_space = Box(-1.0, 1.0, (concat_size, ))
        # Nest an RLlib FullyConnectedNetwork (tf here) to compute the
        # single Q-value from the combined input.
        self.q_head = FullyConnectedNetwork(combined_space, action_space, 1,
                                            model_config, "q_head")

        # Missing here: Probably still have to provide action output layer
        # and value layer and make sure self.num_outputs is correctly set.

    def get_single_q_value(self, underlying_output, action):
        """Return the Q-value for the given features/action pair."""
        # Concatenate the wrapped model's output with the action and feed
        # it through the nested Q-head as its "obs" (ModelV2s take an
        # input_dict); state outputs are ignored.
        combined_input = tf.concat([underlying_output, action], axis=-1)
        q_values, _ = self.q_head({"obs": combined_input})
        return q_values
# __sphinx_doc_model_api_2_begin__
class TorchContActionQModel(TorchModelV2):
    """A simple, q-value-from-cont-action model (for e.g. SAC type algos)."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # nn.Module must be initialized before the TorchModelV2 super call.
        nn.Module.__init__(self)
        # Passing num_outputs=None into the super constructor prevents an
        # action/logits output layer from being built automatically.
        # (An alternative would be num_outputs=[last layer size of
        # config[model][fcnet_hiddens]] plus no_last_linear=True, but then
        # users of this class would have to be told that num_outputs is NOT
        # the size of the Q-output layer, which is more confusing.)
        super(TorchContActionQModel, self).__init__(obs_space, action_space,
                                                    None, model_config, name)
        # After the super call, self.num_outputs holds the size of the
        # wrapped model's last layer; the Q-head consumes that
        # concatenated with the (flat) action vector.
        concat_size = self.num_outputs + action_space.shape[0]
        combined_space = Box(-1.0, 1.0, (concat_size, ))
        # Nest an RLlib FullyConnectedNetwork (torch here) to compute the
        # single Q-value from the combined input.
        self.q_head = TorchFullyConnectedNetwork(combined_space, action_space,
                                                 1, model_config, "q_head")

        # Missing here: Probably still have to provide action output layer
        # and value layer and make sure self.num_outputs is correctly set.

    def get_single_q_value(self, underlying_output, action):
        """Return the Q-value for the given features/action pair."""
        # Concatenate the wrapped model's output with the action and feed
        # it through the nested Q-head as its "obs" (ModelV2s take an
        # input_dict); state outputs are ignored.
        combined_input = torch.cat([underlying_output, action], dim=-1)
        q_values, _ = self.q_head({"obs": combined_input})
        return q_values
# __sphinx_doc_model_api_2_end__
|