  1. """Tensorflow model for DQN"""
  2. from typing import List
  3. import gym
  4. from ray.rllib.models.tf.layers import NoisyLayer
  5. from ray.rllib.models.tf.tf_modelv2 import TFModelV2
  6. from ray.rllib.utils.framework import try_import_tf
  7. from ray.rllib.utils.typing import ModelConfigDict, TensorType
  8. tf1, tf, tfv = try_import_tf()
class DistributionalQTFModel(TFModelV2):
    """Extension of the standard TFModelV2 to provide distributional Q values.

    It also supports options for noisy nets and parameter space noise.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_q_value_distributions() -> Q(s, a) atoms
        model_out -> get_state_value() -> V(s)

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass.
    """
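
    # Illustrative call pattern for a completed subclass, mirroring the data
    # flow above (editorial sketch, not from the original source; `my_model`,
    # `input_dict`, `state` and `seq_lens` are hypothetical):
    #
    #   model_out, state = my_model.forward(input_dict, state, seq_lens)
    #   q_out = my_model.get_q_value_distributions(model_out)
    #   v_out = my_model.get_state_value(model_out)
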
    def __init__(
            self,
            obs_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            num_outputs: int,
            model_config: ModelConfigDict,
            name: str,
            q_hiddens=(256, ),
            dueling: bool = False,
            num_atoms: int = 1,
            use_noisy: bool = False,
            v_min: float = -10.0,
            v_max: float = 10.0,
            sigma0: float = 0.5,
            # TODO(sven): Move `add_layer_norm` into ModelCatalog as a
            # generic option, then error if we use ParameterNoise as the
            # Exploration type and do not have any LayerNorm layers in
            # the net.
            add_layer_norm: bool = False):
  37. """Initialize variables of this model.
  38. Extra model kwargs:
  39. q_hiddens (List[int]): List of layer-sizes after(!) the
  40. Advantages(A)/Value(V)-split. Hence, each of the A- and V-
  41. branches will have this structure of Dense layers. To define
  42. the NN before this A/V-split, use - as always -
  43. config["model"]["fcnet_hiddens"].
  44. dueling (bool): Whether to build the advantage(A)/value(V) heads
  45. for DDQN. If True, Q-values are calculated as:
  46. Q = (A - mean[A]) + V. If False, raw NN output is interpreted
  47. as Q-values.
  48. num_atoms (int): If >1, enables distributional DQN.
  49. use_noisy (bool): Use noisy nets.
  50. v_min (float): Min value support for distributional DQN.
  51. v_max (float): Max value support for distributional DQN.
  52. sigma0 (float): Initial value of noisy layers.
  53. add_layer_norm (bool): Enable layer norm (for param noise).
  54. Note that the core layers for forward() are not defined here, this
  55. only defines the layers for the Q head. Those layers for forward()
  56. should be defined in subclasses of DistributionalQModel.
  57. """
        super(DistributionalQTFModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)

        # Set up the Q-head input (i.e., the embedding that
        # get_q_value_distributions() and get_state_value() consume).
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")

        def build_action_value(prefix: str,
                               model_out: TensorType) -> List[TensorType]:
            if q_hiddens:
                action_out = model_out
                for i in range(len(q_hiddens)):
                    if use_noisy:
                        action_out = NoisyLayer(
                            "{}hidden_{}".format(prefix, i), q_hiddens[i],
                            sigma0)(action_out)
                    elif add_layer_norm:
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu)(action_out)
                        action_out = \
                            tf.keras.layers.LayerNormalization()(action_out)
                    else:
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu,
                            name="hidden_%d" % i)(action_out)
            else:
                # Avoid postprocessing the outputs. This enables custom models
                # to be used for parametric action DQN.
                action_out = model_out

            if use_noisy:
                action_scores = NoisyLayer(
                    "{}output".format(prefix),
                    self.action_space.n * num_atoms,
                    sigma0,
                    activation=None)(action_out)
            elif q_hiddens:
                action_scores = tf.keras.layers.Dense(
                    units=self.action_space.n * num_atoms,
                    activation=None)(action_out)
            else:
                action_scores = model_out

            if num_atoms > 1:
                # Distributional Q-learning uses a discrete support z
                # to represent the action value distribution.
                z = tf.range(num_atoms, dtype=tf.float32)
                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
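
                # Illustrative example (editorial, not in the original
                # source): with the common C51 setting num_atoms=51,
                # v_min=-10.0, v_max=10.0, the support becomes
                # z = [-10.0, -9.6, ..., 9.6, 10.0], i.e. 51 atoms spaced
                # (v_max - v_min) / (num_atoms - 1) = 0.4 apart.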

                def _layer(x):
                    support_logits_per_action = tf.reshape(
                        tensor=x, shape=(-1, self.action_space.n, num_atoms))
                    support_prob_per_action = tf.nn.softmax(
                        logits=support_logits_per_action)
                    # Expected Q-value per action: sum over the support z,
                    # weighted by the softmax probabilities of the atoms.
                    x = tf.reduce_sum(
                        input_tensor=z * support_prob_per_action, axis=-1)
                    logits = support_logits_per_action
                    dist = support_prob_per_action
                    return [x, z, support_logits_per_action, logits, dist]

                return tf.keras.layers.Lambda(_layer)(action_scores)
            else:
                logits = tf.expand_dims(tf.ones_like(action_scores), -1)
                dist = tf.expand_dims(tf.ones_like(action_scores), -1)
                return [action_scores, logits, dist]

        def build_state_score(prefix: str,
                              model_out: TensorType) -> TensorType:
            state_out = model_out
            for i in range(len(q_hiddens)):
                if use_noisy:
                    state_out = NoisyLayer(
                        "{}dueling_hidden_{}".format(prefix, i), q_hiddens[i],
                        sigma0)(state_out)
                else:
                    state_out = tf.keras.layers.Dense(
                        units=q_hiddens[i], activation=tf.nn.relu)(state_out)
                    if add_layer_norm:
                        state_out = tf.keras.layers.LayerNormalization()(
                            state_out)
            if use_noisy:
                state_score = NoisyLayer(
                    "{}dueling_output".format(prefix),
                    num_atoms,
                    sigma0,
                    activation=None)(state_out)
            else:
                state_score = tf.keras.layers.Dense(
                    units=num_atoms, activation=None)(state_out)
            return state_score

        q_out = build_action_value(name + "/action_value/", self.model_out)
        self.q_value_head = tf.keras.Model(self.model_out, q_out)

        if dueling:
            state_out = build_state_score(name + "/state_value/",
                                          self.model_out)
            self.state_value_head = tf.keras.Model(self.model_out, state_out)
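
        # Editorial note (not in the original source): this model only builds
        # the two heads. With dueling=True, the combination
        # Q = (A - mean[A]) + V from the docstring above is expected to be
        # performed downstream (in RLlib, by the DQN policy), not here.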

    def get_q_value_distributions(self,
                                  model_out: TensorType) -> List[TensorType]:
        """Returns distributional values for Q(s, a) given a state embedding.

        Override this in your custom model to customize the Q output head.

        Args:
            model_out (Tensor): Embedding from the model layers.

        Returns:
            (action_scores, logits, dist) if num_atoms == 1, otherwise
            (action_scores, z, support_logits_per_action, logits, dist).
        """
        return self.q_value_head(model_out)

    def get_state_value(self, model_out: TensorType) -> TensorType:
        """Returns the state value prediction for the given state embedding."""
        return self.state_value_head(model_out)
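

# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the original module). It exercises the
# Q head end to end, assuming Ray RLlib and TensorFlow 2.x are installed. The
# subclass `MyQModel`, the space sizes, and all tensor values are hypothetical
# illustrations, not RLlib-prescribed settings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class MyQModel(DistributionalQTFModel):
        """Minimal subclass: a single Dense layer as the core forward() net."""

        def __init__(self, obs_space, action_space, num_outputs, model_config,
                     name, **kwargs):
            super(MyQModel, self).__init__(obs_space, action_space,
                                           num_outputs, model_config, name,
                                           **kwargs)
            self.base = tf.keras.layers.Dense(
                num_outputs, activation=tf.nn.relu, name="base")

        def forward(self, input_dict, state, seq_lens):
            # obs -> model_out (the embedding consumed by the Q/V heads).
            return self.base(input_dict["obs"]), state

    model = MyQModel(
        obs_space=gym.spaces.Box(low=-1.0, high=1.0, shape=(4, )),
        action_space=gym.spaces.Discrete(2),
        num_outputs=16,
        model_config={},
        name="sketch",
        dueling=True,
        num_atoms=51)

    model_out, _ = model.forward(
        {"obs": tf.random.uniform((1, 4))}, [], None)
    # With num_atoms > 1, five tensors come back (see docstring above).
    q, z, support_logits, logits, dist = model.get_q_value_distributions(
        model_out)
    v = model.get_state_value(model_out)
    print("Q:", q.shape, "V:", v.shape)  # -> Q: (1, 2) V: (1, 51)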