# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
# with 32 workers and 8 envs per worker. IMPALA, when ran with
# similar configurations, solved Pong in 10-12 minutes.
# APPO can also solve Pong in 2.5 million timesteps, which is
# 2x more efficient than that of IMPALA.
pong-appo:
    env: PongNoFrameskip-v4
    run: APPO
    stop:
        episode_reward_mean: 18.0
        timesteps_total: 5000000
    config:
        # Works for both torch and tf.
        framework: tf
        vtrace: True
        use_kl_loss: False
        rollout_fragment_length: 50
        train_batch_size: 750
        num_workers: 32
        broadcast_interval: 1
        max_sample_requests_in_flight_per_worker: 1
        num_multi_gpu_tower_stacks: 1
        num_envs_per_worker: 8
        minibatch_buffer_size: 4
        num_sgd_iter: 2
        vf_loss_coeff: 1.0
        clip_param: 0.3
        num_gpus: 1
        grad_clip: 10
        model:
          dim: 42