# flake8: noqa

# __create-algo-checkpoint-begin__
# Create a PPO algorithm object using a config object ..
from ray.rllib.algorithms.ppo import PPOConfig

my_ppo_config = PPOConfig().environment("CartPole-v1")
my_ppo = my_ppo_config.build()

# .. train one iteration ..
my_ppo.train()
# .. and call `save()` to create a checkpoint.
save_result = my_ppo.save()

path_to_checkpoint = save_result.checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'."
)

# Let's terminate the algo for demonstration purposes.
my_ppo.stop()
# Doing this will lead to an error.
# my_ppo.train()
# __create-algo-checkpoint-end__
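# Quick, hedged sanity check (not part of the docs snippet above): list the
# checkpoint directory's contents. The exact file layout inside an Algorithm
# checkpoint varies across Ray versions, so we only print it here.
import os

print(os.listdir(path_to_checkpoint))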

# __restore-from-algo-checkpoint-begin__
from ray.rllib.algorithms.algorithm import Algorithm

# Use the Algorithm's `from_checkpoint` utility to get a new algo instance
# that has the exact same state as the old one, from which the checkpoint was
# created in the first place:
my_new_ppo = Algorithm.from_checkpoint(path_to_checkpoint)

# Continue training.
my_new_ppo.train()
# __restore-from-algo-checkpoint-end__

my_new_ppo.stop()
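
# Hedged note (an assumption, not part of the docs snippet): depending on
# your Ray version, `from_checkpoint` may also accept the in-memory
# `Checkpoint` object that `save()` returned, e.g.:
# my_other_ppo = Algorithm.from_checkpoint(save_result.checkpoint)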

# __restore-from-algo-checkpoint-2-begin__
# Re-build a fresh algorithm.
my_new_ppo = my_ppo_config.build()
# Restore the old (checkpointed) state.
my_new_ppo.restore(save_result)
# Continue training.
my_new_ppo.train()
# __restore-from-algo-checkpoint-2-end__

my_new_ppo.stop()
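
# Hedged alternative (an assumption, not shown in the docs snippet):
# `restore()` typically also accepts the checkpoint directory path directly:
# my_new_ppo.restore(path_to_checkpoint)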

# __multi-agent-checkpoints-begin__
import os

# Use our example multi-agent CartPole environment to train in.
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole

# Set up a multi-agent Algorithm, training two policies independently.
my_ma_config = PPOConfig().multi_agent(
    # Which policies should RLlib create and train?
    policies={"pol1", "pol2"},
    # Let RLlib know, which agents in the environment (MultiAgentCartPole
    # uses the integer agent IDs 0 and 1) map to which policies.
    policy_mapping_fn=(
        lambda agent_id, episode, worker, **kw: (
            "pol1" if agent_id == 0 else "pol2"
        )
    ),
    # Setting these is not necessary. All policies will always be trained by
    # default. However, since we do provide a list of IDs here, we need to
    # remain in charge of changing this `policies_to_train` list, should we
    # ever alter the Algorithm (e.g. remove one of the policies or add a new
    # one).
    policies_to_train=["pol1", "pol2"],  # Again, `None` would be totally fine here.
)

# Add the MultiAgentCartPole env to our config and build our Algorithm.
my_ma_config.environment(
    MultiAgentCartPole,
    env_config={
        "num_agents": 2,
    },
)

my_ma_algo = my_ma_config.build()

my_ma_algo.train()

ma_checkpoint_dir = my_ma_algo.save().checkpoint.path

print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{ma_checkpoint_dir}'.\n"
    "Individual Policy checkpoints can be found in "
    f"'{os.path.join(ma_checkpoint_dir, 'policies')}'."
)

# Create a new Algorithm instance from the above checkpoint, just as you would
# for a single-agent setup:
my_ma_algo_clone = Algorithm.from_checkpoint(ma_checkpoint_dir)
# __multi-agent-checkpoints-end__

my_ma_algo_clone.stop()
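
# Hedged sanity check (not part of the docs snippet): the checkpoint should
# contain one sub-directory per policy under `policies/`. The exact layout
# may differ across Ray versions.
print(os.listdir(os.path.join(ma_checkpoint_dir, "policies")))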

# __multi-agent-checkpoints-restore-policy-sub-set-begin__
# Here, we use the same (multi-agent Algorithm) checkpoint as above, but only
# restore it with the first Policy ("pol1").
my_ma_algo_only_pol1 = Algorithm.from_checkpoint(
    ma_checkpoint_dir,
    # Tell the `from_checkpoint` util to create a new Algo, but only with
    # "pol1" in it.
    policy_ids=["pol1"],
    # Make sure to update the mapping function (we must not map to "pol2"
    # anymore to avoid a runtime error). Now both agents (0 and 1) map to
    # the same policy.
    policy_mapping_fn=lambda agent_id, episode, worker, **kw: "pol1",
    # Since we defined this above, we have to re-define it here with the
    # updated PolicyIDs, otherwise, RLlib will throw an error (it would think
    # there is an unknown PolicyID ("pol2") in this list).
    policies_to_train=["pol1"],
)

# Make sure, "pol2" is NOT in this Algorithm anymore.
assert my_ma_algo_only_pol1.get_policy("pol2") is None

# Continue training (only with "pol1").
my_ma_algo_only_pol1.train()
# __multi-agent-checkpoints-restore-policy-sub-set-end__
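
# Quick, hedged sanity check (not part of the snippet above): "pol1" should
# still be present in the restored Algorithm.
assert my_ma_algo_only_pol1.get_policy("pol1") is not None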

my_ma_algo_only_pol1.stop()

# __create-policy-checkpoint-begin__
# Retrieve the Policy object from an Algorithm.
# Note that for normal, single-agent Algorithms, the Policy ID is
# "default_policy".
policy1 = my_ma_algo.get_policy(policy_id="pol1")

# Tell RLlib to store an individual policy checkpoint (only for "pol1")
# inside /tmp/my_policy_checkpoint
policy1.export_checkpoint("/tmp/my_policy_checkpoint")
# __create-policy-checkpoint-end__
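
# Hedged sanity check (not part of the docs snippet; the file layout inside
# a Policy checkpoint may vary across Ray versions):
print(os.listdir("/tmp/my_policy_checkpoint"))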

# __restore-policy-begin__
import numpy as np

from ray.rllib.policy.policy import Policy

# Use the `from_checkpoint` utility of the Policy class:
my_restored_policy = Policy.from_checkpoint("/tmp/my_policy_checkpoint")

# Use the restored policy for serving actions.
obs = np.array([0.0, 0.1, 0.2, 0.3])  # individual CartPole observation
action = my_restored_policy.compute_single_action(obs)

print(f"Computed action {action} from given CartPole observation.")
# __restore-policy-end__
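
# Hedged batched-inference sketch (assumes the standard
# `Policy.compute_actions` API, which takes a batch of observations and
# returns a (actions, state_outs, infos) tuple):
# batch_obs = np.stack([obs, obs])
# actions, state_outs, infos = my_restored_policy.compute_actions(batch_obs)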

# __restore-algorithm-from-checkpoint-with-fewer-policies-begin__
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole

# Set up an Algorithm with 5 Policies.
algo_w_5_policies = (
    PPOConfig()
    .environment(
        env=MultiAgentCartPole,
        env_config={
            "num_agents": 5,
        },
    )
    .multi_agent(
        policies={"pol0", "pol1", "pol2", "pol3", "pol4"},
        # Map agent 0 -> "pol0", agent 1 -> "pol1", etc...
        policy_mapping_fn=(
            lambda agent_id, episode, worker, **kwargs: f"pol{agent_id}"
        ),
    )
    .build()
)

# .. train one iteration ..
algo_w_5_policies.train()
# .. and call `save()` to create a checkpoint.
path_to_checkpoint = algo_w_5_policies.save().checkpoint.path
print(
    "An Algorithm checkpoint has been created inside directory: "
    f"'{path_to_checkpoint}'. It should contain 5 policies in the 'policies/' sub dir."
)
# Let's terminate the algo for demonstration purposes.
algo_w_5_policies.stop()

# We will now recreate a new algo from this checkpoint, but only with 2 of the
# original policies ("pol0" and "pol1"). Note that this requires us to change
# the `policy_mapping_fn` (instead of mapping 5 agents to 5 policies, we now
# have to map 5 agents to only 2 policies).
def new_policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return "pol0" if agent_id in [0, 1] else "pol1"

algo_w_2_policies = Algorithm.from_checkpoint(
    checkpoint=path_to_checkpoint,
    policy_ids={"pol0", "pol1"},  # <- restore only those policy IDs here.
    policy_mapping_fn=new_policy_mapping_fn,  # <- use this new mapping fn.
)

# Test, whether we can train with this new setup.
algo_w_2_policies.train()
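
# Hedged sanity check (not part of the docs snippet): the dropped policies
# should no longer be present in the new Algorithm.
assert algo_w_2_policies.get_policy("pol4") is None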

# Terminate the new algo.
algo_w_2_policies.stop()
# __restore-algorithm-from-checkpoint-with-fewer-policies-end__

# __export-models-begin__
from ray.rllib.algorithms.ppo import PPOConfig

# Create a new Algorithm (which contains a Policy, which contains a NN Model).
# Switch on `export_native_model_files` so that native model files get
# included in the Policy checkpoints.
ppo_config = (
    PPOConfig().environment("Pendulum-v1").checkpointing(export_native_model_files=True)
)

# The default framework is TensorFlow, but if you would like to do this
# example with PyTorch, uncomment the following line of code:
# ppo_config.framework("torch")

# Create the Algorithm and train one iteration.
ppo = ppo_config.build()
ppo.train()

# Get the underlying PPOTF1Policy (or PPOTorchPolicy) object.
ppo_policy = ppo.get_policy()
# __export-models-end__

# Export the Keras NN model (that our PPOTF1Policy inside the PPO Algorithm
# uses) to disk ...

# 1) .. using the Policy object:
# __export-models-1-begin__
ppo_policy.export_model("/tmp/my_nn_model")
# .. check /tmp/my_nn_model/ for the model files.

# For Keras, you should be able to recover the model via:
# keras_model = tf.saved_model.load("/tmp/my_nn_model/")
# And pass in a Pendulum-v1 observation:
# results = keras_model(tf.convert_to_tensor(
#     np.array([[0.0, 0.1, 0.2]]), dtype=np.float32)
# )

# For PyTorch, do:
# pytorch_model = torch.load("/tmp/my_nn_model/model.pt")
# results = pytorch_model(
#     input_dict={
#         "obs": torch.from_numpy(np.array([[0.0, 0.1, 0.2]], dtype=np.float32)),
#     },
#     state=[torch.tensor(0)],  # dummy value
#     seq_lens=torch.tensor(0),  # dummy value
# )
# __export-models-1-end__

# 2) .. via the Policy's checkpointing method:
# __export-models-2-begin__
checkpoint_dir = ppo_policy.export_checkpoint("/tmp/ppo_policy")
# .. check /tmp/ppo_policy/model/ for the model files.

# You should be able to recover the Keras model via:
# keras_model = tf.saved_model.load("/tmp/ppo_policy/model")
# And pass in a Pendulum-v1 observation:
# results = keras_model(tf.convert_to_tensor(
#     np.array([[0.0, 0.1, 0.2]]), dtype=np.float32)
# )
# __export-models-2-end__

# 3) .. via the Algorithm (Policy) checkpoint:
# __export-models-3-begin__
checkpoint_dir = ppo.save().checkpoint.path
# .. check `checkpoint_dir` for the Algorithm checkpoint files.

# For Keras, you should be able to recover the model via:
# keras_model = tf.saved_model.load(checkpoint_dir + "/policies/default_policy/model/")
# And pass in a Pendulum-v1 observation:
# results = keras_model(tf.convert_to_tensor(
#     np.array([[0.0, 0.1, 0.2]]), dtype=np.float32)
# )
# __export-models-3-end__

# __export-models-as-onnx-begin__
# Using the same Policy object, we can also export our NN Model in the ONNX
# format. The `onnx` arg specifies the ONNX opset version to export with
# (passing a falsy value would skip the ONNX export).
ppo_policy.export_model("/tmp/my_nn_model", onnx=11)
# __export-models-as-onnx-end__
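
# Hedged loading sketch (assumes `onnxruntime` is installed; the exported
# ONNX file name inside /tmp/my_nn_model depends on your Ray/framework
# version, so check the directory's contents first):
# import onnxruntime as ort
# sess = ort.InferenceSession("/tmp/my_nn_model/model.onnx")
# print([inp.name for inp in sess.get_inputs()])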