# More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
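# This block is the body of a game configuration class in muzero-general; it relies on `datetime`, `os`, and `torch` being imported at the top of the game file.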
self.seed = 0  # Seed for numpy, torch and the game
self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It is usually faster to use a single GPU (set it to 1) if it has enough memory. None will use all available GPUs
### Game
self.observation_shape = (1, 1, 4)  # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
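# For example (a sketch, assuming a flat length-4 observation such as CartPole's state and that numpy is imported in the Game class):
#     numpy.array(observation, dtype="float32").reshape(1, 1, -1)  # -> shape (1, 1, 4)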
self.action_space = list(range(2))  # Fixed list of all possible actions. You should only edit the length
self.players = list(range(1))  # List of players. You should only edit the length
self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation
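# For instance, setting stacked_observations = 2 would feed the network the current observation plus the 2 previous observations and the 2 previous actions; 0 keeps only the current observation.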
# Evaluate
self.muzero_player = 0  # Turn on which MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
self.opponent = None  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
### Self-Play
self.num_workers = 1  # Number of simultaneous threads/workers self-playing to feed the replay buffer
self.selfplay_on_gpu = False  # Run self-play on the GPU (otherwise on the CPU)
self.max_moves = 500  # Maximum number of moves if the game is not finished before
self.num_simulations = 50  # Number of future moves self-simulated (MCTS simulations per move)
self.discount = 0.997  # Discount factor applied to future rewards
self.temperature_threshold = None  # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (i.e. selecting the best action). If None, visit_softmax_temperature_fn is used every time
self.support_size = 10  # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))
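# With support_size = 10, the scaled value/reward is typically encoded as a categorical distribution over the 2 * support_size + 1 = 21 integer bins from -10 to 10,
# using the invertible transform from the MuZero paper (Appendix F): h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x with eps ~= 0.001.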
# Residual Network
self.downsample = False  # Downsample observations before the representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture)
self.blocks = 1  # Number of blocks in the ResNet
self.channels = 2  # Number of channels in the ResNet
self.reduced_channels_reward = 2  # Number of channels in the reward head
self.reduced_channels_value = 2  # Number of channels in the value head
self.reduced_channels_policy = 2  # Number of channels in the policy head
self.resnet_fc_reward_layers = []  # Define the hidden layers in the reward head of the dynamics network
self.resnet_fc_value_layers = []  # Define the hidden layers in the value head of the prediction network
self.resnet_fc_policy_layers = []  # Define the hidden layers in the policy head of the prediction network
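# (Presumably only one architecture is active per game: the ResNet settings above apply when the residual network is selected, the fully connected settings below apply otherwise.)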
# Fully Connected Network
self.encoding_size = 8  # Size of the hidden state (encoding) produced by the representation network
self.fc_representation_layers = []  # Define the hidden layers in the representation network
self.fc_dynamics_layers = [16]  # Define the hidden layers in the dynamics network
self.fc_reward_layers = [16]  # Define the hidden layers in the reward network
self.fc_value_layers = [16]  # Define the hidden layers in the value network
self.fc_policy_layers = [16]  # Define the hidden layers in the policy network
### Training
self.results_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S"))  # Path to store the model weights and TensorBoard logs
self.save_model = True  # Save the checkpoint in results_path as model.checkpoint
self.training_steps = 2000  # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128  # Number of parts of games to train on at each training step
self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
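# Assuming the loss is combined as total_loss = value_loss_weight * value_loss + reward_loss + policy_loss, a weight of 0.25 simply de-emphasizes the value head relative to the other targets.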
self.train_on_gpu = torch.cuda.is_available()  # Train on GPU if available
self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4  # L2 weight regularization
self.momentum = 0.9  # Used only if optimizer is SGD
# Exponential learning rate schedule
self.lr_init = 0.02  # Initial learning rate
self.lr_decay_rate = 0.9  # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000  # Time scale (in training steps) of the exponential learning rate decay
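# Assuming the usual exponential schedule lr = lr_init * lr_decay_rate ** (training_step / lr_decay_steps), these settings give lr ~= 0.02 * 0.9**2 ~= 0.0162 after the full 2000 training steps.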
### Replay Buffer
self.replay_buffer_size = 500  # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 10  # Number of game moves to keep for every batch element
self.td_steps = 50  # Number of steps in the future to take into account for calculating the target value
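# The target value is the usual n-step bootstrapped return: the sum of the next td_steps discounted rewards plus discount**td_steps times the estimated value td_steps moves ahead.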
self.PER = True  # Prioritized Experience Replay (See paper appendix Training): preferentially sample elements of the replay buffer that are unexpected for the network
self.PER_alpha = 0.5  # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
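# With PER, an element is sampled with probability proportional to its priority raised to PER_alpha (Schaul et al.): alpha = 0 recovers uniform sampling, alpha = 1 uses the full priorities.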
# Reanalyze (See paper appendix Reanalyze)
self.use_last_model_value = True  # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)
self.reanalyse_on_gpu = False  # Run Reanalyze on the GPU (otherwise on the CPU)
### Adjust the self-play / training ratio to avoid over/underfitting
self.self_play_delay = 0  # Number of seconds to wait after each played game
self.training_delay = 0  # Number of seconds to wait after each training step
self.ratio = 1.5  # Desired ratio of training steps per self-played step. This makes the run behave like a synchronous version, so training can take much longer. Set it to None to disable it
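# Below is a minimal sketch of the visit_softmax_temperature_fn method referenced by temperature_threshold above.
# It belongs to the same config class as the parameters in this block; the thresholds and temperatures here are
# illustrative assumptions, not values taken from the repository.
def visit_softmax_temperature_fn(self, trained_steps):
    """
    Alter the visit count distribution so that action selection becomes greedier as training progresses.
    The smaller the temperature, the more likely the best action (i.e. the one with the highest visit count) is chosen.

    Returns:
        Positive float.
    """
    if trained_steps < 0.5 * self.training_steps:
        return 1.0
    elif trained_steps < 0.75 * self.training_steps:
        return 0.5
    else:
        return 0.25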