"""Advantage actor-critic (A2C) on CartPole-v1: training, diagnostics, and a rendering demo."""

import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


# --- Neural networks ---
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim),
            nn.Softmax(dim=-1),
        )

    def forward(self, state):
        return self.net(state)


class Critic(nn.Module):
    def __init__(self, state_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, state):
        return self.net(state)


def compute_returns(rewards, values, gamma):
    """Compute discounted returns and normalized advantages."""
    returns = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    values = torch.stack(values)
    advantages = returns - values.squeeze()
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages


def train_and_save():
    env = gym.make("CartPole-v1")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    critic = Critic(env.observation_space.shape[0])
    optimizerA = optim.Adam(actor.parameters(), lr=3e-3)
    optimizerC = optim.Adam(critic.parameters(), lr=3e-3)
    gamma = 0.99
    nb_episodes = 1500

    rewards_history = []
    advantages_history = []
    critic_preds = []

    for episode in range(nb_episodes):
        state, _ = env.reset()
        done = False
        log_probs = []
        values = []
        rewards = []
        entropies = []

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            probs = actor(state_tensor)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            # End the episode on truncation (the 500-step limit) as well as on termination.
            done = terminated or truncated

            value = critic(state_tensor)
            log_prob = dist.log_prob(action)
            entropy = dist.entropy()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropies.append(entropy)

            state = next_state

        # --- Returns and advantages ---
        returns, advantages = compute_returns(rewards, values, gamma)

        # --- Actor update ---
        log_probs = torch.stack(log_probs)
        entropies = torch.stack(entropies)
        # Advantages are detached so the policy loss does not backpropagate into the critic;
        # the entropy bonus encourages exploration.
        actor_loss = -(log_probs * advantages.detach()).mean() - 0.01 * entropies.mean()
        optimizerA.zero_grad()
        actor_loss.backward()
        optimizerA.step()

        # --- Critic update ---
        critic_loss = (returns - torch.stack(values).squeeze()).pow(2).mean()
        optimizerC.zero_grad()
        critic_loss.backward()
        optimizerC.step()

        total_reward = sum(rewards)
        rewards_history.append(total_reward)
        advantages_history.append(advantages.mean().item())
        critic_preds.append(torch.stack(values).mean().item())

        print(f"Episode {episode}, reward: {total_reward:.1f}")

        # --- Diagnostic plots every 500 episodes ---
        if episode % 500 == 0 and episode != 0:
            fig, axes = plt.subplots(1, 3, figsize=(15, 4))
            axes[0].plot(rewards_history, label='Rewards')
            axes[0].set_title('Rewards')
            axes[0].legend()
            axes[1].plot(advantages_history, label='Advantages', color='orange')
            axes[1].set_title('Advantages')
            axes[1].legend()
            axes[2].plot(critic_preds, label='Critic Prediction', color='green')
            axes[2].plot(rewards_history, label='Actual Reward', color='red', linestyle='--')
            axes[2].set_title('Critic vs Reward')
            axes[2].legend()
            plt.suptitle(f'Episode {episode}')
            plt.tight_layout()
            plt.show()

        # CartPole-v1 is considered solved at an average reward of 475 over the last 100 episodes.
        if np.mean(rewards_history[-100:]) >= 475:
            print("I see this as an absolute win!")
            break

    env.close()
    torch.save(actor.state_dict(), "a2c_cartpole.pth")


def show(weights_path="a2c_cartpole.pth"):
    env = gym.make("CartPole-v1", render_mode="human")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    actor.load_state_dict(torch.load(weights_path))
    actor.eval()

    state, _ = env.reset()
    done = False
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            probs = actor(state_tensor)
        action = torch.argmax(probs).item()
        next_state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state

    env.close()
    print("Demonstration finished.")


if __name__ == "__main__":
    train_and_save()
    show()