diff --git a/tp6.py b/tp6.py
new file mode 100644
index 0000000..d6e96e8
--- /dev/null
+++ b/tp6.py
@@ -0,0 +1,169 @@
+import gymnasium as gym
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# ——— Neural networks ———
+class Actor(nn.Module):
+    def __init__(self, state_dim, action_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, action_dim),
+            nn.Softmax(dim=-1)
+        )
+
+    def forward(self, state):
+        return self.net(state)
+
+
+class Critic(nn.Module):
+    def __init__(self, state_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, state):
+        return self.net(state)
+
+
+def compute_returns(rewards, values, gamma):
+    """Compute the returns and normalized advantages."""
+    returns = []
+    R = 0
+    for r, v in zip(reversed(rewards), reversed(values)):
+        R = r + gamma * R
+        returns.insert(0, R)
+    returns = torch.tensor(returns, dtype=torch.float32)
+    values = torch.stack(values)
+    advantages = returns - values.squeeze()
+    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+    return returns, advantages
+
+
+def train_and_save():
+    env = gym.make("CartPole-v1")
+    actor = Actor(env.observation_space.shape[0], env.action_space.n)
+    critic = Critic(env.observation_space.shape[0])
+
+    optimizerA = optim.Adam(actor.parameters(), lr=3e-3)
+    optimizerC = optim.Adam(critic.parameters(), lr=3e-3)
+    gamma = 0.99
+
+    nb_episodes = 1500
+    rewards_history = []
+    advantages_history = []
+    critic_preds = []
+
+    for episode in range(nb_episodes):
+        state, _ = env.reset()
+        done = False
+
+        log_probs = []
+        values = []
+        rewards = []
+        entropies = []
+
+        while not done:
+            state_tensor = torch.tensor(state, dtype=torch.float32)
+            probs = actor(state_tensor)
+            dist = torch.distributions.Categorical(probs)
+            action = dist.sample()
+
+            next_state, reward, done, trunc, _ = env.step(action.item())
+            done = done or trunc  # count the 500-step truncation as the end of the episode
+            value = critic(state_tensor)
+            log_prob = dist.log_prob(action)
+            entropy = dist.entropy()
+
+            log_probs.append(log_prob)
+            values.append(value)
+            rewards.append(reward)
+            entropies.append(entropy)
+
+            state = next_state
+
+        # ——— Compute advantages and returns ———
+        returns, advantages = compute_returns(rewards, values, gamma)
+
+        # ——— Actor update ———
+        log_probs = torch.stack(log_probs)
+        entropies = torch.stack(entropies)
+        actor_loss = -(log_probs * advantages.detach()).mean() - 0.01 * entropies.mean()
+
+        optimizerA.zero_grad()
+        actor_loss.backward()
+        optimizerA.step()
+
+        # ——— Critic update ———
+        critic_loss = (returns - torch.stack(values).squeeze()).pow(2).mean()
+
+        optimizerC.zero_grad()
+        critic_loss.backward()
+        optimizerC.step()
+
+        total_reward = sum(rewards)
+        rewards_history.append(total_reward)
+        advantages_history.append(advantages.mean().item())
+        critic_preds.append(torch.stack(values).mean().item())
+
+        print(f"Episode {episode}, Reward: {total_reward:.1f}")
+
+        # ——— Plots every 500 episodes ———
+        if episode % 500 == 0 and episode != 0:
+            fig, axes = plt.subplots(1, 3, figsize=(15, 4))
+
+            axes[0].plot(rewards_history, label='Rewards')
+            axes[0].set_title('Rewards')
+            axes[0].legend()
+
+            axes[1].plot(advantages_history, label='Advantages', color='orange')
+            axes[1].set_title('Advantages')
+            axes[1].legend()
+
+            axes[2].plot(critic_preds, label='Critic Prediction', color='green')
+            axes[2].plot(rewards_history, label='Actual Reward', color='red', linestyle='--')
+            axes[2].set_title('Critic vs Reward')
+            axes[2].legend()
+
+            plt.suptitle(f'Episode {episode}')
+            plt.tight_layout()
+            plt.show()
+
+        if np.mean(rewards_history[-100:]) >= 475:
+            print("I see this as an absolute win!")
+            break
+
+    torch.save(actor.state_dict(), "a2c_cartpole.pth")
+
+
+def show(weights_path="a2c_cartpole.pth"):
+    env = gym.make("CartPole-v1", render_mode="human")
+    actor = Actor(env.observation_space.shape[0], env.action_space.n)
+    actor.load_state_dict(torch.load(weights_path))
+    actor.eval()
+
+    state, _ = env.reset()
+    done = False
+    while not done:
+        state_tensor = torch.tensor(state, dtype=torch.float32)
+        with torch.no_grad():
+            probs = actor(state_tensor)
+        action = torch.argmax(probs).item()
+        next_state, _, terminated, truncated, _ = env.step(action)
+        done = terminated or truncated
+        state = next_state
+    env.close()
+    print("Demonstration finished.")
+
+
+if __name__ == "__main__":
+    train_and_save()
+    show()
diff --git a/tp7.py b/tp7.py
new file mode 100644
index 0000000..d6e96e8
--- /dev/null
+++ b/tp7.py
@@ -0,0 +1,169 @@
+import gymnasium as gym
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# ——— Neural networks ———
+class Actor(nn.Module):
+    def __init__(self, state_dim, action_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, action_dim),
+            nn.Softmax(dim=-1)
+        )
+
+    def forward(self, state):
+        return self.net(state)
+
+
+class Critic(nn.Module):
+    def __init__(self, state_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dim, 128),
+            nn.ReLU(),
+            nn.Linear(128, 1)
+        )
+
+    def forward(self, state):
+        return self.net(state)
+
+
+def compute_returns(rewards, values, gamma):
+    """Compute the returns and normalized advantages."""
+    returns = []
+    R = 0
+    for r, v in zip(reversed(rewards), reversed(values)):
+        R = r + gamma * R
+        returns.insert(0, R)
+    returns = torch.tensor(returns, dtype=torch.float32)
+    values = torch.stack(values)
+    advantages = returns - values.squeeze()
+    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+    return returns, advantages
+
+
+def train_and_save():
+    env = gym.make("CartPole-v1")
+    actor = Actor(env.observation_space.shape[0], env.action_space.n)
+    critic = Critic(env.observation_space.shape[0])
+
+    optimizerA = optim.Adam(actor.parameters(), lr=3e-3)
+    optimizerC = optim.Adam(critic.parameters(), lr=3e-3)
+    gamma = 0.99
+
+    nb_episodes = 1500
+    rewards_history = []
+    advantages_history = []
+    critic_preds = []
+
+    for episode in range(nb_episodes):
+        state, _ = env.reset()
+        done = False
+
+        log_probs = []
+        values = []
+        rewards = []
+        entropies = []
+
+        while not done:
+            state_tensor = torch.tensor(state, dtype=torch.float32)
+            probs = actor(state_tensor)
+            dist = torch.distributions.Categorical(probs)
+            action = dist.sample()
+
+            next_state, reward, done, trunc, _ = env.step(action.item())
+            done = done or trunc  # count the 500-step truncation as the end of the episode
+            value = critic(state_tensor)
+            log_prob = dist.log_prob(action)
+            entropy = dist.entropy()
+
+            log_probs.append(log_prob)
+            values.append(value)
+            rewards.append(reward)
+            entropies.append(entropy)
+
+            state = next_state
+
+        # ——— Compute advantages and returns ———
+        returns, advantages = compute_returns(rewards, values, gamma)
+
+        # ——— Actor update ———
+        log_probs = torch.stack(log_probs)
+        entropies = torch.stack(entropies)
+        actor_loss = -(log_probs * advantages.detach()).mean() - 0.01 * entropies.mean()
+
+        optimizerA.zero_grad()
+        actor_loss.backward()
+        optimizerA.step()
+
+        # ——— Critic update ———
+        critic_loss = (returns - torch.stack(values).squeeze()).pow(2).mean()
+
+        optimizerC.zero_grad()
+        critic_loss.backward()
+        optimizerC.step()
+
+        total_reward = sum(rewards)
+        rewards_history.append(total_reward)
+        advantages_history.append(advantages.mean().item())
+        critic_preds.append(torch.stack(values).mean().item())
+
+        print(f"Episode {episode}, Reward: {total_reward:.1f}")
+
+        # ——— Plots every 500 episodes ———
+        if episode % 500 == 0 and episode != 0:
+            fig, axes = plt.subplots(1, 3, figsize=(15, 4))
+
+            axes[0].plot(rewards_history, label='Rewards')
+            axes[0].set_title('Rewards')
+            axes[0].legend()
+
+            axes[1].plot(advantages_history, label='Advantages', color='orange')
+            axes[1].set_title('Advantages')
+            axes[1].legend()
+
+            axes[2].plot(critic_preds, label='Critic Prediction', color='green')
+            axes[2].plot(rewards_history, label='Actual Reward', color='red', linestyle='--')
+            axes[2].set_title('Critic vs Reward')
+            axes[2].legend()
+
+            plt.suptitle(f'Episode {episode}')
+            plt.tight_layout()
+            plt.show()
+
+        if np.mean(rewards_history[-100:]) >= 475:
+            print("I see this as an absolute win!")
+            break
+
+    torch.save(actor.state_dict(), "a2c_cartpole.pth")
+
+
+def show(weights_path="a2c_cartpole.pth"):
+    env = gym.make("CartPole-v1", render_mode="human")
+    actor = Actor(env.observation_space.shape[0], env.action_space.n)
+    actor.load_state_dict(torch.load(weights_path))
+    actor.eval()
+
+    state, _ = env.reset()
+    done = False
+    while not done:
+        state_tensor = torch.tensor(state, dtype=torch.float32)
+        with torch.no_grad():
+            probs = actor(state_tensor)
+        action = torch.argmax(probs).item()
+        next_state, _, terminated, truncated, _ = env.step(action)
+        done = terminated or truncated
+        state = next_state
+    env.close()
+    print("Demonstration finished.")
+
+
+if __name__ == "__main__":
+    train_and_save()
+    show()