feat: add tp6
All checks were successful
SonarQube Scan / SonarQube Trigger (push) Successful in 24s
169
tp6.py
Normal file
@@ -0,0 +1,169 @@
# Actor-critic training on CartPole-v1 (Gymnasium); saves the trained policy to a2c_cartpole.pth.
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


# ——— Neural networks ———
class Actor(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, state):
        return self.net(state)


class Critic(nn.Module):
    """Value network: maps a state to an estimate of its expected return."""
    def __init__(self, state_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state):
        return self.net(state)


def compute_returns(rewards, values, gamma):
    """Compute discounted returns and normalized advantages."""
    returns = []
    R = 0
    # Accumulate the discounted return backwards: R_t = r_t + gamma * R_{t+1}
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    values = torch.stack(values)
    advantages = returns - values.squeeze()
    # Normalize advantages to stabilize the policy-gradient update
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages


def train_and_save():
    env = gym.make("CartPole-v1")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    critic = Critic(env.observation_space.shape[0])

    optimizerA = optim.Adam(actor.parameters(), lr=3e-3)
    optimizerC = optim.Adam(critic.parameters(), lr=3e-3)
    gamma = 0.99

    nb_episodes = 1500
    rewards_history = []
    advantages_history = []
    critic_preds = []

    for episode in range(nb_episodes):
        state, _ = env.reset()
        done = False

        log_probs = []
        values = []
        rewards = []
        entropies = []

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            probs = actor(state_tensor)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()

            next_state, reward, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation too (CartPole-v1 truncates at 500 steps)
            done = terminated or truncated

            value = critic(state_tensor)
            log_prob = dist.log_prob(action)
            entropy = dist.entropy()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropies.append(entropy)

            state = next_state

        # ——— Returns and advantages ———
        returns, advantages = compute_returns(rewards, values, gamma)

        # ——— Actor update: policy gradient with an entropy bonus ———
        log_probs = torch.stack(log_probs)
        entropies = torch.stack(entropies)
        actor_loss = -(log_probs * advantages.detach()).mean() - 0.01 * entropies.mean()

        optimizerA.zero_grad()
        actor_loss.backward()
        optimizerA.step()

        # ——— Critic update: mean squared error against the discounted returns ———
        critic_loss = (returns - torch.stack(values).squeeze()).pow(2).mean()

        optimizerC.zero_grad()
        critic_loss.backward()
        optimizerC.step()

        total_reward = sum(rewards)
        rewards_history.append(total_reward)
        advantages_history.append(advantages.mean().item())
        critic_preds.append(torch.stack(values).mean().item())

        print(f"Episode {episode}, Reward: {total_reward:.1f}")

        # ——— Plots every 500 episodes ———
        if episode % 500 == 0 and episode != 0:
            fig, axes = plt.subplots(1, 3, figsize=(15, 4))

            axes[0].plot(rewards_history, label='Rewards')
            axes[0].set_title('Rewards')
            axes[0].legend()

            axes[1].plot(advantages_history, label='Advantages', color='orange')
            axes[1].set_title('Advantages')
            axes[1].legend()

            axes[2].plot(critic_preds, label='Critic Prediction', color='green')
            axes[2].plot(rewards_history, label='Actual Reward', color='red', linestyle='--')
            axes[2].set_title('Critic vs Reward')
            axes[2].legend()

            plt.suptitle(f'Episode {episode}')
            plt.tight_layout()
            plt.show()

        # CartPole-v1 is considered solved at an average reward of 475 over the last 100 episodes
        if np.mean(rewards_history[-100:]) >= 475:
            print("I see this as an absolute win!")
            break

    torch.save(actor.state_dict(), "a2c_cartpole.pth")


def show(weights_path="a2c_cartpole.pth"):
    env = gym.make("CartPole-v1", render_mode="human")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    actor.load_state_dict(torch.load(weights_path))
    actor.eval()

    state, _ = env.reset()
    done = False
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            probs = actor(state_tensor)
        action = torch.argmax(probs).item()
        next_state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
    env.close()
    print("Demonstration finished.")


if __name__ == "__main__":
    train_and_save()
    show()
169
tp7.py
Normal file
@@ -0,0 +1,169 @@
# Actor-critic training on CartPole-v1 (Gymnasium); saves the trained policy to a2c_cartpole.pth.
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


# ——— Neural networks ———
class Actor(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, state):
        return self.net(state)


class Critic(nn.Module):
    """Value network: maps a state to an estimate of its expected return."""
    def __init__(self, state_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state):
        return self.net(state)


def compute_returns(rewards, values, gamma):
    """Compute discounted returns and normalized advantages."""
    returns = []
    R = 0
    # Accumulate the discounted return backwards: R_t = r_t + gamma * R_{t+1}
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    values = torch.stack(values)
    advantages = returns - values.squeeze()
    # Normalize advantages to stabilize the policy-gradient update
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns, advantages


def train_and_save():
    env = gym.make("CartPole-v1")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    critic = Critic(env.observation_space.shape[0])

    optimizerA = optim.Adam(actor.parameters(), lr=3e-3)
    optimizerC = optim.Adam(critic.parameters(), lr=3e-3)
    gamma = 0.99

    nb_episodes = 1500
    rewards_history = []
    advantages_history = []
    critic_preds = []

    for episode in range(nb_episodes):
        state, _ = env.reset()
        done = False

        log_probs = []
        values = []
        rewards = []
        entropies = []

        while not done:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            probs = actor(state_tensor)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()

            next_state, reward, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation too (CartPole-v1 truncates at 500 steps)
            done = terminated or truncated

            value = critic(state_tensor)
            log_prob = dist.log_prob(action)
            entropy = dist.entropy()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropies.append(entropy)

            state = next_state

        # ——— Returns and advantages ———
        returns, advantages = compute_returns(rewards, values, gamma)

        # ——— Actor update: policy gradient with an entropy bonus ———
        log_probs = torch.stack(log_probs)
        entropies = torch.stack(entropies)
        actor_loss = -(log_probs * advantages.detach()).mean() - 0.01 * entropies.mean()

        optimizerA.zero_grad()
        actor_loss.backward()
        optimizerA.step()

        # ——— Critic update: mean squared error against the discounted returns ———
        critic_loss = (returns - torch.stack(values).squeeze()).pow(2).mean()

        optimizerC.zero_grad()
        critic_loss.backward()
        optimizerC.step()

        total_reward = sum(rewards)
        rewards_history.append(total_reward)
        advantages_history.append(advantages.mean().item())
        critic_preds.append(torch.stack(values).mean().item())

        print(f"Episode {episode}, Reward: {total_reward:.1f}")

        # ——— Plots every 500 episodes ———
        if episode % 500 == 0 and episode != 0:
            fig, axes = plt.subplots(1, 3, figsize=(15, 4))

            axes[0].plot(rewards_history, label='Rewards')
            axes[0].set_title('Rewards')
            axes[0].legend()

            axes[1].plot(advantages_history, label='Advantages', color='orange')
            axes[1].set_title('Advantages')
            axes[1].legend()

            axes[2].plot(critic_preds, label='Critic Prediction', color='green')
            axes[2].plot(rewards_history, label='Actual Reward', color='red', linestyle='--')
            axes[2].set_title('Critic vs Reward')
            axes[2].legend()

            plt.suptitle(f'Episode {episode}')
            plt.tight_layout()
            plt.show()

        # CartPole-v1 is considered solved at an average reward of 475 over the last 100 episodes
        if np.mean(rewards_history[-100:]) >= 475:
            print("I see this as an absolute win!")
            break

    torch.save(actor.state_dict(), "a2c_cartpole.pth")


def show(weights_path="a2c_cartpole.pth"):
    env = gym.make("CartPole-v1", render_mode="human")
    actor = Actor(env.observation_space.shape[0], env.action_space.n)
    actor.load_state_dict(torch.load(weights_path))
    actor.eval()

    state, _ = env.reset()
    done = False
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        with torch.no_grad():
            probs = actor(state_tensor)
        action = torch.argmax(probs).item()
        next_state, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
    env.close()
    print("Demonstration finished.")


if __name__ == "__main__":
    train_and_save()
    show()
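A minimal way to replay the saved policy without retraining (a sketch, assuming the script is importable as tp6 and that a2c_cartpole.pth has already been produced by train_and_save()):

# Sketch: render a previously trained CartPole policy, skipping training.
# Assumes tp6.py is on the Python path and a2c_cartpole.pth exists
# (it is written by train_and_save()).
from tp6 import show

if __name__ == "__main__":
    show("a2c_cartpole.pth")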