diff --git a/ex1.py b/ex1.py
index 3157ad1..a07df49 100644
--- a/ex1.py
+++ b/ex1.py
@@ -1,23 +1,21 @@
-import numpy as np
-import pandas as pd
 from typing import List
+import numpy as np
 
-def best_path(start_label: str, goal_label: str) -> List[str]:
+
+def best_path(start_label: str, goal_label: str, Q: np.ndarray) -> List[str]:
     s = labels.index(start_label)
     g = labels.index(goal_label)
     path = [start_label]
     while s != g:
-        a = np.argmax(Q[s])  # here we get the action
-        next = NEXT[s, a]
-        if next is None:
-            raise f'Action impossible State{s} Action{a} NextState{next}'
-        s = labels.index(next)
-        path.append(next)
+        a = np.argmax(Q[s])  # pick the best (greedy) action
+        next_state = NEXT_MOVE_TABLE[s, a]  # get the next state
+        s = labels.index(next_state)  # get that state's index in the Q matrix
+        path.append(next_state)  # append the next state to the path
     return path
 
-labels = list("ABCDEFGHIJKL")
+labels = list('ABCDEFGHIJKL')
 
 R = np.array([
     # UP DOWN LEFT RIGHT
@@ -35,11 +33,8 @@ R = np.array([
     [1,0,1,0], #L
 ], dtype=float)
 
-# Build a matrix with the same shape as R, filled with zeros
-Q = np.zeros_like(R)
-
-# we have the current state and the chosen action; we need s_t+1 (the next action)
-NEXT = np.array([
+# we have the current state and the chosen action; we need s_t+1 (the next state)
+NEXT_MOVE_TABLE = np.array([
     # UP DOWN LEFT RIGHT
     [None, None, None, 'B'], #A
     [None, 'F', 'A', 'C'], #B
@@ -67,21 +62,41 @@ goal_opt1 = labels.index('C')
 down_index = 1
 goal_opt2 = labels.index('H')
 left_index = 2
-R_goal = R.copy()
-R_goal[goal_opt1, down_index] = 1_000.0
-R_goal[goal_opt2, left_index] = 1_000.0
+R_goal_e_g = R.copy()
+R_goal_e_g[goal_opt1, down_index] = 1_000.0
+R_goal_e_g[goal_opt2, left_index] = 1_000.0
 
-for _ in range(n_iters):
-    s = rng.integers(0, R.shape[0])  # random current state
-    actions = np.where(R_goal[s] > 0)[0]  # valid actions
-    if actions.size == 0:
-        continue
-    a = rng.choice(actions)  # random valid action
-    s_next = a  # transition to next state
-    TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
-    Q[s, a] += alpha * TD
+# same thing for the path that leads to A, as a second test
+goal_to_a = labels.index('B')
+R_goal_k_a = R.copy()
+R_goal_k_a[goal_to_a, left_index] = 1_000.0
 
-print("Q matrix: ")
-print(Q)
-print("Path E -> G: ", " -> ".join(best_path('E', 'G')))
+def generate_q_values(n_iters: int, R_goal: np.ndarray) -> np.ndarray:
+    # build a matrix with the same shape as R_goal, filled with zeros
+    Q = np.zeros_like(R_goal)
+    for _ in range(n_iters):
+        s = rng.integers(0, R.shape[0])  # random current state
+        actions = np.where(R_goal[s] > 0)[0]  # valid actions
+        if actions.size == 0:
+            continue
+        a = rng.choice(actions)  # random valid action
+        s_next_label: str | None = NEXT_MOVE_TABLE[s, a]  # transition to next state
+        if s_next_label is None:
+            continue
+        s_next = labels.index(s_next_label)
+        TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]  # temporal-difference error
+        Q[s, a] += alpha * TD  # Q-learning update
+    return Q
+
+Q_e_g = generate_q_values(n_iters, R_goal_e_g)
+Q_k_a = generate_q_values(n_iters, R_goal_k_a)
+
+print("Q matrix E -> G:")
+print(Q_e_g)
+
+print("Q matrix K -> A:")
+print(Q_k_a)
+
+print("Path E -> G: ", " -> ".join(best_path('E', 'G', Q_e_g)))
+print("Path K -> A: ", " -> ".join(best_path('K', 'A', Q_k_a)))
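
Note on the change (not part of the patch): the new generate_q_values loop applies the standard tabular Q-learning update, Q[s, a] += alpha * (R[s, a] + gamma * max(Q[s_next]) - Q[s, a]). Below is a minimal, self-contained sketch of that same update on a made-up two-state problem; every name and number in it is illustrative and does not come from ex1.py.

import numpy as np

rng = np.random.default_rng(0)

# Toy problem: 2 states, 2 actions; reward 1.0 only for taking action 1 in state 0.
R_toy = np.array([[0.0, 1.0],
                  [0.0, 0.0]])
NEXT_TOY = np.array([[0, 1],
                     [0, 1]])  # next-state index for each (state, action) pair

alpha, gamma = 0.1, 0.9
Q_toy = np.zeros_like(R_toy)
for _ in range(1_000):
    s = rng.integers(0, R_toy.shape[0])  # random current state
    a = rng.integers(0, R_toy.shape[1])  # random action
    s_next = NEXT_TOY[s, a]              # deterministic transition
    td = R_toy[s, a] + gamma * Q_toy[s_next].max() - Q_toy[s, a]
    Q_toy[s, a] += alpha * td            # temporal-difference update
print(Q_toy)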