import numpy as np
import pandas as pd
from typing import List

# State labels; the position of a label in this list is its row index in
# R, Q and NEXT.
labels = list("ABCDEFGHIJKL")

# Reward matrix: one row per state, one column per action.
# An entry > 0 marks the action as allowed from that state.
R = np.array([
    # UP DOWN LEFT RIGHT
    [0, 0, 0, 1],  # A
    [0, 1, 1, 1],  # B
    [0, 1, 1, 0],  # C
    [0, 1, 0, 0],  # D
    [0, 1, 0, 0],  # E
    [1, 1, 0, 0],  # F
    [1, 0, 0, 1],  # G
    [1, 1, 1, 0],  # H
    [1, 0, 0, 1],  # I
    [1, 0, 1, 1],  # J
    [0, 0, 1, 1],  # K
    [1, 0, 1, 0],  # L
], dtype=float)

# Q-table: same shape as R, initialised to zeros.
Q = np.zeros_like(R)

# Transition table: NEXT[state, action] is the label of the state reached,
# or None when the action is not possible from that state.
NEXT = np.array([
    # UP   DOWN  LEFT  RIGHT
    [None, None, None, 'B'],   # A
    [None, 'F', 'A', 'C'],     # B
    [None, 'G', 'B', None],    # C
    [None, 'H', None, None],   # D
    [None, 'I', None, None],   # E
    ['B', 'J', None, None],    # F
    ['C', None, None, 'H'],    # G
    ['D', 'L', 'G', None],     # H
    ['E', None, None, 'J'],    # I
    ['F', None, 'I', 'K'],     # J
    [None, None, 'J', 'L'],    # K
    ['H', None, 'K', None],    # L
])


def best_path(start_label: str, goal_label: str) -> List[str]:
    """Follow the greedy policy of the module-level ``Q`` table.

    Starting at ``start_label``, repeatedly take the possible action with
    the highest Q-value and move to the state given by ``NEXT`` until
    ``goal_label`` is reached.

    Args:
        start_label: Label of the starting state (must be in ``labels``).
        goal_label: Label of the target state (must be in ``labels``).

    Returns:
        The visited state labels, from ``start_label`` to ``goal_label``
        inclusive.

    Raises:
        ValueError: If either label is not in ``labels``.
        RuntimeError: If a state has no possible action, or the greedy
            policy revisits a state (it would loop forever).
    """
    s = labels.index(start_label)
    g = labels.index(goal_label)
    path = [start_label]
    visited = {s}
    while s != g:
        # Only consider actions that actually lead somewhere; impossible
        # actions keep a Q-value of 0 and must never be selected.
        possible = [a for a in range(NEXT.shape[1]) if NEXT[s, a] is not None]
        if not possible:
            raise RuntimeError(f'No possible action from state {labels[s]}')
        # max() keeps the first maximum, the same tie-break as np.argmax.
        a = max(possible, key=lambda action: Q[s, action])
        next_label = NEXT[s, a]
        s = labels.index(next_label)
        if s in visited:
            # The greedy policy is deterministic, so a repeated state
            # means an endless cycle.
            raise RuntimeError(
                f'Greedy policy loops at state {next_label}; '
                f'path so far: {" -> ".join(path)}'
            )
        visited.add(s)
        path.append(next_label)
    return path


# Hyperparameters
gamma = 0.75      # discount factor
alpha = 0.90      # learning rate
n_iters = 1_000   # number of training updates
rng = np.random.default_rng(0)

# Boost the reward of the two moves that lead into G (C DOWN and H LEFT).
goal_opt1 = labels.index('C')
down_index = 1
goal_opt2 = labels.index('H')
left_index = 2
R_goal = R.copy()
R_goal[goal_opt1, down_index] = 1_000.0
R_goal[goal_opt2, left_index] = 1_000.0

for _ in range(n_iters):
    s = rng.integers(0, R.shape[0])          # random current state
    actions = np.where(R_goal[s] > 0)[0]     # allowed actions
    if actions.size == 0:
        continue
    a = rng.choice(actions)                  # random allowed action
    # The next state comes from the transition table, not from the action
    # index itself.
    s_next = labels.index(NEXT[s, a])
    td_error = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
    Q[s, a] += alpha * td_error

print("Matrice Q: ")
print(Q)
print("Path E -> G: ", " -> ".join(best_path('E', 'G')))