import numpy as np
import pandas as pd

labels = list("ABCDEFGHIJKL")

R = np.array([
    [0,1,0,0,0,0,0,0,0,0,0,0],  # A
    [1,0,1,0,0,1,0,0,0,0,0,0],  # B
    [0,1,0,0,0,0,1,0,0,0,0,0],  # C
    [0,0,0,0,0,0,0,1,0,0,0,0],  # D
    [0,0,0,0,0,0,0,0,1,0,0,0],  # E
    [0,1,0,0,0,0,0,0,0,1,0,0],  # F
    [0,0,1,0,0,0,0,1,0,0,0,0],  # G
    [0,0,0,1,0,0,1,0,0,0,0,1],  # H
    [0,0,0,0,1,0,0,0,0,1,0,0],  # I
    [0,0,0,0,0,1,0,0,1,0,1,0],  # J
    [0,0,0,0,0,0,0,0,0,1,0,1],  # K
    [0,0,0,0,0,0,0,1,0,0,1,0],  # L
], dtype=float)

# Q-table with the same shape as R, initialised to zeros
Q = np.zeros_like(R)

print(type(R))      # <class 'numpy.ndarray'>
print(R.ndim)       # 2 -> 2-D matrix
print(R.shape)      # (12, 12) -> 12 rows, 12 columns
print(R.dtype)      # float64
print(R.size)       # 144 elements
print(R.strides)    # e.g. (96, 8)

# Hyperparameters
gamma = 0.75        # discount factor
alpha = 0.90        # learning rate
n_iters = 1000
rng = np.random.default_rng(0)

# Train Q-learning for goal 'G'
goal_label = 'G'
goal = labels.index(goal_label)
R_goal = R.copy()
R_goal[goal, goal] = 1000.0  # large reward for staying at the goal

for _ in range(n_iters):
    s = rng.integers(0, R.shape[0])        # random current state
    actions = np.where(R_goal[s] > 0)[0]   # valid actions from s
    if actions.size == 0:
        continue
    a = rng.choice(actions)                # random valid action
    s_next = a                             # transition to the next state

    # Temporal-difference error
    TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
    # Bellman update
    Q[s, a] += alpha * TD

def best_path(start_label: str, goal_label: str):
    s = labels.index(start_label)
    g = labels.index(goal_label)
    path = [start_label]
    while s != g:
        s = np.argmax(Q[s])   # greedy action, which is also the next state
        path.append(labels[s])
    return path

print("Path E -> G: ", " -> ".join(best_path('E', 'G')))
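
# --- Optional inspection (a minimal sketch, not part of the training loop) ---
# Since pandas is already imported above, the learned Q-table can be shown with
# its state labels. The names used here (the labelled DataFrame and the loop
# over all starts) are illustrative additions, not from the original script,
# and the printed values depend on the random seed and number of iterations.
print(pd.DataFrame(Q, index=labels, columns=labels).round(1))

# Sanity check: greedy path from every state to the goal. This assumes the
# Q-table has converged enough that best_path terminates for every start.
for start in labels:
    print(f"Path {start} -> {goal_label}: ", " -> ".join(best_path(start, goal_label)))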