# Q-learning demo: learns a greedy route to a goal state on a 12-state graph (states A-L).
import numpy as np
|
|
import pandas as pd
|
|
|
|
# State names; row/column i of every matrix below corresponds to labels[i].
labels = list("ABCDEFGHIJKL")

# Reward matrix: R[s, a] is 1.0 when moving from state s to state a is
# allowed and 0.0 otherwise (one row per state, in the order of `labels`).
R = np.array([
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # A
    [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # B
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # C
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # D
    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],  # E
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # F
    [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # G
    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],  # H
    [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],  # I
    [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],  # J
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],  # K
    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0],  # L
], dtype=float)

# Q-table: same shape and dtype as R, filled with zeros.
Q = np.zeros_like(R)

# Quick inspection of the reward matrix.
print(type(R))    # <class 'numpy.ndarray'>
print(R.ndim)     # 2 -> two-dimensional matrix
print(R.shape)    # (12, 12) -> 12 rows, 12 columns
print(R.dtype)    # float64
print(R.size)     # 144 elements
print(R.strides)  # (96, 8) -> bytes to step to the next row / next column

# Hyperparameters
gamma = 0.75    # discount applied to the best Q-value of the next state
alpha = 0.90    # learning rate applied to the temporal difference
n_iters = 1000  # number of randomly sampled updates

rng = np.random.default_rng(0)  # seeded generator -> reproducible run

# Train Q-learning for goal 'G'
goal_label = 'G'
goal = labels.index(goal_label)
R_goal = R.copy()
# Large reward on the goal's own diagonal entry (goal -> goal).
R_goal[goal, goal] = 1000.0

for _ in range(n_iters):
    s = rng.integers(0, R.shape[0])       # random current state
    actions = np.where(R_goal[s] > 0)[0]  # valid actions: positive reward
    if actions.size == 0:
        continue                          # nothing to learn from this state
    a = rng.choice(actions)               # random valid action
    s_next = a                            # taking action a moves to state a
    # Temporal difference: reward + discounted best next value - current value.
    TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
    # Bellman update of the visited (state, action) entry.
    Q[s, a] += alpha * TD
|
|
|
|
|
|
def best_path(start_label: str, goal_label: str, q_table=None, state_labels=None) -> list:
    """Follow the greedy policy of a Q-table from one state to another.

    At every step the action with the highest Q-value in the current row is
    taken, and that action's index becomes the next state.

    Args:
        start_label: Label of the state to start from.
        goal_label: Label of the state to reach.
        q_table: Optional square table indexed as ``q_table[state][action]``.
            Defaults to the module-level ``Q``.
        state_labels: Optional list of labels, one per row of the table.
            Defaults to the module-level ``labels``.

    Returns:
        The visited labels in order, starting with ``start_label`` and ending
        with ``goal_label``.

    Raises:
        ValueError: If a label is unknown, or if the greedy policy comes back
            to a state it already visited before reaching the goal (it would
            otherwise loop forever, e.g. when the table was trained for a
            different goal).
    """
    table = Q if q_table is None else q_table
    names = labels if state_labels is None else state_labels

    s = names.index(start_label)
    g = names.index(goal_label)
    path = [start_label]
    visited = {s}
    while s != g:
        s = int(np.argmax(table[s]))  # greedy action == index of next state
        if s in visited:
            # The policy is deterministic, so a repeated state means a cycle.
            raise ValueError(
                f"greedy policy revisits state {names[s]!r} before reaching "
                f"{goal_label!r}; the Q-table does not lead to this goal"
            )
        visited.add(s)
        path.append(names[s])
    return path
|
|
|
|
|
|
# Show the greedy route from E to G, one label per hop.
print("Path E -> G: ", " -> ".join(best_path(start_label='E', goal_label='G')))
|