import numpy as np
import pandas as pd

labels = list("ABCDEFGHIJKL")

R = np.array([
    [0,1,0,0,0,0,0,0,0,0,0,0],  # A
    [1,0,1,0,0,1,0,0,0,0,0,0],  # B
    [0,1,0,0,0,0,1,0,0,0,0,0],  # C
    [0,0,0,0,0,0,0,1,0,0,0,0],  # D
    [0,0,0,0,0,0,0,0,1,0,0,0],  # E
    [0,1,0,0,0,0,0,0,0,1,0,0],  # F
    [0,0,1,0,0,0,0,1,0,0,0,0],  # G
    [0,0,0,1,0,0,1,0,0,0,0,1],  # H
    [0,0,0,0,1,0,0,0,0,1,0,0],  # I
    [0,0,0,0,0,1,0,0,1,0,1,0],  # J
    [0,0,0,0,0,0,0,0,0,1,0,1],  # K
    [0,0,0,0,0,0,0,1,0,0,1,0],  # L
], dtype=float)

# Q-table with the same shape as R, initialised to zeros
Q = np.zeros_like(R)

print(type(R))      # <class 'numpy.ndarray'>
print(R.ndim)       # 2 -> 2-D matrix
print(R.shape)      # (12, 12) -> 12 rows, 12 columns
print(R.dtype)      # float64
print(R.size)       # 144 elements
print(R.strides)    # e.g. (96, 8)

# Hyperparameters
gamma = 0.75        # discount factor
alpha = 0.90        # learning rate
n_iters = 1000
rng = np.random.default_rng(0)

# Train Q-learning for goal 'G'
goal_label = 'G'
goal = labels.index(goal_label)
R_goal = R.copy()
R_goal[goal, goal] = 1000.0  # large reward for staying at the goal

for _ in range(n_iters):
    s = rng.integers(0, R.shape[0])        # random current state
    actions = np.where(R_goal[s] > 0)[0]   # valid actions from s
    if actions.size == 0:
        continue
    a = rng.choice(actions)                # random valid action
    s_next = a                             # transition to the next state

    # Temporal-difference error
    TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
    # Bellman update
    Q[s, a] += alpha * TD

def best_path(start_label: str, goal_label: str):
    s = labels.index(start_label)
    g = labels.index(goal_label)
    path = [start_label]
    while s != g:
        s = np.argmax(Q[s])   # greedy action, which is also the next state
        path.append(labels[s])
    return path

print("Path E -> G: ", " -> ".join(best_path('E', 'G')))
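
# --- Optional inspection (a minimal sketch, not part of the training loop) ---
# Since pandas is already imported above, the learned Q-table can be shown with
# its state labels. The names used here (the labelled DataFrame and the loop
# over all starts) are illustrative additions, not from the original script,
# and the printed values depend on the random seed and number of iterations.
print(pd.DataFrame(Q, index=labels, columns=labels).round(1))

# Sanity check: greedy path from every state to the goal. This assumes the
# Q-table has converged enough that best_path terminates for every start.
for start in labels:
    print(f"Path {start} -> {goal_label}: ", " -> ".join(best_path(start, goal_label)))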