87 ex1.py Normal file
@@ -0,0 +1,87 @@
import numpy as np
from typing import List

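# Q-learning route finder on a small maze: 12 states labelled 'A'..'L',
# four actions (UP, DOWN, LEFT, RIGHT). The agent learns Q-values from
# shaped rewards, then follows the greedy policy to reach the goal G.
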
def best_path(start_label: str, goal_label: str) -> List[str]:
    """Follow the greedy policy from start_label until goal_label is reached.

    Relies on the module-level labels, Q and NEXT defined below, so it
    should only be called once Q has been trained.
    """
    s = labels.index(start_label)
    g = labels.index(goal_label)
    path = [start_label]
    while s != g:
        a = np.argmax(Q[s])   # greedy action from the learned Q-values
        nxt = NEXT[s, a]      # label of the successor state
        if nxt is None:
            raise ValueError(f"Impossible action: state {s}, action {a}")
        s = labels.index(nxt)
        path.append(nxt)
    return path


labels = list("ABCDEFGHIJKL")

# Reward matrix: one row per state, one column per action;
# 1 marks a legal move, 0 an illegal one.
R = np.array([
    # UP DOWN LEFT RIGHT
    [0, 0, 0, 1],  # A
    [0, 1, 1, 1],  # B
    [0, 1, 1, 0],  # C
    [0, 1, 0, 0],  # D
    [0, 1, 0, 0],  # E
    [1, 1, 0, 0],  # F
    [1, 0, 0, 1],  # G
    [1, 1, 1, 0],  # H
    [1, 0, 0, 1],  # I
    [1, 0, 1, 1],  # J
    [0, 0, 1, 1],  # K
    [1, 0, 1, 0],  # L
], dtype=float)

# Build a zero-filled matrix with the same shape as R
Q = np.zeros_like(R)

# Given the current state and the chosen action, we need s_{t+1}
# (the next state).
NEXT = np.array([
    # UP DOWN LEFT RIGHT
    [None, None, None, 'B'],   # A
    [None, 'F', 'A', 'C'],     # B
    [None, 'G', 'B', None],    # C
    [None, 'H', None, None],   # D
    [None, 'I', None, None],   # E
    ['B', 'J', None, None],    # F
    ['C', None, None, 'H'],    # G
    ['D', 'L', 'G', None],     # H
    ['E', None, None, 'J'],    # I
    ['F', None, 'I', 'K'],     # J
    [None, None, 'J', 'L'],    # K
    ['H', None, 'K', None],    # L
])
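
# Consistency check (added as a sketch; the tables above already satisfy
# it): every move with a positive reward must have a defined successor,
# otherwise best_path could hit a None transition.
assert all(NEXT[s, a] is not None
           for s in range(R.shape[0])
           for a in np.where(R[s] > 0)[0])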

# Hyperparameters
gamma = 0.75    # discount factor
alpha = 0.90    # learning rate
n_iters = 1_000

rng = np.random.default_rng(0)

# Raise the reward for the moves that lead into G (C DOWN and H LEFT).
goal_opt1 = labels.index('C')
down_index = 1
goal_opt2 = labels.index('H')
left_index = 2
R_goal = R.copy()
R_goal[goal_opt1, down_index] = 1_000.0
R_goal[goal_opt2, left_index] = 1_000.0
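
# With a reward of 1_000 on the two moves into G (versus 1 elsewhere) and
# gamma < 1, the discounted return is dominated by how quickly a path
# reaches G, so the greedy policy converges to the shortest route.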

for _ in range(n_iters):
    s = rng.integers(0, R.shape[0])        # random current state
    actions = np.where(R_goal[s] > 0)[0]   # valid actions
    if actions.size == 0:
        continue
    a = rng.choice(actions)                # random valid action
    s_next = labels.index(NEXT[s, a])      # transition to the next state
    # Temporal-difference update:
    # Q(s, a) += alpha * (R(s, a) + gamma * max_a' Q(s', a') - Q(s, a))
    TD = R_goal[s, a] + gamma * Q[s_next].max() - Q[s, a]
    Q[s, a] += alpha * TD

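# Optional sanity check (assumes 1_000 random updates are enough to
# converge on this small maze): the greedy walk from every state should
# end at G without hitting an undefined move.
for start in labels:
    assert best_path(start, 'G')[-1] == 'G'
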
print("Q matrix:")
print(Q)

print("Path E -> G: ", " -> ".join(best_path('E', 'G')))