feat: add bug fixes and Viterbi algorithm

This commit is contained in:
Namu
2026-05-23 21:42:27 +02:00
parent 76121b8c68
commit 73a940ae20
3 changed files with 64 additions and 8 deletions

View File

@@ -16,7 +16,7 @@ class HMM:
# B # B
emission_matrix: np.ndarray emission_matrix: np.ndarray
def __init__(self, emission_matrix_file_name: str, numeric_text: np.ndarray): def __init__(self, emission_matrix_file_name: str|None, numeric_text: np.ndarray):
""" """
/!\\ long /!\\ long
@@ -32,13 +32,21 @@ class HMM:
self.initial_probabilities = np.zeros(26) self.initial_probabilities = np.zeros(26)
self.initial_probabilities[::] = 1 / 26 # les probabilités initiales sont 1/26 pour les 26 lettres self.initial_probabilities[::] = 1 / 26 # les probabilités initiales sont 1/26 pour les 26 lettres
def generate_emission_matrix(self, file_name) -> None: def generate_emission_matrix(self, file_name: str|None) -> None:
""" """
Lis le fichier de la matrice d'émission et la retourne Lis le fichier de la matrice d'émission et l'assigne à l'attribut de la classe qui y correspond.
sous forme de dataframe pandas. Si le nom de fichier n'ai pas donné, une matrice identité est utilisée à la place
La matrice est sous format numpy.
:param file_name: :param file_name:
:return: :return:
""" """
if file_name is None:
self.emission_matrix = np.zeros(shape=(26,26))
for i in range(26):
self.emission_matrix[i, i] = 1
else:
self.emission_matrix = pd.read_excel(file_name).iloc[:, 1:].to_numpy(dtype=float) self.emission_matrix = pd.read_excel(file_name).iloc[:, 1:].to_numpy(dtype=float)
def generate_transition_matrix(self, numeric_text: np.ndarray) -> None: def generate_transition_matrix(self, numeric_text: np.ndarray) -> None:
@@ -107,7 +115,7 @@ class HMM:
N = len(self.initial_probabilities) N = len(self.initial_probabilities)
beta = np.ones(N) beta = np.ones(N)
T = len(O) T = len(O)
# On remonte le temps de T-2 à 0
for t in range(T - 2, -1, -1): for t in range(T - 2, -1, -1):
new_beta = np.zeros(N) new_beta = np.zeros(N)
for i in range(N): for i in range(N):
@@ -117,3 +125,35 @@ class HMM:
# résultat somme de pi_i * b_i(o_1) * beta_1(i) # résultat somme de pi_i * b_i(o_1) * beta_1(i)
return np.sum([self.initial_probabilities[i] * self.emission_matrix[i, O[0]] * beta[i] for i in range(N)]), beta return np.sum([self.initial_probabilities[i] * self.emission_matrix[i, O[0]] * beta[i] for i in range(N)]), beta
def viterbi(self, O: list[int]) -> list[int]:
    """
    Return the most likely hidden-state sequence for the observations ``O``.

    Adapted from https://en.wikipedia.org/wiki/Viterbi_algorithm, which is
    easier to read (if less concise) than the version in the assignment.
    Variable names follow the assignment's notation as closely as possible.

    :param O: observation indices; each one selects a column of
        ``self.emission_matrix``.
    :return: one state index per observation, or ``[]`` when ``O`` is empty.
    """
    T = len(O)
    if T == 0:
        # Nothing to decode: without this guard O[0] raises IndexError.
        return []

    N = len(self.initial_probabilities)
    # dzeta[t, j]: probability of the best path that ends in state j at time t.
    dzeta = np.zeros((T, N))
    # psi[t, j]: predecessor state of j on that best path.
    psi = np.zeros((T, N), dtype=int)

    # Initialisation: pi_j * b_j(o_1)
    dzeta[0] = self.initial_probabilities * self.emission_matrix[:, O[0]]

    # Recursion: extend the best path into every state j at each time step.
    for t in range(1, T):
        for j in range(N):
            trans_probs = dzeta[t - 1] * self.transition_matrix[:, j]
            best_r = np.argmax(trans_probs)
            dzeta[t, j] = trans_probs[best_r] * self.emission_matrix[j, O[t]]
            psi[t, j] = best_r

    # Backtracking: start from the best final state and follow psi backwards.
    best_path = np.zeros(T, dtype=int)
    best_path[T - 1] = np.argmax(dzeta[T - 1])
    for t in range(T - 2, -1, -1):
        best_path[t] = psi[t + 1, best_path[t + 1]]

    return best_path.tolist()

11
main.py
View File

@@ -52,3 +52,14 @@ if __name__ == '__main__':
print('Résultat sur les textes ----------------------------------------------') print('Résultat sur les textes ----------------------------------------------')
print(f'texte 1 {text_1_result}, texte 2 {text_2_result}, texte 3 {text_3_result}') print(f'texte 1 {text_1_result}, texte 2 {text_2_result}, texte 3 {text_3_result}')
lambda_fr_identity = HMM(numeric_text=numeric_french_text, emission_matrix_file_name=None)
lambda_en_identity = HMM(numeric_text=numeric_english_text, emission_matrix_file_name=None)
lambda_it_identity = HMM(numeric_text=numeric_italian_text, emission_matrix_file_name=None)
text_1_result = utils.forward_detection_with_text(lambda_fr_identity, lambda_en_identity, lambda_it_identity, words_text_1)
text_2_result = utils.forward_detection_with_text(lambda_fr_identity, lambda_en_identity, lambda_it_identity, words_text_2)
text_3_result = utils.forward_detection_with_text(lambda_fr_identity, lambda_en_identity, lambda_it_identity, words_text_3)
print('Résultat avec une matrice identité -----------------------------------')
print(f'texte 1 {text_1_result}, texte 2 {text_2_result}, texte 3 {text_3_result}')

View File

@@ -6,7 +6,12 @@ from HMM import HMM
def normalize_probabilities(prob_fr: float, prob_en: float, prob_it: float, searched: float) -> float:
    """
    Return ``searched`` as a share of the three language probabilities.

    :param prob_fr: probability obtained with the French model.
    :param prob_en: probability obtained with the English model.
    :param prob_it: probability obtained with the Italian model.
    :param searched: the probability to normalise.
    :return: ``searched / (prob_fr + prob_en + prob_it)``, or ``searched``
        unchanged when that sum is 0.
    """
    # Named `total` so the builtin `sum` is not shadowed.
    total = prob_fr + prob_en + prob_it
    if total == 0:
        # With an identity emission matrix the sum is very likely to be 0;
        # return `searched` as-is (expected to be 0 too) instead of dividing by zero.
        return searched
    return searched / total
def forward_detection(hmm_fr: HMM, hmm_en: HMM, hmm_it: HMM, O: list[int]) -> tuple[str, float, list[float]]: def forward_detection(hmm_fr: HMM, hmm_en: HMM, hmm_it: HMM, O: list[int]) -> tuple[str, float, list[float]]:
@@ -95,7 +100,7 @@ def forward_detection_with_text(hmm_fr: HMM, hmm_en: HMM, hmm_it: HMM, O: list[l
french_prob_count = english_prob_count = italian_prob_count = 0 french_prob_count = english_prob_count = italian_prob_count = 0
for word in O: for word in O:
lang, _ = forward_detection(hmm_fr, hmm_en, hmm_it, word) lang, _, _ = forward_detection(hmm_fr, hmm_en, hmm_it, word)
match lang: match lang:
case 'Français': case 'Français':
french_prob_count += 1 french_prob_count += 1