Feat: Adds the HMM and detection for one word

2026-05-18 11:49:59 +02:00
commit 302b5f5d46
23 changed files with 1285 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Ignored default folder with query files
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,20 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N803" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <list>
+          <option value="pandas" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.14 (tp_mapel_1)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.14 (tp_mapel_1)" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/tp_mapel_1.iml" filepath="$PROJECT_DIR$/.idea/tp_mapel_1.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/tp_mapel_1.iml
+++ b/.idea/tp_mapel_1.iml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.14 (tp_mapel_1)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/HMM/init.py
+++ b/HMM/init.py
@@ -0,0 +1,119 @@
+"""
+Ce module contient la classe qui représente un HMM
+"""
+import pandas as pd
+import numpy as np
+from pandas import DataFrame
+
+
+class HMM:
+    """
+    Hidden Markov model whose hidden states are the 26 lowercase letters.
+
+    One instance is built per training corpus: the transition matrix is
+    estimated from that corpus, the emission matrix is read from a
+    spreadsheet and the initial distribution is uniform. ``forward`` and
+    ``backward`` both compute the likelihood of an observed word.
+    """
+    # Number of hidden states (one per letter of the alphabet).
+    N_LETTERS: int = 26
+    # S -- not read by any method of this class.
+    # NOTE(review): these look like the candidate languages rather than the hidden states; confirm.
+    states: list[str] = ["French", "English", "Italian"]
+    # pi
+    initial_probabilities: np.ndarray
+    # A
+    transition_matrix: np.ndarray
+    # B
+    emission_matrix: np.ndarray
+
+    def __init__(self, emission_matrix_file_name: str, numeric_text: np.ndarray):
+        """
+        Build every component of the model (can be slow on a large corpus).
+
+        :param emission_matrix_file_name: spreadsheet holding the emission matrix
+        :param numeric_text: training words, each one a sequence of alphabet indexes
+            (a list of lists works as well as a NaN-padded 2-D array)
+        """
+        self.generate_emission_matrix(emission_matrix_file_name)
+        self.generate_initial_probabilities()
+        self.generate_transition_matrix(numeric_text)
+
+    def generate_initial_probabilities(self):
+        """Set pi to the uniform distribution over the 26 letters."""
+        self.initial_probabilities = np.full(self.N_LETTERS, 1 / self.N_LETTERS)
+
+    def generate_emission_matrix(self, file_name) -> None:
+        """
+        Read the emission matrix from a spreadsheet and store it in
+        ``self.emission_matrix`` as a float numpy array.
+
+        The first column of the sheet is dropped.
+        NOTE(review): presumably that column holds the row labels; confirm against the file.
+
+        :param file_name: path of the spreadsheet
+        :return: None, the matrix is stored on the instance
+        """
+        self.emission_matrix = pd.read_excel(file_name).iloc[:, 1:].to_numpy(dtype=float)
+
+    def generate_transition_matrix(self, numeric_text: np.ndarray) -> None:
+        """
+        Estimate the transition matrix by counting letter-to-letter
+        transitions inside each word, then normalising every row.
+
+        :param numeric_text: training words as sequences of alphabet indexes
+        :return: None, the matrix is stored in ``self.transition_matrix``
+        """
+        size = self.N_LETTERS
+        counts = np.zeros((size, size), dtype=float)
+
+        for word in numeric_text:
+            for current, following in zip(word[:-1], word[1:]):
+                # A dataframe-backed input pads short words with NaN; those cells are not letters.
+                if np.isnan(current) or np.isnan(following):
+                    continue
+                row, column = int(current), int(following)
+                # A negative index (the -1 "unknown letter" sentinel) would wrap around
+                # and be counted as the last letter, so it is ignored.
+                if row < 0 or column < 0:
+                    continue
+                counts[row, column] += 1
+
+        row_sums = counts.sum(axis=1, keepdims=True)
+        # Rows without any observed transition stay at zero instead of dividing by zero.
+        self.transition_matrix = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0)
+
+    def _validated_observations(self, O) -> list[int]:
+        """Return ``O`` as a list of ints, rejecting empty or out-of-range input."""
+        observations = [int(symbol) for symbol in O]
+        if not observations:
+            raise IndexError("the observation sequence must contain at least one symbol")
+        symbol_count = self.emission_matrix.shape[1]
+        for symbol in observations:
+            # Negative values would silently address columns from the end of the matrix.
+            if not 0 <= symbol < symbol_count:
+                raise IndexError(f"observation {symbol} is outside [0, {symbol_count - 1}]")
+        return observations
+
+    def forward(self, O: list[int]) -> tuple[float, np.ndarray]:
+        """
+        Run the forward algorithm on an observed word.
+
+        :param O: the word to score, as a non-empty sequence of alphabet indexes
+        :return: ``(likelihood, alpha)`` where ``alpha`` is the last forward vector
+            and ``likelihood`` is its sum
+        :raises IndexError: if ``O`` is empty or holds an index outside the emission matrix
+        """
+        observations = self._validated_observations(O)
+        # Number of states
+        N = len(self.initial_probabilities)
+        A = self.transition_matrix[:N, :N]
+        B = self.emission_matrix[:N]
+
+        # alpha_1(i) = pi_i * b_i(o_1)
+        alpha = self.initial_probabilities * B[:, observations[0]]
+        for symbol in observations[1:]:
+            # alpha_{t+1}(j) = b_j(o_{t+1}) * sum_i alpha_t(i) * a_ij
+            alpha = B[:, symbol] * (alpha @ A)
+
+        return float(np.sum(alpha)), alpha
+
+    def backward(self, O: list[int]):
+        """
+        Run the backward algorithm on an observed word.
+
+        :param O: the word to score, as a non-empty sequence of alphabet indexes
+        :return: ``(likelihood, beta)`` where ``beta`` is the backward vector at the
+            first position and ``likelihood`` is sum_i pi_i * b_i(o_1) * beta_1(i)
+        :raises IndexError: if ``O`` is empty or holds an index outside the emission matrix
+        """
+        observations = self._validated_observations(O)
+        N = len(self.initial_probabilities)
+        A = self.transition_matrix[:N, :N]
+        B = self.emission_matrix[:N]
+
+        beta = np.ones(N)
+        # Walk back from the last observation down to the second one.
+        for symbol in reversed(observations[1:]):
+            # beta_t(i) = sum_j a_ij * b_j(o_{t+1}) * beta_{t+1}(j)
+            beta = A @ (B[:, symbol] * beta)
+
+        likelihood = np.sum(self.initial_probabilities * B[:, observations[0]] * beta)
+        return float(likelihood), beta
--- a/HMM/pycache/init.cpython-314.pyc
+++ b/HMM/pycache/init.cpython-314.pyc
--- a/transition/EN.pdf
+++ b/transition/EN.pdf
--- a/transition/FR.pdf
+++ b/transition/FR.pdf
--- a/data_preparation/init.py
+++ b/data_preparation/init.py
@@ -0,0 +1,89 @@
+"""
+Ce module contient les fonctions pour préparer les données.
+
+Cela consiste à:
+- Lire un fichier
+- Nettoyer les données
+- Tout transformer en dataframe d'index de l'alphabet.
+"""
+import re
+import unicodedata
+
+
+def read_file(file_name: str) -> str:
+    """
+    Read a text file and return its content without any cleaning.
+
+    The file is decoded as UTF-8 first. When that fails it is decoded as
+    Windows-1252 (bytes that encoding does not define are replaced), so the
+    result no longer depends on the platform's default encoding.
+
+    :param file_name: path of the file to read
+    :return: the decoded text
+    """
+    try:
+        with open(file_name, encoding='utf-8') as file:
+            return file.read()
+    except UnicodeDecodeError:
+        with open(file_name, encoding='cp1252', errors='replace') as file:
+            return file.read()
+
+
+def parse_data(raw_data: str) -> str:
+    """
+    Lower-case the text and keep only the letters a-z.
+
+    Accented letters are reduced to their base letter ("é" -> "e") so that
+    words are not cut in pieces; every remaining character outside a-z is
+    replaced by a space.
+
+    :param raw_data: text to clean
+    :return: a string of the same shape made only of a-z and spaces
+    """
+    decomposed = unicodedata.normalize('NFKD', raw_data)
+    # After decomposition an accent is a separate combining mark that can be dropped.
+    without_accents = ''.join(char for char in decomposed if not unicodedata.combining(char))
+    return re.sub(r'[^a-z]', ' ', without_accents.lower())
+
+
+def prepare_file(file_name) -> str:
+    """
+    Load a file and return its cleaned content.
+
+    :param file_name: path of the file to load
+    :return: the result of ``parse_data`` applied to the raw content of the file
+    """
+    return parse_data(read_file(file_name))
+
+
+def get_alphabet_index_of(letter: str) -> int:
+    """
+    Return the position of a lowercase letter in the alphabet, seen as an array.
+
+    (e.g. a -> 0, z -> 25)
+
+    :param letter: a single character
+    :return: the index in 0..25, or -1 when ``letter`` is not exactly one
+        character in a-z (empty string, several characters, uppercase, accent, ...)
+    """
+    # The alphabet is also the set of states of the model.
+    if len(letter) != 1 or not 'a' <= letter <= 'z':
+        return -1
+    return ord(letter) - ord('a')
+
+
+def get_alphabet_index_form_word(word: str) -> list[int]:
+    """
+    Convert a word into the list of alphabet indexes of its characters.
+
+    :param word: the word to convert
+    :return: one index per character, in order, as given by ``get_alphabet_index_of``
+    """
+    return list(map(get_alphabet_index_of, word))
+
+
+def get_text_in_alphabet_index_form(text: str) -> list[list[int]]:
+    """
+    Turn a text into a list of words, each word being a list of integers.
+    Every integer is the index of the character in the alphabet.
+
+    Words are separated by any run of whitespace (spaces, tabs, newlines),
+    so repeated separators never produce empty words.
+
+    :param text: the text to convert
+    :return: one list of alphabet indexes per word, in reading order
+    """
+    return [get_alphabet_index_form_word(word) for word in text.split()]
+
+
+def prepare_data(file_name: str) -> list[list[int]]:
+    """
+    Read a file, clean its content and convert it to alphabet indexes.
+
+    :param file_name: path of the file to process
+    :return: the words of the file, each one as a list of alphabet indexes
+    """
+    cleaned_text = prepare_file(file_name)
+    numeric_words = get_text_in_alphabet_index_form(cleaned_text)
+    return numeric_words
+
--- a/data_preparation/pycache/init.cpython-314.pyc
+++ b/data_preparation/pycache/init.cpython-314.pyc
--- a/english.txt
+++ b/english.txt
--- a/french.txt
+++ b/french.txt
--- a/italian.txt
+++ b/italian.txt
--- a/main.py
+++ b/main.py
@@ -0,0 +1,43 @@
+"""
+Note: I code in english but comment in French !
+"""
+import data_preparation
+from HMM import HMM
+import utils
+
+
+def _report_detection(title: str, score_fr: float, score_en: float, score_it: float) -> None:
+    """Normalise the three likelihoods and print them together with the detected language."""
+    probabilities = {
+        'FR': utils.normalize_probabilities(score_fr, score_en, score_it, score_fr),
+        'EN': utils.normalize_probabilities(score_fr, score_en, score_it, score_en),
+        'IT': utils.normalize_probabilities(score_fr, score_en, score_it, score_it),
+    }
+    # The conclusion is the language with the highest probability, not the probability itself.
+    detected = max(probabilities, key=probabilities.get)
+
+    print(title)
+    print(f"FR={probabilities['FR']}, EN={probabilities['EN']}, IT={probabilities['IT']}, "
+          f"Conclusion={detected} ({probabilities[detected]})")
+
+
+if __name__ == '__main__':
+    # One model per language, each trained on its own corpus.
+    lambda_fr = HMM('matrice_emission.xls', data_preparation.prepare_data('french.txt'))
+    lambda_en = HMM('matrice_emission.xls', data_preparation.prepare_data('english.txt'))
+    lambda_it = HMM('matrice_emission.xls', data_preparation.prepare_data('italian.txt'))
+
+    # The conversion returns one index list per word; the input holds the single word 'probablement'.
+    word = data_preparation.get_text_in_alphabet_index_form('probablement')[0]
+
+    res_fr, _ = lambda_fr.forward(word)
+    res_en, _ = lambda_en.forward(word)
+    res_it, _ = lambda_it.forward(word)
+    _report_detection('Résultats forward ---------------------------------------------------',
+                      res_fr, res_en, res_it)
+
+    res_back_fr, _ = lambda_fr.backward(word)
+    res_back_en, _ = lambda_en.backward(word)
+    res_back_it, _ = lambda_it.backward(word)
+    _report_detection('Résultat backward ---------------------------------------------------',
+                      res_back_fr, res_back_en, res_back_it)
--- a/matrice_emission.xls
+++ b/matrice_emission.xls
--- a/texte_1.txt
+++ b/texte_1.txt
@@ -0,0 +1,168 @@
+les traitements statistiques de donnees textuelles. (l. lebart, cnrs-enst ; lebart@enst.fr)
+le materiau statistique « texte » est omnipresent, presque banal, depuis le developpement
+d’internet et de la toile (web). l’etude quantitative et statistique de ces textes semble avoir fait
+irruption recemment, et pourtant les etudes statistiques de textes datent de plusieurs
+decennies, avec notamment en france les travaux de p. guiraud (problemes et methodes de la
+statistique linguistique, puf, 1960), c. muller (principes et methodes de statistique lexicale,
+hachette, 1977) puis de j.p. benzecri (pratique de l’analyse des donnees, tome 3 :
+linguistique et lexicologie, dunod, 1981).
+apres la « stylometrie », consacree à l’etude de la forme des textes, en vue d’identifier un
+auteur ou de dater une oeuvre, sont apparues les techniques de documentation automatique
+(information retrieval en anglais), visant à rechercher dans une base de documents (articles
+scientifiques, resumes, brevets, …) le ou les elements pertinents à partir d’une requête
+exprimee sous forme de textes libres. le champ disciplinaire « traitement du langage
+naturel » est alors apparu, et s’est developpe, au depart, comme un des domaines
+d’application privilegie de l’intelligence artificielle. la complexite du materiau, le besoin
+d’assimiler d’immenses corpus de textes, la pertinence du concept d’apprentissage ont
+naturellement ouvert ce champ aux methodes statistiques. la statistique multidimensionnelle,
+les chaînes de markov cachees, les methodes d’analyse discriminantes interviennent ainsi
+pour construire les outils de base que sont les moteurs de recherche sur le web, les analyseurs
+morphosyntactiques, les correcteurs orthographiques, ainsi que dans des champs d’application
+pratiques comme le traitement des reponses aux questions ouvertes dans les enquêtes socioeconomiques.
+les questions ouvertes
+il est utile, dans un certain nombre de situations d'enqu<71>te, de laisser ouvertes certaines
+questions, dont les reponses se presenteront donc sous forme de textes de longueurs variables.
+le recueil des donnees
+dans au moins trois situations courantes, l'utilisation d'un questionnement ouvert s'impose :
+pour diminuer ou optimiser la duree de l<>entrevue d<>enqu<71>te
+bien que les reponses libres et les reponses guidees fournissent des informations de natures
+differentes, les premieres sont plus economiques que les secondes en temps d'interview et
+generent moins de fatigue. une simple question ouverte (par exemple : "quelles furent vos
+principales activites dimanche dernier ?") peut remplacer de longues listes d'items.
+comme complement à des questions fermees
+il s'agit le plus souvent de la question: "pourquoi ?". les explications concernant une reponse
+dejà donnee doivent necessairement être spontanee. une batterie d'items risquerait de
+proposer de nouveaux arguments qui pourraient nuire à l'authenticite de l'explication. l'utilite
+de la question pourquoi ? a ete soulignee par de nombreux auteurs, et ce sont en fait les
+difficultes et le coût de l'exploitation qui en limitent l'usage. elle seule permet en effet de
+savoir si les differentes categories de personnes interrogees ont compris la question fermee de
+la même façon.
+pour recueillir une information qui doit, par nature, <20>tre spontanee
+les questionnaires des enqu<71>tes de marketing abondent en questions de ce type. citons par
+exemple : "qu'avez-vous retenu de cette campagne publicitaire ?", "que pensez-vous de cette
+voiture ?". notons cependant que les questions ouvertes sont considerees comme peu
+adaptees aux problemes de memorisation de comportement. "quels magazines avez-vous lus
+la semaine derniere ?", "quelles sont les dernieres emissions de television que vous avez
+aimees ?". pour ces questions qui font l'objet d'enqu<71>tes periodiques, il a ete prouve maintes
+fois que les questions fermees donnent des taux d'oubli plus faibles. en revanche, quand la
+qualite de la memorisation est en jeu, la forme ouverte reste indispensable.
+voici quatre exemples de reponses <20> la question <20> quelle est pour vous la chose la plus
+importante dans la vie ? <20> (question posee <20> des echantillons d<>environ mille personnes dans
+sept pays en 1991).
+1) la sante, ne pas manquer d'argent, avoir une bonne ambiance familiale, je voudrais
+pouvoir aider les enfants abandonnes, leur redonner le go<67>t <20> la vie, pouvoir aider les
+personnes <20>gees handicapees, secourir les gens autour de soi.
+2) c'est de faire ce qu'on veut. lire, voyager si je pouvais. les loisirs si on pouvait.
+3) la sante puisqu'il faut toujours travailler quand on est commer<65>ant. une bonne entente en
+famille. avoir assez d'argent pour vivre.
+4) la famille, ma famille, mon foyer, vivre avec la societe : mon entourage les voisins, pour
+faire quelque chose qu'il y ait moins de malheureux, donner du travail aux jeunes surtout.
+ces exemples illustrent <20> la fois la complexite et la richesse des reponses.
+les unites statistiques
+les programmes travaillent <20> partir du texte brut, en extrayant automatiquement des unites
+statistiques, la plupart du temps des formes graphiques (sequences de caracteres nonseparateurs).
+on utilise le vocable forme graphique parce que le mot <20> mot <20> lui-m<>me est
+ambigu. il designe en effet selon les contextes l<>occurrence d<>un mot (quand on dit qu<71>un
+texte a huit cent mots, on parle bien s<>r d<>occurrences, et non de mots differents), le type (qui
+correspond <20> la forme graphique) et le lemme (avoir est le lemme de avait, et, dans certains
+cas seulement, de avions). la premiere reponse de l<>exemple ci-dessus contient 38
+occurrences, mais la forme graphique <20> les <20> appara<72>t trois fois, <20> pouvoir <20> appara<72>t deux
+fois. le lemme de <20> bonne <20> est bon (le masculin singulier, selon une convention fran<61>aise),
+celui de <20> voudrais <20> est <20> vouloir <20>.
+dans le cas de l<>exemple precedent, pour 1009 reponses, on obtient 14337 occurrences de
+1394 formes distinctes (ou types). il est bien connu que la distribution de frequence des mots
+est tres dissymetrique (loi dite de zipf, apparentee <20> la distribution de pareto). ainsi, en ne
+retenant que les formes apparaissant au moins 20 fois, il reste un texte de 10 994 formes, avec
+seulement 97 formes distinctes (ainsi 7 % des mots distincts correspondent <20> 77 % du texte
+global). en particulier, pres de la moitie des formes grahiques distinctes n<>apparaissent qu<71>une
+fois ( ce sont les <20> hapax <20>).
+le post-codage
+le pretraitement empirique appele "post-codage" permet de fermer a posteriori les questions
+ouvertes. cette technique courante consiste <20> construire une batterie d'items <20> partir d'un sousechantillon
+de reponses, puis <20> codifier l'ensemble des reponses de fa<66>on <20> remplacer la
+question ouverte par une ou plusieurs questions fermees. pour l<>exemple ci-dessus, la seconde
+reponse, la plus simple, donnerait les items <20> lecture <20>, voyage <20>, <20> loisirs <20>, sous reserve que
+ces items apparaissent avec une certaine frequence dans l<>echantillon de reponses. en
+revanche la premiere reponse est plus delicate <20> post-coder.
+les outils statistique de base
+les outils de base sont la selection de formes caracteristiques, la selection de reponses
+modales, l'analyse des correspondances et la classification des tableaux lexicaux.
+formes ou segments caracteristiques (ou specificites)
+les formes caracteristiques sont les formes "anormalement" frequentes dans les reponses d'un
+groupe d'individus (technique propose par p. lafon en 1980). un test elementaire fonde sur la
+loi hypergeometrique permet de selectionner les mots (formes graphiques ou lemmes) dont la
+frequence dans un groupe est notablement superieure (ou inferieure pour les mots anticaracteristiques)
+<EFBFBD> la frequence moyenne dans le corpus. il s<>agit de test classique de
+comparaisons de frequences, maisla repetition de ce test conduit <20> prendre des seuils de
+signification tres severes (phenomene de comparaisons multiples bien connu des statisticiens).
+dans l<>exemple evoque plus haut, la frequence moyenne du mot travail dans le corpus etait de
+3.4 %; pour le groupe des femmes de plus de 55 ans, la frequence n<>est que de 1.2 %. cette
+difference est en fait hautement significative ( on peut exprimer le test de comparaison de
+frequences en termes d<>ecart-types : dans l<>hypothese d<>homogeneite des frequences, la
+valeur1.2% est <20> 4.5 ecart-types de la valeur moyenne 3.4). comme il s<>agit d<>une frequence
+anormalement faible, on parlera de mots anti-caracteristiques. [l<>individu statistique est ici
+l<EFBFBD>occurrence de mots. les femmes de plus de 55 ans ont emis 1349 mots dans leurs reponses.
+la variance de la frequence d<>un mot dont la frequence <20>theorique<75> est de 0.034 est donnee
+par la formule classique 0.034(1 <20> 0.034) /1349. on voit dans ces conditions qe la frequence
+observee de 0.012 est <20> 4.5 ecart-types de 0.034].
+les selections des reponses modales
+pour un groupe d'individus donne, et donc pour le regroupement de reponses correspondant,
+les reponses modales (ou encore phrases caracteristiques, ou documents-type, la terminologie
+variant selon les domaines d'application) sont des reponses originales du corpus de base, ayant
+la propriete de caracteriser au mieux le groupe. on peut, pour chaque regroupement, calculer
+la distance du profil lexical d'un individu au profil lexical moyen du groupement. on peut
+ensuite classer les distances par ordre croissant, et donc selectionner les reponses les plus
+representatives au sens du profil lexical, qui correspondront aux plus petites distances. on
+obtient ainsi une sorte de resume des reponses de chaque regroupement, forme de reponses
+originales (l. lebart et a. salem, statistique textuelle, dunod, 1994). toujours dans le cas de
+notre exemple, <20>etre heureux, avoir un bon travail, reussite professionnelle et familiale<6C> est
+ainsi une reponse caracteristique des jeunes hommes; <20>la sante, la famille<6C> est une reponse
+caracterisant les plus <20>ges. on utilise en pratique plusierus reponses caracteristiques par
+groupe.
+analyse des correspondances et classification
+le volume des donnees demande que l<>on fasse appel <20> de puissants outils de description. les
+methodes d<>analyses des correspondances et de classification peuvent decrire les tables de contingence
+croisant les reponses et les formes graphiques, ou des groupes de reponses (par exemple regroupement
+selon le niveau d'instruction des repondants) et les formes graphiques. elles permettent de visualiser
+sous forme de series de cartes planes (ou de dendrogrammes dans le cas des methodes de
+classification, ou de cartes auto-associatives de kohonen, methode <20>neuronale<6C> de visualisation) les
+associations entre mots (formes) et groupes ou modalites. ainsi, une visualisation des proximites entre
+mots et categories socioprofessionnelles pourra aider la lecture des reponses de chacune de ces
+categories.
+conclusions et ouvertures
+pour des reponses simples et stereotypees, nous l<>avons vu, les procedures de post-codage
+peuvent fonctionner. mentionnons cependant parmi les defauts de ce type de traitement :
+la mediation du chiffreur: les decisions <20> prendre sont parfois difficiles.
+la qualite de l'expression, le registre du vocabulaire, la tonalite generale de l'entretien sont
+des elements d'analyse perdus lors d'un post-codage (doit-on coder differemment <20> je ne sais
+pas<EFBFBD> et <20>je prefere ne rien dire<72> ?.
+les reponses composites, complexes, d'une grande diversite, sont tres difficile <20> post-coder,
+et c'est souvent dans ce cas que la valeur heuristique des reponses libres est la plus grande.
+les reponses peu frequentes, originales, peu claires en premiere lecture sont considerees
+comme du <20>bruit<69>, et affectees <20> des items residuels (<28>autres<65>) qui sont donc tres
+heterogenes et sont difficiles <20> manipuler.
+sans qu<71>il soit necessaire de proceder <20> un post codage, on peut, actuellement, <20> partir d'une
+ensemble de textes, et d'un seuil de frequence pour les formes graphiques, obtenir une
+visualisation des proximites entre textes (vis-<2D>-vis de leurs profils lexicaux) et entre formes
+graphiques (vis-<2D>-vis de leur repartition dans les textes). l'enrichissement des unites
+statistiques par les segments repetes,(cf. a. salem, pratique des segments repetes,
+klincksieck, 1987), leurs regroupements par categorisation morphologique, l'utilisation des
+formes caracteristiques ou specificites, l'adjonction des reponses modales ou des phrases ou
+unites de contexte caracteristiques ont perfectionne ces approches, et mis <20> la disposition de
+beaucoup d'utilisateurs des methodes et des logiciels utiles. dans certains domaines
+d'application precis (comme le traitement automatique des reponses aux questions ouvertes,
+qui nous interesse ici), l'efficacite de la methode, comme complement des approches
+traditionnelles, est reconnue.
+parallelement aux travaux relevant de l<>industrie de la langue, que nous avons evoques plus
+haut, et qui relevent d<>une ingenierie statistique complexe, il existe donc des applications
+textuelles de la statistique qui restent <20> portee de main. elles necessitent certes des logiciels
+specifiques, mais la nature familiere et vivante du materiau de base compense en quelque
+sorte la relative complexite des traitements et les difficultes d<>interpretation.
+proche des bases de donnees, de l<>intelligence artificielle et des reseaux de neurones, de la
+theorie de l<>apprentissage, des techniques recentes d<>extraction et de gestion des
+connaissances, le domaine textuel illustre bien la polyvalence et la puissance de la
+methodologie statistique. m<>me quand les methodes prennent parfois les noms plus exotiques
+de fouille de texte ou de text mining, le statisticien est toujours sollicite quand il s<>agit de
+conna<EFBFBD>tre la portee reelle des faits observes et des traits structuraux obtenus, de savoir ce que
+l<EFBFBD>on a le droit de dire ou le devoir de ne pas dire, c<>est-<2D>-dire finalement de donner un statut
+scientifique aux resultats.
--- a/texte_2.txt
+++ b/texte_2.txt
@@ -0,0 +1,758 @@
+text classification using machine learning techniques
+m. ikonomakis
+department of mathematics
+university of patras, greece
+ikonomakis@mailbox.gr
+s. kotsiantis
+department of mathematics
+university of patras, greece
+sotos@math.upatras.gr
+v. tampakas
+technological educational
+institute of patras, greece
+tampakas@teipat.gr
+abstract: automated text classification has been considered as a vital method to manage and process a vast
+amount of documents in digital forms that are widespread and continuously increasing. in general, text
+classification plays an important role in information extraction and summarization, text retrieval, and question-answering.
+this paper illustrates the text classification process using machine learning techniques. the
+references cited cover the major theoretical issues and guide the researcher to interesting research directions.
+key-words: text mining, learning algorithms, feature selection, text representation
+1 introduction
+automatic text classification has always been an
+important application and research topic since the
+inception of digital documents. today, text
+classification is a necessity due to the very large
+amount of text documents that we have to deal with
+daily.
+in general, text classification includes topic based
+text classification and text genre-based
+classification. topic-based text categorization
+classifies documents according to their topics [33].
+texts can also be written in many genres, for
+instance: scientific articles, news reports, movie
+reviews, and advertisements. genre is defined on
+the way a text was created, the way it was edited,
+the register of language it uses, and the kind of
+audience to which it is addressed. previous work on
+genre classification recognized that this task differs
+from topic-based categorization [13].
+typically, most data for genre classification are
+collected from the web, through newsgroups,
+bulletin boards, and broadcast or printed news.
+they are multi-source, and consequently have
+different formats, different preferred vocabularies
+and often significantly different writing styles even
+for documents within one genre. namely, the data
+are heterogenous.
+intuitively text classification is the task of
+classifying a document under a predefined
+category. more formally, if $d_i$ is a document of the
+entire set of documents $D$ and $\{c_1, c_2, \ldots, c_n\}$ is the
+set of all the categories, then text classification
+assigns one category $c_j$ to a document $d_i$.
+as in every supervised machine learning task, an
+initial dataset is needed. a document may be
+assigned to more than one category (ranking
+classification), but in this paper only researches on
+hard categorization (assigning a single category to
+each document) are taken into consideration.
+moreover, approaches, that take into consideration
+other information besides the pure text, such as
+hierarchical structure of the texts or date of
+publication, are not presented. this is because the
+main issue of this paper is to present techniques
+that exploit the most of the text of each document
+and perform best under this condition.
+sebastiani gave an excellent review of text
+classification domain [25]. thus, in this work apart
+from the brief description of the text classification
+we refer to some more recent works than those in
+sebastiani's article as well as few articles that were
+not referred by sebastiani. in figure 1 is given the
+graphical representation of the text classification
+process.
+.
+fig. 1. text classification process
+the task of constructing a classifier for
+documents does not differ a lot from other tasks of
+machine learning. the main issue is the
+representation of a document [16]. in section 2 the
+document representation is presented. one
+particularity of the text categorization problem is
+read
+document
+tokenize
+text
+stemming
+delete
+stopwords
+vector representation of
+text
+feature selection and/or
+feature transformation
+learning
+algorithm
+that the number of features (unique words or
+phrases) can easily reach orders of tens of
+thousands. this raises big hurdles in applying many
+sophisticated learning algorithms to the text
+categorization.
+thus dimension reduction methods are called for.
+two possibilities exist, either selecting a subset of
+the original features [3], or transforming the
+features into new ones, that is, computing new
+features as some functions of the old ones [10]. we
+examine both in turn in section 3 and section 4.
+after the previous steps a machine learning
+algorithm can be applied. some algorithms have
+been proven to perform better in text classification
+tasks and are more often used; such as support
+vector machines. a brief description of recent
+modification of learning algorithms in order to be
+applied in text classification is given in section 5.
+there are a number of methods to evaluate the
+performance of a machine learning algorithms in
+text classification. most of these methods are
+described in section 6. some open problems are
+mentioned in the last section.
+2 vector space document
+representations
+a document is a sequence of words [16]. so each
+document is usually represented by an array of
+words. the set of all the words of a training set is
+called vocabulary, or feature set. so a document
+can be presented by a binary vector, assigning the
+value 1 if the document contains the feature-word
+or 0 if the word does not appear in the document.
+this can be translated as positioning a document in
+an $\mathbb{R}^{|V|}$ space, where $|V|$ denotes the size of the
+vocabulary $V$.
+not all of the words presented in a document can
+be used in order to train the classifier [19]. there
+are useless words such as auxiliary verbs,
+conjunctions and articles. these words are called
+stopwords. there exist many lists of such words
+which are removed as a preprocess task. this is
+done because these words appear in most of the
+documents.
+stemming is another common preprocessing step.
+one way to reduce the size of the initial feature set
+is to remove misspelled words or words with the same
+stem. a stemmer (an algorithm which performs
+stemming), removes words with the same stem and
+keeps the stem or the most common of them as
+feature. for example, the words "train", "training",
+"trainer" and "trains" can be replaced with "train".
+although stemming is considered by the text
+classification community to amplify the classifiers
+performance, there are some doubts on the actual
+importance of aggressive stemming, such as
+performed by the porter stemmer [25].
+an ancillary feature engineering choice is the
+representation of the feature value [16]. often a
+boolean indicator of whether the word occurred in
+the document is sufficient. other possibilities
+include the count of the number of times the word
+occurred in the document, the frequency of its
+occurrence normalized by the length of the
+document, the count normalized by the inverse
+document frequency of the word. in situations
+where the document length varies widely, it may be
+important to normalize the counts. further, in short
+documents words are unlikely to repeat, making
+boolean word indicators nearly as informative as
+counts. this yields a great savings in training
+resources and in the search space of the induction
+algorithm. it may otherwise try to discretize each
+feature optimally, searching over the number of
+bins and each bin's threshold.
+most of the text categorization algorithms in the
+literature represent documents as collections of
+words. an alternative which has not been
+sufficiently explored is the use of word meanings,
+also known as senses. using several algorithms,
+kehagias et al. compared the categorization
+accuracy of classifiers based on words to that of
+classifiers based on senses [12]. the document
+collection on which this comparison took place is a
+subset of the annotated brown corpus semantic
+concordance. a series of experiments indicated that
+the use of senses does not result in any significant
+categorization improvement.
+3 feature selection
+the aim of feature-selection methods is the
+reduction of the dimensionality of the dataset by
+removing features that are considered irrelevant for
+the classification [6]. this transformation
+procedure has been shown to present a number of
+advantages, including smaller dataset size, smaller
+computational requirements for the text
+categorization algorithms (especially those that do
+not scale well with the feature set size) and
+considerable shrinking of the search space. the
+goal is the reduction of the curse of dimensionality
+to yield improved classification accuracy. another
+benefit of feature selection is its tendency to reduce
+overfitting, i.e. the phenomenon by which a
+classifier is tuned also to the contingent
+characteristics of the training data rather than the
+constitutive characteristics of the categories, and
+therefore, to increase generalization.
+methods for feature subset selection for text
+document classification task use an evaluation
+function that is applied to a single word [27].
+scoring of individual words (best individual
+features) can be performed using some of the
+measures, for instance, document frequency, term
+frequency, mutual information, information gain,
+odds ratio, $\chi^2$ statistic and term strength [3], [30],
+[6], [28], [27]. what is common to all of these
+feature-scoring methods is that they conclude by
+ranking the features by their independently
+determined scores, and then select the top scoring
+features. the most common metrics are presented
+in table 1. the symbolisms that are presented in
+table 1 are described in table 2.
+on the contrary with best individual features
+(bif) methods, sequential forward selection (sfs)
+methods firstly select the best single word
+evaluated by given criterion [20]; then, add one
+word at a time until the number of selected words
+reaches desired k words. sfs methods do not result
+in the optimal words subset but they take note of
+dependencies between words as opposed to the bif
+methods. therefore sfs often give better results
+than bif. however, sfs are not usually used in
+text classification because of their computation cost
+due to large vocabulary size.
+forman has presented a benchmark comparison of 12
+metrics on well known training sets [6]. according
+to forman, bns performed best by wide margin
+using 500 to 1000 features, while information gain
+outperforms the other metrics when the features
+vary between 20 and 50. accuracy 2 performed
+equally well as information gain. concerning the
+performance of chi-square, it was consistently
+worse than information gain. since there is no
+metric that performs constantly better than all
+others, researchers often combine two metrics in
+order to benefit from both metrics [6].
+novovicova et al. used sfs that took into
+account, not only the mutual information between a
+class and a word but also between a class and two
+words [22]. the results were slightly better.
+although machine learning based text
+classification is a good method as far as
+performance is concerned, it is inefficient for it to
+handle the very large training corpus. thus, apart
+from feature selection, many times instance
+selection is needed.
+c a class of the training set
+c the set of classes of the training set
+d a document of the training set
+d or db the set of documents of the training set
+t or w a term or word
+$P(c)$ or $P(c_i)$ the probability of the class $c$ or $c_i$ respectively, i.e. how often the class appears in the
+training set
+$P(\neg c)$ or $P(\bar{c})$ the probability of the class not occurring
+$P(c|t)$ the probability of the class $c$ given that the term $t$ appears; respectively, $P(\bar{c}|t)$
+denotes the probability of class $c$ not occurring, given that the term $t$ appears
+p(c,t) the probability of the class c and term t occurring simultaneously
+h(c) the entropy of the set c
+$df(t_i)$ the document frequency of term $t_i$
+$df_n(t)$ the frequency of term $t$ in documents containing $t$ in every of their $n$ splits
+$\widetilde{df}(t)$ the document frequency, taking into consideration only documents in which $t$ appears
+more than once
+#(c) or #(t) the number of documents which belong to class c or respectively contain the term t
+#(c,t) the number of documents containing term t and belong to class c
+table 2. symbolisms
+guan and zhou proposed a training-corpus
+pruning based approach to speedup the process [8].
+by using this approach, the size of training corpus
+can be reduced significantly while classification
+performance can be kept at a level close to that of
+without training documents pruning according to
+their experiments.
+fragoudis et al. [7] integrated feature and
+instance selection for text classification with even
+better results. their method works in two steps. in
+the first step, their method sequentially selects
+features that have high precision in predicting the
+target class. all documents that do not contain at
+least one such feature are dropped from the training
+set. in the second step, their method searches
+within this subset of the initial dataset for a set of
+features that tend to predict the complement of the
+target class and these features are also selected. the
+sum of the features selected during these two steps
+is the new feature set and the documents selected
+from the first step comprise the training set.
+4 feature transformation
+feature transformation varies significantly from
+feature selection approaches, but like them its
+purpose is to reduce the feature set size [10]. this
+approach does not weight terms in order to discard
+the lower weighted but compacts the vocabulary
+based on feature concurrencies.
+principal component analysis is a well known
+method for feature transformation [38]. its aim is to
+learn a discriminative transformation matrix in
+order to reduce the initial feature space into a lower
+dimensional feature space in order to reduce the
+complexity of the classification task without any
+trade-off in accuracy. the transform is derived
+from the eigenvectors corresponding. the
+covariance matrix of data in pca corresponds to
+the document term matrix multiplied by its
+transpose. entries in the covariance matrix
+represent co-occurring terms in the documents.
+eigenvectors of this matrix corresponding to the
+dominant eigenvalues are now directions related to
+dominant combinations, which can be called "topics" or
+"semantic concepts". a transform matrix
+constructed from these eigenvectors projects a
+document onto these "latent semantic concepts",
+and the new low dimensional representation
+consists of the magnitudes of these projections. the
+eigenanalysis can be computed efficiently by a
+sparse variant of singular value decomposition of
+the document-term matrix [11].
+in the information retrieval community this
+method has been named latent semantic indexing
+(lsi) [23]. this approach is not intuitively
+discernible for a human but has a good
+performance.
+qiang et al [37] performed experiments using k-
+nn lsi, a new combination of the standard k-nn
+method on top of lsi, and applying a new matrix
+decomposition algorithm, semi-discrete matrix
+decomposition, to decompose the vector matrix.
+the experimental results showed that text
+categorization effectiveness in this space was better
+and it was also computationally less costly, because
+it needed a lower dimensional space.
+the authors of [4] present a comparison of the
+performance of a number of text categorization
+methods in two different data sets. in particular,
+they evaluate the vector and lsi methods, a
+classifier based on support vector machines
+(svm) and the k-nearest neighbor variations of
+the vector and lsi models. their results show that
+overall, svms and k-nn lsi perform better than
+the other methods, in a statistically significant way.
+5 machine learning algorithms
+after feature selection and transformation the
+documents can be easily represented in a form that
+can be used by a ml algorithm. many text
+classifiers have been proposed in the literature
+using machine learning techniques, probabilistic
+models, etc. they often differ in the approach
+adopted: decision trees, naive-bayes, rule
+induction, neural networks, nearest neighbors, and
+lately, support vector machines. although many
+approaches have been proposed, automated text
+classification is still a major area of research
+primarily because the effectiveness of current
+automated text classifiers is not faultless and still
+needs improvement.
+naive bayes is often used in text classification
+applications and experiments because of its
+simplicity and effectiveness [14]. however, its
+performance is often degraded because it does not
+model text well. schneider addressed the problems
+and showed that they can be solved by some simple
+corrections [24]. klopotek and woch presented
+results of empirical evaluation of a bayesian
+multinet classifier based on a new method of
+learning very large tree-like bayesian networks
+[15]. the study suggests that tree-like bayesian
+networks are able to handle a text classification
+task in one hundred thousand variables with
+sufficient speed and accuracy.
+support vector machines (svm), when applied to
+text classification provide excellent precision, but
+poor recall. one means of customizing svms to
+improve recall, is to adjust the threshold associated
+with an svm. shanahan and roma described an
+automatic process for adjusting the thresholds of
+generic svm [26] with better results.
+johnson et al. described a fast decision tree
+construction algorithm that takes advantage of the
+sparsity of text data, and a rule simplification
+method that converts the decision tree into a
+logically equivalent rule set [9].
+lim proposed a method which improves
+performance of knn based text classification by
+using well estimated parameters [18]. some
+variants of the knn method with different decision
+functions, k values, and feature sets were proposed
+and evaluated to find out adequate parameters.
+corner classification (cc) network is a kind of
+feed forward neural network for instant document
+classification. a training algorithm, named as
+textcc is presented in [34].
+the level of difficulty of text classification tasks
+naturally varies. as the number of distinct classes
+increases, so does the difficulty, and therefore the
+size of the training set needed. in any multi-class
+text classification task, inevitably some classes will
+be more difficult than others to classify. reasons
+for this may be: (1) very few positive training
+examples for the class, and/or (2) lack of good
+predictive features for that class.
+when training a binary classifier per category in
+text categorization, we use all the documents in the
+training corpus that belong to that category as
+relevant training data and all the documents in the
+training corpus that belong to all the other
+categories as non-relevant training data. it is often
+the case that there is an overwhelming number of
+non relevant training documents especially when
+there is a large collection of categories with each
+assigned to a small number of documents, which is
+typically an "imbalanced data problem". this
+problem presents a particular challenge to
+classification algorithms, which can achieve high
+accuracy by simply classifying every example as
+negative. to overcome this problem, cost sensitive
+learning is needed [5].
+a scalability analysis of a number of classifiers
+in text categorization is given in [32]. vinciarelli
+presents categorization experiments performed over
+noisy texts [31]. by noisy it is meant any text
+obtained through an extraction process (affected by
+errors) from media other than digital texts (e.g.
+transcriptions of speech recordings extracted with a
+recognition system). the performance of the
+categorization system over the clean and noisy
+(word error rate between ~10 and ~50 percent)
+versions of the same documents is compared. the
+noisy texts are obtained through handwriting
+recognition and simulation of optical character
+recognition. the results show that the performance
+loss is acceptable.
+other authors [36] also proposed to parallelize
+and distribute the process of text classification.
+with such a procedure, the performance of
+classifiers can be improved in both accuracy and
+time complexity.
+recently in the area of machine learning the
+concept of combining classifiers is proposed as a
+new direction for the improvement of the
+performance of individual classifiers. numerous
+methods have been suggested for the creation of
+ensemble of classifiers. mechanisms that are used
+to build ensemble of classifiers include: i) using
+different subset of training data with a single
+learning method, ii) using different training
+parameters with a single training method (e.g. using
+different initial weights for each neural network in
+an ensemble), iii) using different learning methods.
+in the context of combining multiple classifiers
+for text categorization, a number of researchers
+have shown that combining different classifiers can
+improve classification accuracy [1], [29].
+comparison between the best individual classifier
+and the combined method, it is observed that the
+performance of the combined method is superior
+[2]. nardiello et al. [21] also proposed algorithms
+in the family of "boosting"-based learners for
+automated text classification with good results.
+6 evaluation
+there are various methods to determine
+effectiveness; however, precision, recall, and
+accuracy are most often used. to determine these,
+one must first begin by understanding if the
+classification of a document was a true positive
+(tp), false positive (fp), true negative (tn), or
+false negative (fn) (see table 3).
+tp determined as a document being classified
+correctly as relating to a category.
+fp determined as a document that is said to be
+related to the category incorrectly.
+fn determined as a document that is not marked
+as related to a category but should be.
+tn documents that should not be marked as being
+in a particular category and are not.
+table 3. classification of a document
+precision ($\pi_i$) is determined as the conditional
+probability that a random document d is classified
+under ci, or what would be deemed the correct
+category. it represents the classifier's ability to place
+a document as being under the correct category as
+opposed to all documents placed in that category,
+both correct and incorrect:
+$$\pi_i = \frac{TP_i}{TP_i + FP_i}$$
+recall ($\rho_i$) is defined as the probability that, if a
+random document dx should be classified under
+category (ci), this decision is taken.
+$$\rho_i = \frac{TP_i}{TP_i + FN_i}$$
+accuracy is commonly used as a measure for
+categorization techniques. accuracy values,
+however, are much less reluctant to variations in
+the number of correct decisions than precision and
+recall:
+$$A_i = \frac{TP_i + TN_i}{TP_i + TN_i + FP_i + FN_i}$$
+many times there are very few instances of the
+interesting category in text categorization. this
+overrepresentation of the negative class in
+information retrieval problems can cause problems
+in evaluating classifiers' performances using
+accuracy. since accuracy is not a good metric for
+skewed datasets, the classification performance of
+algorithms in this case is measured by precision
+and recall [5].
+furthermore, precision and recall are often
+combined in order to get a better picture of the
+performance of the classifier. this is done by
+combining them in the following formula:
+$$F_\beta = \frac{(\beta^2 + 1)\,\pi\rho}{\beta^2\pi + \rho},$$
+where $\pi$ and $\rho$ denote precision and recall
+respectively. $\beta$ is a positive parameter, which
+represents the goal of the evaluation task. if
+precision is considered to be more important than
+recall, then the value of $\beta$ converges to zero. on the
+other hand, if recall is more important than
+precision then $\beta$ converges to infinity. usually $\beta$ is
+set to 1, because in this way equal importance is
+given to both precision and recall.
+reuters corpus volume i (rcv1) is an archive
+of over 800,000 manually categorized newswire
+stories recently made available by reuters, ltd. for
+research purposes [17]. using this collection, we
+can compare the learning algorithms.
+although research in the past years had shown
+that training corpus could impact classification
+performance, little work was done to explore the
+underlying causes. the authors of [35] try to
+propose an approach to build semi-automatically
+high-quality training corpuses for better
+classification performance by first exploring the
+properties of training corpuses, and then giving an
+algorithm for constructing training corpuses semi-automatically.
+7 conclusion
+the text classification problem is an artificial
+intelligence research topic, especially given the
+vast number of documents available in the form of
+web pages and other electronic texts like emails,
+discussion forum postings and other electronic
+documents.
+it has been observed that even for a specified
+classification method, classification performances
+of the classifiers based on different training text
+corpuses are different; and in some cases such
+differences are quite substantial. this observation
+implies that a) classifier performance is relevant to
+its training corpus in some degree, and b) good or
+high quality training corpuses may derive
+classifiers of good performance. unfortunately, up
+to now little research work in the literature has been
+seen on how to exploit training text corpuses to
+improve classifier's performance.
+some important conclusions have not been
+reached yet, including:
+• which feature selection methods are both
+computationally scalable and high-performing
+across classifiers and collections? given the
+high variability of text collections, do such
+methods even exist?
+• would combining uncorrelated, but well-performing
+methods yield a performance
+increase?
+• change the thinking from word frequency
+based vector space to concepts based vector
+space. study the methodology of feature
+selection under concepts, to see if these will
+help in text categorization.
+• make the dimensionality reduction more
+efficient over large corpus.
+moreover, there are two other open problems in
+text mining: polysemy, synonymy. polysemy refers
+to the fact that a word can have multiple meanings.
+distinguishing between different meanings of a
+word (called word sense disambiguation) is not
+easy, often requiring the context in which the word
+appears. synonymy means that different words can
+have the same or similar meaning.
+references:
+[1] bao y. and ishii n., "combining multiple knn
+classifiers for text categorization by
+reducts", lncs 2534, 2002, pp. 340-347
+[2] bi y., bell d., wang h., guo g., greer k.,
+"combining multiple classifiers using
+dempster's rule of combination for text
+categorization", mdai, 2004, 127-138.
+[3] brank j., grobelnik m., milic-frayling n.,
+mladenic d., "interaction of feature selection
+methods and linear classification models",
+proc. of the 19th international conference on
+machine learning, australia, 2002.
+[4] ana cardoso-cachopo, arlindo l. oliveira, an
+empirical comparison of text categorization
+methods, lecture notes in computer science,
+volume 2857, jan 2003, pages 183 - 196
+[5] chawla, n. v., bowyer, k. w., hall, l. o.,
+kegelmeyer, w. p., "smote: synthetic
+minority over-sampling technique," journal
+of ai research, 16 2002, pp. 321-357.
+[6] forman, g., an experimental study of feature
+selection metrics for text categorization.
+journal of machine learning research, 3 2003,
+pp. 1289-1305
+[7] fragoudis d., meretakis d., likothanassis s.,
+"integrating feature and instance selection for
+text classification", sigkdd '02, july 23-26,
+2002, edmonton, alberta, canada.
+[8] guan j., zhou s., "pruning training corpus to
+speedup text classification", dexa 2002, pp.
+831-840
+[9] d. e. johnson, f. j. oles, t. zhang, t. goetz,
+"a decision-tree-based symbolic rule induction
+system for text categorization", ibm systems
+journal, september 2002.
+[10] han x., zu g., ohyama w., wakabayashi
+t., kimura f., accuracy improvement of
+automatic text classification based on
+feature transformation and multi-classifier
+combination, lncs, volume 3309, jan 2004,
+pp. 463-468
+[11] ke h., shaoping m., "text categorization
+based on concept indexing and principal
+component analysis", proc. tencon 2002
+conference on computers, communications,
+control and power engineering, 2002, pp. 51-
+56.
+[12] kehagias a., petridis v., kaburlasos v.,
+fragkou p., "a comparison of word- and
+sense-based text categorization using
+several classification algorithms", jiis,
+volume 21, issue 3, 2003, pp. 227-247.
+[13] b. kessler, g. nunberg, and h. schutze.
+automatic detection of text genre. in
+proceedings of the thirty-fifth acl and
+eacl, pages 32-38, 1997.
+[14] kim s. b., rim h. c., yook d. s. and lim
+h. s., "effective methods for improving naive
+bayes text classifiers", lnai 2417, 2002, pp.
+414-423
+[15] klopotek m. and woch m., "very large
+bayesian networks in text classification",
+iccs 2003, lncs 2657, 2003, pp. 397-406
+[16] leopold, edda & kindermann, jörg, "text
+categorization with support vector machines.
+how to represent texts in input space?",
+machine learning 46, 2002, pp. 423 - 444.
+[17] lewis d., yang y., rose t., li f., "rcv1:
+a new benchmark collection for text
+categorization research", journal of machine
+learning research 5, 2004, pp. 361-397.
+[18] heui lim, improving knn based text
+classification with well estimated parameters,
+lncs, vol. 3316, oct 2004, pages 516 - 523.
+[19] madsen r. e., sigurdsson s., hansen l. k.
+and lansen j., "pruning the vocabulary for
+better context recognition", 7th international
+conference on pattern recognition, 2004
+[20] montanes e., quevedo j. r. and diaz i.,
+"a wrapper approach with support vector
+machines for text categorization", lncs
+2686, 2003, pp. 230-237
+[21] nardiello p., sebastiani f., sperduti a.,
+"discretizing continuous attributes in
+adaboost for text categorization", lncs,
+volume 2633, jan 2003, pp. 320-334
+[22] novovicova j., malik a., and pudil p.,
+"feature selection using improved mutual
+information for text classification",
+sspr&spr 2004, lncs 3138, pp. 1010-
+1017, 2004
+[23] qiang w., xiaolong w., yi g., "a study
+of semi-discrete matrix decomposition for lsi
+in automated text categorization", lncs,
+volume 3248, jan 2005, pp. 606-615.
+[24] schneider, k., techniques for improving
+the performance of naive bayes for text
+classification, lncs, vol. 3406, 2005, 682-
+693.
+[25] sebastiani f., "machine learning in
+automated text categorization", acm
+computing surveys, vol. 34 (1),2002, pp. 1-47.
+[26] shanahan j. and roma n., improving svm
+text classification performance through
+threshold adjustment, lnai 2837, 2003, 361-
+372
+[27] soucy p. and mineau g., "feature
+selection strategies for text categorization",
+ai 2003, lnai 2671, 2003, pp. 505-509
+[28] sousa p., pimentao j. p., santos b. r. and
+moura-pires f., "feature selection algorithms
+to improve documents classification
+performance", lnai 2663, 2003, pp. 288-296
+[29] sung-bae cho, jee-haeng lee, learning
+neural network ensemble for practical text
+classification, lecture notes in computer
+science, volume 2690, aug 2003, pages 1032
+- 1036.
+[30] torkkola k., "discriminative features for
+text document classification", proc.
+international conference on pattern
+recognition, canada, 2002.
+[31] vinciarelli a., "noisy text categorization,
+pattern recognition", 17th international
+conference on (icpr'04) , 2004, pp. 554-557
+[32] y. yang, j. zhang and b. kisiel., "a
+scalability analysis of classifiers in text
+categorization", acm sigir'03, 2003, pp 96-
+103
+[33] y. yang. an evaluation of statistical
+approaches to text categorization. journal of
+information retrieval, 1(1/2):67-88, 1999.
+[34] zhenya zhang, shuguang zhang, enhong
+chen, xufa wang, hongmei cheng, textcc:
+new feed forward neural network for
+classifying documents instantly, lecture
+notes in computer science, volume 3497, jan
+2005, pages 232 - 237.
+[35] shuigeng zhou, jihong guan, evaluation
+and construction of training corpuses for text
+classification: a preliminary study, lecture
+notes in computer science, volume 2553, jan
+2002, page 97-108.
+[36] verayuth lertnattee, thanaruk
+theeramunkong, parallel text categorization
+for multi-dimensional data, lecture notes in
+computer science, volume 3320, jan 2004,
+pages 38 - 41
+[37] wang qiang, wang xiaolong, guan yi, a
+study of semi-discrete matrix decomposition
+for lsi in automated text categorization,
+lecture notes in computer science, volume
+3248, jan 2005, pages 606 - 615.
+[38] zu g., ohyama w., wakabayashi t.,
+kimura f., "accuracy improvement of
+automatic text classification based on feature
+transformation": proc: the 2003 acm
+symposium on document engineering,
+november 20-22, 2003, pp.118-120
--- a/texte_3.txt
+++ b/texte_3.txt
@@ -0,0 +1,30 @@
+Trattamenti statistici dei dati testuali (L. Lebart, CNRS-ENST; lebart@enst.fr)
+Il materiale statistico « testo » è onnipresente, quasi banale, sin dallo sviluppo di Internet e del web. Lo studio quantitativo e statistico di questi testi sembra essere apparso di recente, eppure gli studi statistici sui testi risalgono a diversi decenni fa, in particolare in Francia con i lavori di P. Guiraud (« Problemi e metodi di statistica linguistica », PUF, 1960), C. Muller (« Principi e metodi di statistica lessicale », Hachette, 1977) e successivamente J.P. Benzécri (« Pratica dell'analisi dei dati, vol. 3: Linguistica e lessicologia », Dunod, 1981).
+Dopo la « stilometria », dedicata allo studio della forma dei testi, al fine di identificare un autore o di datare un'opera, sono apparse le tecniche di documentazione automatica (information retrieval in inglese), che mirano a ricercare in un database di documenti (articoli scientifici, riassunti, brevetti, ecc.) gli elementi pertinenti a partire da una richiesta espressa sotto forma di testo libero. Il campo disciplinare « trattamento del linguaggio naturale » è poi emerso e si è sviluppato inizialmente come uno dei campi di applicazione privilegiati dell'intelligenza artificiale. La complessità del materiale, la necessità di assimilare enormi corpus di testi, la rilevanza del concetto di apprendimento hanno naturalmente aperto questo campo ai metodi statistici. La statistica multidimensionale, le catene di Markov nascoste, i metodi di analisi discriminante intervengono quindi per costruire gli strumenti di base che sono i motori di ricerca sul web, gli analizzatori morfosintattici, i correttori ortografici, nonché in campi applicativi pratici come il trattamento delle risposte alle domande aperte nelle indagini socio-economiche.
+Le domande aperte
+In un certo numero di situazioni di indagine, è utile lasciare aperte alcune domande, le cui risposte si presenteranno quindi sotto forma di testi di lunghezza variabile.
+La raccolta dei dati
+In almeno tre situazioni comuni, l'uso di domande aperte è necessario:
+Per ridurre o ottimizzare la durata dell'intervista di indagine: Sebbene le risposte libere e quelle guidate forniscano informazioni di natura diversa, le prime sono più economiche in termini di tempo di intervista e generano meno stanchezza. Una semplice domanda aperta (ad esempio: « Quali sono state le tue principali attività domenica scorsa? ») può sostituire lunghe liste di elementi.
+Come complemento a domande chiuse: Di solito si tratta della domanda « Perché? ». Le spiegazioni riguardanti una risposta già data devono necessariamente essere spontanee. Un elenco di elementi potrebbe suggerire nuovi argomenti che potrebbero compromettere l'autenticità dell'argomentazione.
+Per raccogliere informazioni che devono essere spontanee per loro natura: I questionari delle indagini di marketing abbondano di domande di questo tipo. Esempi includono: « Cosa ricordi di questa campagna pubblicitaria? » oppure « Cosa pensi di questa auto? ».
+Unità statistiche
+I programmi lavorano a partire dal testo grezzo, estraendo automaticamente delle unità statistiche, per lo più forme grafiche (sequenze di caratteri non separatori). Si usa il termine forma grafica perché la parola « parola » è ambigua. Può infatti riferirsi all'occorrenza di una parola, al tipo, oppure al lemma (ad esempio, « avere » è il lemma di « aveva »).
+nel caso dell’esempio precedente per 1009 risposte si ottengono 14337 occorrenze di 1394 forme distinte (o tipi) è ben noto che la distribuzione di frequenza delle parole è molto asimmetrica (legge di zipf, simile alla distribuzione di pareto) così selezionando solo le forme che appaiono almeno 20 volte rimane un testo di 10994 forme con solo 97 forme distinte (così il 7% delle parole distinte corrisponde al 77% del testo totale) in particolare quasi la metà delle forme grafiche distinte appare una sola volta (queste sono gli « hapax »)
+il post-codifica
+il pretrattamento empirico chiamato « post-codifica » permette di chiudere a posteriori le domande aperte questa tecnica comune consiste nel costruire una serie di elementi a partire da un sotto-campione di risposte per poi codificare tutte le risposte in modo da sostituire la domanda aperta con una o più domande chiuse per l’esempio sopra la seconda risposta la più semplice darebbe gli elementi « lettura » « viaggi » « tempo libero » a condizione che questi elementi appaiano con una certa frequenza nel campione di risposte tuttavia la prima risposta è più difficile da post-codificare
+gli strumenti statistici di base
+gli strumenti di base comprendono la selezione di forme caratteristiche la selezione di risposte modali l'analisi delle corrispondenze e la classificazione delle tabelle lessicali
+forme o segmenti caratteristici (o specificità)
+le forme caratteristiche sono le forme « anormalmente » frequenti nelle risposte di un gruppo di individui (tecnica proposta da p lafon nel 1980) un test elementare basato sulla legge ipergeometrica permette di selezionare le parole (forme grafiche o lemmi) la cui frequenza in un gruppo è significativamente superiore (o inferiore per le parole anti-caratteristiche) alla frequenza media nel corpus si tratta di test classici di confronto delle frequenze ma la ripetizione di questo test porta a prendere soglie di significatività molto rigide (fenomeno di confronti multipli ben noto agli statistici)
+nell’esempio citato sopra la frequenza media della parola lavoro nel corpus era del 3,4%; per il gruppo delle donne oltre i 55 anni la frequenza è solo dell’1,2% questa differenza è altamente significativa (si può esprimere il test di confronto delle frequenze in termini di scarti standard nella ipotesi di omogeneità delle frequenze il valore del 1,2% è a 4,5 scarti standard dal valore medio del 3,4%) poiché si tratta di una frequenza anormalmente bassa si parlerà di parole anti-caratteristiche
+le selezioni delle risposte modali
+per un gruppo di individui e quindi per il raggruppamento delle risposte corrispondenti le risposte modali (o frasi caratteristiche o documenti-tipo la terminologia varia a seconda dei campi di applicazione) sono risposte originali del corpus di base che caratterizzano meglio il gruppo si può per ogni raggruppamento calcolare la distanza del profilo lessicale di un individuo dal profilo lessicale medio del gruppo poi si possono ordinare le distanze in ordine crescente e selezionare le risposte più rappresentative in termini di profilo lessicale che corrisponderanno alle distanze minori si ottiene così una sorta di sintesi delle risposte di ogni gruppo costituita da risposte originali (l lebart e a salem statistica testuale dunod 1994) sempre nel caso del nostro esempio « essere felice avere un buon lavoro successo professionale e familiare » è una risposta caratteristica dei giovani uomini « la salute la famiglia » è una risposta che caratterizza le persone più anziane in pratica si utilizzano più risposte caratteristiche per ogni gruppo
+analisi delle corrispondenze e classificazione
+il volume dei dati richiede l'uso di potenti strumenti di descrizione i metodi di analisi delle corrispondenze e di classificazione possono descrivere le tabelle di contingenza che incrociano le risposte con le forme grafiche o gruppi di risposte (ad esempio raggruppamenti in base al livello di istruzione dei rispondenti) e le forme grafiche questi strumenti permettono di visualizzare sotto forma di serie di mappe piane (o dendrogrammi nel caso dei metodi di classificazione o mappe auto-organizzate di kohonen metodo « neurale » di visualizzazione) le associazioni tra parole (forme) e gruppi o modalità così una visualizzazione delle prossimità tra parole e categorie socio-professionali può aiutare a leggere le risposte di ciascuna di queste categorie
+conclusioni e prospettive
+per risposte semplici e stereotipate come abbiamo visto le procedure di post-codifica possono funzionare tuttavia tra i difetti di questo tipo di trattamento si possono menzionare:
+la mediazione del codificatore: le decisioni da prendere sono talvolta difficili
+la qualità dell'espressione il registro del vocabolario la tonalità generale dell'intervista sono elementi di analisi persi durante la post-codifica (bisogna codificare in modo diverso « non lo so » e « preferisco non dire nulla »?)
+le risposte composite complesse e molto diverse sono difficili da post-codificare ed è spesso in questi casi che il valore euristico delle risposte libere è maggiore
+le risposte poco frequenti originali e poco chiare a una prima lettura sono considerate come « rumore » e assegnate a categorie residuali (« altre ») che sono quindi molto eterogenee e difficili da gestire senza che sia necessario procedere a una post-codifica attualmente è possibile a partire da un insieme di testi e da una soglia di frequenza per le forme grafiche ottenere una visualizzazione delle prossimità tra testi in base ai loro profili lessicali e tra forme grafiche in base alla loro distribuzione nei testi l'arricchimento delle unità statistiche con segmenti ripetuti cf a salem pratica dei segmenti ripetuti klincksieck 1987 i loro raggruppamenti per categorizzazione morfologica l'utilizzo delle forme caratteristiche o specificità l'aggiunta delle risposte modali o delle frasi o unità di contesto caratteristiche hanno perfezionato questi approcci e messo a disposizione di molti utenti metodi e software utili in alcuni specifici ambiti applicativi come il trattamento automatico delle risposte alle domande aperte che ci interessa qui l'efficacia del metodo come complemento alle approcci tradizionali è riconosciuta parallelamente ai lavori dell'industria della lingua che abbiamo menzionato in precedenza e che fanno parte di un'ingegneria statistica complessa esistono quindi applicazioni testuali della statistica a portata di mano richiedono sicuramente software specifici ma la natura familiare e viva del materiale di base compensa in qualche modo la relativa complessità dei trattamenti e le difficoltà di interpretazione vicino alle basi di dati all'intelligenza artificiale e alle reti neurali alla teoria dell'apprendimento alle tecniche recenti di estrazione e gestione della conoscenza il dominio testuale illustra bene la polivalenza e la potenza della metodologia statistica anche quando i metodi assumono nomi più esotici come text mining o text mining il lavoro dello statistico è sempre necessario quando si tratta 
di conoscere la portata reale dei fatti osservati e dei tratti strutturali ottenuti di sapere cosa si può affermare e cosa non si deve dire ovvero di dare uno statuto scientifico ai risultati
--- a/utils/init.py
+++ b/utils/init.py
@@ -0,0 +1,25 @@
+"""
+Utility functions shared across the project.
+"""
+
+
+def normalize_probabilities(prob_fr: float, prob_en: float, prob_it: float, searched: float) -> float:
+    """Return ``searched`` divided by the sum of the three probabilities.
+
+    Args:
+        prob_fr: First score of the set (presumably French; confirm with callers).
+        prob_en: Second score of the set (presumably English; confirm with callers).
+        prob_it: Third score of the set (presumably Italian; confirm with callers).
+        searched: The score to express as a share of the total.
+
+    Returns:
+        ``searched / (prob_fr + prob_en + prob_it)``, or ``0.0`` when the
+        total is zero.
+    """
+    # Named ``total`` so the builtin ``sum`` is not shadowed.
+    total = prob_fr + prob_en + prob_it
+    if total == 0:
+        # Nothing to normalise against: report a zero share instead of
+        # raising ZeroDivisionError.
+        return 0.0
+    return searched / total
--- a/utils/pycache/init.cpython-314.pyc
+++ b/utils/pycache/init.cpython-314.pyc