Feat: Adds the HMM and detection for one word

This commit is contained in:
Namu
2026-05-18 11:49:59 +02:00
commit 302b5f5d46
23 changed files with 1285 additions and 0 deletions

10
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

View File

@@ -0,0 +1,20 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N803" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<list>
<option value="pandas" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml generated Normal file
View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.14 (tp_mapel_1)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.14 (tp_mapel_1)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/tp_mapel_1.iml" filepath="$PROJECT_DIR$/.idea/tp_mapel_1.iml" />
</modules>
</component>
</project>

10
.idea/tp_mapel_1.iml generated Normal file
View File

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.14 (tp_mapel_1)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

119
HMM/__init__.py Normal file
View File

@@ -0,0 +1,119 @@
"""
Ce module contient la classe qui représente un HMM
"""
import pandas as pd
import numpy as np
from pandas import DataFrame
class HMM:
    """Hidden Markov model built from one training text.

    The model is described by three arrays:

    * ``initial_probabilities`` (pi): uniform vector of length 26.
    * ``transition_matrix`` (A): 26 x 26 matrix estimated from the
      letter-to-letter transitions counted in the training text.
    * ``emission_matrix`` (B): read from a spreadsheet and indexed as
      ``B[state, observation]`` by :meth:`forward` and :meth:`backward`.
    """

    # S
    # NOTE(review): this lists the candidate languages, whereas pi/A/B below
    # are indexed by letter positions; the attribute is not read by any
    # method of this class.
    states: list[str] = ["French", "English", "Italian"]
    # pi
    initial_probabilities: np.ndarray
    # A
    transition_matrix: np.ndarray
    # B
    emission_matrix: np.ndarray

    def __init__(self, emission_matrix_file_name: str, numeric_text: np.ndarray):
        """Build the three components of the model (can be slow on long texts).

        :param emission_matrix_file_name: path of the spreadsheet holding the
            emission matrix.
        :param numeric_text: the training text as a sequence of words, each
            word being a sequence of alphabet indices (a list of lists of
            ints works as well as a NaN-padded 2-D array).
        """
        self.generate_emission_matrix(emission_matrix_file_name)
        self.generate_initial_probabilities()
        self.generate_transition_matrix(numeric_text)

    def generate_initial_probabilities(self) -> None:
        """Set pi to the uniform distribution over the 26 letters."""
        self.initial_probabilities = np.full(26, 1 / 26)

    def generate_emission_matrix(self, file_name) -> None:
        """Read the emission matrix from a spreadsheet and store it as floats.

        The first column of the sheet is dropped before conversion
        (presumably a label column - TODO confirm against the file).

        :param file_name: path of the spreadsheet.
        """
        self.emission_matrix = pd.read_excel(file_name).iloc[:, 1:].to_numpy(dtype=float)

    def generate_transition_matrix(self, numeric_text: np.ndarray) -> None:
        """Estimate A by counting letter-to-letter transitions inside words.

        Each row of the count table is divided by its total so that it holds
        the probabilities of the next letter given the current one. Rows of
        letters that are never followed by anything stay at zero.

        :param numeric_text: sequence of words, each a sequence of alphabet
            indices; NaN entries (padding) are ignored.
        """
        counts = np.zeros((26, 26), dtype=float)
        for word in numeric_text:
            # Walk over consecutive (current, following) pairs of the word.
            for current, following in zip(word[:-1], word[1:]):
                # Padded inputs carry NaN after the end of a word: skip them.
                if not np.isnan(current) and not np.isnan(following):
                    counts[int(current), int(following)] += 1
        row_sums = counts.sum(axis=1, keepdims=True)
        # Only divide rows with at least one transition; the others keep the
        # zeros of `out`, which avoids a division by zero.
        self.transition_matrix = np.divide(
            counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0
        )

    def forward(self, O: list[int]) -> tuple[float, np.ndarray]:
        """Run the forward algorithm on an observation sequence.

        :param O: the word to score, as a non-empty sequence of observation
            indices (an empty sequence raises ``IndexError``).
        :return: ``(likelihood, alpha)`` where ``alpha`` is the last forward
            vector and ``likelihood`` is its sum.
        """
        n_states = len(self.initial_probabilities)
        emission = self.emission_matrix[:n_states]
        transition = self.transition_matrix[:n_states, :n_states]
        # alpha_1(i) = pi_i * b_i(o_1)
        alpha = self.initial_probabilities * emission[:, O[0]]
        for observation in O[1:]:
            # alpha_{t+1}(j) = b_j(o_{t+1}) * sum_i alpha_t(i) * a_ij
            alpha = emission[:, observation] * (alpha @ transition)
        return float(np.sum(alpha)), alpha

    def backward(self, O: list[int]) -> tuple[float, np.ndarray]:
        """Run the backward algorithm on an observation sequence.

        :param O: the word to score, as a non-empty sequence of observation
            indices (an empty sequence raises ``IndexError``).
        :return: ``(likelihood, beta)`` where ``beta`` is the backward vector
            at the first position and ``likelihood`` is
            ``sum_i pi_i * b_i(o_1) * beta_1(i)``.
        """
        n_states = len(self.initial_probabilities)
        emission = self.emission_matrix[:n_states]
        transition = self.transition_matrix[:n_states, :n_states]
        beta = np.ones(n_states)
        # Go back in time from T-2 down to 0.
        for t in range(len(O) - 2, -1, -1):
            # beta_t(i) = sum_j a_ij * b_j(o_{t+1}) * beta_{t+1}(j)
            beta = transition @ (emission[:, O[t + 1]] * beta)
        likelihood = np.sum(self.initial_probabilities * emission[:, O[0]] * beta)
        return likelihood, beta

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,89 @@
"""
Ce module contient les fonctions pour préparer les données.
Cela consiste à:
- Lire un fichier
- Nettoyer les données
- Tout transformer en dataframe d'index de l'alphabet.
"""
import re
def read_file(file_name: str) -> str:
    """
    Read a text file and return its content unchanged.

    :param file_name: path of the file to read
    :return: the raw text of the file
    """
    # NOTE(review): no explicit encoding is given, so the platform default applies.
    with open(file_name) as handle:
        raw_text = handle.read()
    return raw_text
def parse_data(raw_data: str) -> str:
    """
    Lower-case the text and replace every character that is not one of the
    letters a-z (spaces, digits, punctuation, accented letters, newlines)
    with a single space.

    :param raw_data: the text to clean
    :return: the cleaned text, same length as the lower-cased input
    """
    return re.sub(r'[^a-z]', ' ', raw_data.lower())
def prepare_file(file_name) -> str:
    """
    Read a file and return its cleaned content.

    :param file_name: path of the file to read
    :return: the result of parse_data applied to the file's raw text
    """
    return parse_data(read_file(file_name))
def get_alphabet_index_of(letter: str) -> int:
    """
    Return the position of a letter in the alphabet, seen as a 0-based array
    (e.g. a -> 0, z -> 25).

    :param letter: the lowercase letter to look up
    :return: the index of the letter, or -1 when it is not found
    """
    # The alphabet is also the set of states.
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    position = alphabet.find(letter)
    return position
def get_alphabet_index_form_word(word: str) -> list[int]:
    """
    Convert a word into the list of alphabet indices of its letters.

    :param word: the word to convert
    :return: one index per character, in the order of the word
    """
    return list(map(get_alphabet_index_of, word))
def get_text_in_alphabet_index_form(text: str) -> list[list[int]]:
    """
    Convert a text into a list of words, each word being the list of the
    alphabet indices of its characters.

    :param text: the text to convert, with words separated by spaces
    :return: one list of indices per non-empty word
    """
    # Consecutive spaces produce empty chunks when splitting: drop them.
    return [
        get_alphabet_index_form_word(chunk)
        for chunk in text.split(' ')
        if chunk
    ]
def prepare_data(file_name: str) -> list[list[int]]:
    """
    Read a file, clean its content and convert it into alphabet indices.

    :param file_name: path of the file to read
    :return: the words of the file, each as a list of alphabet indices
    """
    cleaned_text = prepare_file(file_name)
    numeric_text = get_text_in_alphabet_index_form(cleaned_text)
    return numeric_text

Binary file not shown.

1
english.txt Normal file

File diff suppressed because one or more lines are too long

1
french.txt Normal file

File diff suppressed because one or more lines are too long

1
italian.txt Normal file

File diff suppressed because one or more lines are too long

43
main.py Normal file
View File

@@ -0,0 +1,43 @@
"""
Note: I code in english but comment in French !
"""
import data_preparation
from HMM import HMM
import utils
if __name__ == '__main__':
    # Convert each training corpus into lists of alphabet indices.
    numeric_french_text = data_preparation.prepare_data('french.txt')
    numeric_english_text = data_preparation.prepare_data('english.txt')
    numeric_italian_text = data_preparation.prepare_data('italian.txt')

    # One model per language: same emission matrix file, transitions
    # estimated from that language's corpus.
    lambda_fr = HMM('matrice_emission.xls', numeric_french_text)
    lambda_en = HMM('matrice_emission.xls', numeric_english_text)
    lambda_it = HMM('matrice_emission.xls', numeric_italian_text)

    numeric_french_word = data_preparation.get_text_in_alphabet_index_form('probablement')
    # The input holds a single word ("probablement"): take the first entry.
    word = numeric_french_word[0]

    # Score the word with the forward algorithm of each model.
    res_fr, _ = lambda_fr.forward(word)
    res_en, _ = lambda_en.forward(word)
    res_it, _ = lambda_it.forward(word)
    # NOTE(review): utils.normalize_probabilities is defined outside this
    # file; presumably it returns the last argument's share of the three
    # scores - confirm against utils.
    proba_fr = utils.normalize_probabilities(res_fr, res_en, res_it, res_fr)
    proba_en = utils.normalize_probabilities(res_fr, res_en, res_it, res_en)
    proba_it = utils.normalize_probabilities(res_fr, res_en, res_it, res_it)
    print('Résultats forward ---------------------------------------------------')
    # The conclusion is the language with the highest score, not the score itself.
    forward_scores = {'FR': proba_fr, 'EN': proba_en, 'IT': proba_it}
    forward_conclusion = max(forward_scores, key=forward_scores.get)
    print(f'FR={proba_fr}, EN={proba_en}, IT={proba_it}, Conclusion={forward_conclusion}')

    # Same detection with the backward algorithm.
    res_back_fr, _ = lambda_fr.backward(word)
    res_back_en, _ = lambda_en.backward(word)
    res_back_it, _ = lambda_it.backward(word)
    proba_back_fr = utils.normalize_probabilities(res_back_fr, res_back_en, res_back_it, res_back_fr)
    proba_back_en = utils.normalize_probabilities(res_back_fr, res_back_en, res_back_it, res_back_en)
    proba_back_it = utils.normalize_probabilities(res_back_fr, res_back_en, res_back_it, res_back_it)
    print('Résultat backward ---------------------------------------------------')
    backward_scores = {'FR': proba_back_fr, 'EN': proba_back_en, 'IT': proba_back_it}
    backward_conclusion = max(backward_scores, key=backward_scores.get)
    print(f'FR={proba_back_fr}, EN={proba_back_en}, IT={proba_back_it}, Conclusion={backward_conclusion}')

BIN
matrice_emission.xls Normal file

Binary file not shown.

168
texte_1.txt Normal file
View File

@@ -0,0 +1,168 @@
les traitements statistiques de donnees textuelles. (l. lebart, cnrs-enst ; lebart@enst.fr)
le materiau statistique « texte » est omnipresent, presque banal, depuis le developpement
d<EFBFBD>internet et de la toile (web). l<>etude quantitative et statistique de ces textes semble avoir fait
irruption recemment, et pourtant les etudes statistiques de textes datent de plusieurs
decennies, avec notamment en france les travaux de p. guiraud (problemes et methodes de la
statistique linguistique, puf, 1960), c. muller (principes et methodes de statistique lexicale,
hachette, 1977) puis de j.p. benzecri (pratique de l'analyse des donnees, tome 3 :
linguistique et lexicologie, dunod, 1981).
apres la « stylometrie », consacree à l'etude de la forme des textes, en vue d'identifier un
auteur ou de dater une oeuvre, sont apparues les techniques de documentation automatique
(information retrieval en anglais), visant à rechercher dans une base de documents (articles
scientifiques, resumes, brevets, ...) le ou les elements pertinents à partir d'une requête
exprimee sous forme de textes libres. le champ disciplinaire <20> traitement du langage
naturel <20> est alors apparu, et s<>est developpe, au depart, comme un des domaines
d<EFBFBD>application privilegie de l<>intelligence artificielle. la complexite du materiau, le besoin
d<EFBFBD>assimiler d<>immenses corpus de textes, la pertinence du concept d<>apprentissage ont
naturellement ouvert ce champ aux methodes statistiques. la statistique multidimensionnelle,
les chaînes de markov cachees, les methodes d'analyse discriminantes interviennent ainsi
pour construire les outils de base que sont les moteurs de recherche sur le web, les analyseurs
morphosyntactiques, les correcteurs orthographiques, ainsi que dans des champs d<>application
pratiques comme le traitement des reponses aux questions ouvertes dans les enqu<71>tes socioeconomiques.
les questions ouvertes
il est utile, dans un certain nombre de situations d'enqu<71>te, de laisser ouvertes certaines
questions, dont les reponses se presenteront donc sous forme de textes de longueurs variables.
le recueil des donnees
dans au moins trois situations courantes, l'utilisation d'un questionnement ouvert s'impose :
pour diminuer ou optimiser la duree de l'entrevue d'enquête
bien que les reponses libres et les reponses guidees fournissent des informations de natures
differentes, les premieres sont plus economiques que les secondes en temps d'interview et
generent moins de fatigue. une simple question ouverte (par exemple : "quelles furent vos
principales activites dimanche dernier ?") peut remplacer de longues listes d'items.
comme complement à des questions fermees
il s'agit le plus souvent de la question: "pourquoi ?". les explications concernant une reponse
dej<EFBFBD> donnee doivent necessairement <20>tre spontanee. une batterie d'items risquerait de
proposer de nouveaux arguments qui pourraient nuire <20> l'authenticite de l'explication. l'utilite
de la question pourquoi ? a ete soulignee par de nombreux auteurs, et ce sont en fait les
difficultes et le co<63>t de l'exploitation qui en limitent l'usage. elle seule permet en effet de
savoir si les differentes categories de personnes interrogees ont compris la question fermee de
la même façon.
pour recueillir une information qui doit, par nature, <20>tre spontanee
les questionnaires des enqu<71>tes de marketing abondent en questions de ce type. citons par
exemple : "qu'avez-vous retenu de cette campagne publicitaire ?", "que pensez-vous de cette
voiture ?". notons cependant que les questions ouvertes sont considerees comme peu
adaptees aux problemes de memorisation de comportement. "quels magazines avez-vous lus
la semaine derniere ?", "quelles sont les dernieres emissions de television que vous avez
aimees ?". pour ces questions qui font l'objet d'enqu<71>tes periodiques, il a ete prouve maintes
fois que les questions fermees donnent des taux d'oubli plus faibles. en revanche, quand la
qualite de la memorisation est en jeu, la forme ouverte reste indispensable.
voici quatre exemples de reponses à la question « quelle est pour vous la chose la plus
importante dans la vie ? » (question posee à des echantillons d'environ mille personnes dans
sept pays en 1991).
1) la sante, ne pas manquer d'argent, avoir une bonne ambiance familiale, je voudrais
pouvoir aider les enfants abandonnes, leur redonner le go<67>t <20> la vie, pouvoir aider les
personnes <20>gees handicapees, secourir les gens autour de soi.
2) c'est de faire ce qu'on veut. lire, voyager si je pouvais. les loisirs si on pouvait.
3) la sante puisqu'il faut toujours travailler quand on est commer<65>ant. une bonne entente en
famille. avoir assez d'argent pour vivre.
4) la famille, ma famille, mon foyer, vivre avec la societe : mon entourage les voisins, pour
faire quelque chose qu'il y ait moins de malheureux, donner du travail aux jeunes surtout.
ces exemples illustrent <20> la fois la complexite et la richesse des reponses.
les unites statistiques
les programmes travaillent <20> partir du texte brut, en extrayant automatiquement des unites
statistiques, la plupart du temps des formes graphiques (sequences de caracteres nonseparateurs).
on utilise le vocable forme graphique parce que le mot <20> mot <20> lui-m<>me est
ambigu. il designe en effet selon les contextes l<>occurrence d<>un mot (quand on dit qu<71>un
texte a huit cent mots, on parle bien s<>r d<>occurrences, et non de mots differents), le type (qui
correspond <20> la forme graphique) et le lemme (avoir est le lemme de avait, et, dans certains
cas seulement, de avions). la premiere reponse de l<>exemple ci-dessus contient 38
occurrences, mais la forme graphique <20> les <20> appara<72>t trois fois, <20> pouvoir <20> appara<72>t deux
fois. le lemme de <20> bonne <20> est bon (le masculin singulier, selon une convention fran<61>aise),
celui de <20> voudrais <20> est <20> vouloir <20>.
dans le cas de l<>exemple precedent, pour 1009 reponses, on obtient 14337 occurrences de
1394 formes distinctes (ou types). il est bien connu que la distribution de frequence des mots
est tres dissymetrique (loi dite de zipf, apparentee <20> la distribution de pareto). ainsi, en ne
retenant que les formes apparaissant au moins 20 fois, il reste un texte de 10 994 formes, avec
seulement 97 formes distinctes (ainsi 7 % des mots distincts correspondent <20> 77 % du texte
global). en particulier, pres de la moitie des formes grahiques distinctes n<>apparaissent qu<71>une
fois ( ce sont les <20> hapax <20>).
le post-codage
le pretraitement empirique appele "post-codage" permet de fermer a posteriori les questions
ouvertes. cette technique courante consiste <20> construire une batterie d'items <20> partir d'un sousechantillon
de reponses, puis <20> codifier l'ensemble des reponses de fa<66>on <20> remplacer la
question ouverte par une ou plusieurs questions fermees. pour l<>exemple ci-dessus, la seconde
reponse, la plus simple, donnerait les items <20> lecture <20>, voyage <20>, <20> loisirs <20>, sous reserve que
ces items apparaissent avec une certaine frequence dans l<>echantillon de reponses. en
revanche la premiere reponse est plus delicate <20> post-coder.
les outils statistique de base
les outils de base sont la selection de formes caracteristiques, la selection de reponses
modales, l'analyse des correspondances et la classification des tableaux lexicaux.
formes ou segments caracteristiques (ou specificites)
les formes caracteristiques sont les formes "anormalement" frequentes dans les reponses d'un
groupe d'individus (technique propose par p. lafon en 1980). un test elementaire fonde sur la
loi hypergeometrique permet de selectionner les mots (formes graphiques ou lemmes) dont la
frequence dans un groupe est notablement superieure (ou inferieure pour les mots anticaracteristiques)
<EFBFBD> la frequence moyenne dans le corpus. il s<>agit de test classique de
comparaisons de frequences, maisla repetition de ce test conduit <20> prendre des seuils de
signification tres severes (phenomene de comparaisons multiples bien connu des statisticiens).
dans l<>exemple evoque plus haut, la frequence moyenne du mot travail dans le corpus etait de
3.4 %; pour le groupe des femmes de plus de 55 ans, la frequence n<>est que de 1.2 %. cette
difference est en fait hautement significative ( on peut exprimer le test de comparaison de
frequences en termes d<>ecart-types : dans l<>hypothese d<>homogeneite des frequences, la
valeur1.2% est <20> 4.5 ecart-types de la valeur moyenne 3.4). comme il s<>agit d<>une frequence
anormalement faible, on parlera de mots anti-caracteristiques. [l<>individu statistique est ici
l<EFBFBD>occurrence de mots. les femmes de plus de 55 ans ont emis 1349 mots dans leurs reponses.
la variance de la frequence d'un mot dont la frequence « theorique » est de 0.034 est donnee
par la formule classique $0.034\,(1 - 0.034)/1349$. on voit dans ces conditions que la frequence
observee de 0.012 est à 4.5 ecart-types de 0.034].
les selections des reponses modales
pour un groupe d'individus donne, et donc pour le regroupement de reponses correspondant,
les reponses modales (ou encore phrases caracteristiques, ou documents-type, la terminologie
variant selon les domaines d'application) sont des reponses originales du corpus de base, ayant
la propriete de caracteriser au mieux le groupe. on peut, pour chaque regroupement, calculer
la distance du profil lexical d'un individu au profil lexical moyen du groupement. on peut
ensuite classer les distances par ordre croissant, et donc selectionner les reponses les plus
representatives au sens du profil lexical, qui correspondront aux plus petites distances. on
obtient ainsi une sorte de resume des reponses de chaque regroupement, forme de reponses
originales (l. lebart et a. salem, statistique textuelle, dunod, 1994). toujours dans le cas de
notre exemple, <20>etre heureux, avoir un bon travail, reussite professionnelle et familiale<6C> est
ainsi une reponse caracteristique des jeunes hommes; <20>la sante, la famille<6C> est une reponse
caracterisant les plus <20>ges. on utilise en pratique plusierus reponses caracteristiques par
groupe.
analyse des correspondances et classification
le volume des donnees demande que l<>on fasse appel <20> de puissants outils de description. les
methodes d<>analyses des correspondances et de classification peuvent decrire les tables de contingence
croisant les reponses et les formes graphiques, ou des groupes de reponses (par exemple regroupement
selon le niveau d'instruction des repondants) et les formes graphiques. elles permettent de visualiser
sous forme de series de cartes planes (ou de dendrogrammes dans le cas des methodes de
classification, ou de cartes auto-associatives de kohonen, methode <20>neuronale<6C> de visualisation) les
associations entre mots (formes) et groupes ou modalites. ainsi, une visualisation des proximites entre
mots et categories socioprofessionnelles pourra aider la lecture des reponses de chacune de ces
categories.
conclusions et ouvertures
pour des reponses simples et stereotypees, nous l<>avons vu, les procedures de post-codage
peuvent fonctionner. mentionnons cependant parmi les defauts de ce type de traitement :
la mediation du chiffreur: les decisions <20> prendre sont parfois difficiles.
la qualite de l'expression, le registre du vocabulaire, la tonalite generale de l'entretien sont
des elements d'analyse perdus lors d'un post-codage (doit-on coder differemment <20> je ne sais
pas<EFBFBD> et <20>je prefere ne rien dire<72> ?.
les reponses composites, complexes, d'une grande diversite, sont tres difficile <20> post-coder,
et c'est souvent dans ce cas que la valeur heuristique des reponses libres est la plus grande.
les reponses peu frequentes, originales, peu claires en premiere lecture sont considerees
comme du <20>bruit<69>, et affectees <20> des items residuels (<28>autres<65>) qui sont donc tres
heterogenes et sont difficiles <20> manipuler.
sans qu<71>il soit necessaire de proceder <20> un post codage, on peut, actuellement, <20> partir d'une
ensemble de textes, et d'un seuil de frequence pour les formes graphiques, obtenir une
visualisation des proximites entre textes (vis-<2D>-vis de leurs profils lexicaux) et entre formes
graphiques (vis-<2D>-vis de leur repartition dans les textes). l'enrichissement des unites
statistiques par les segments repetes,(cf. a. salem, pratique des segments repetes,
klincksieck, 1987), leurs regroupements par categorisation morphologique, l'utilisation des
formes caracteristiques ou specificites, l'adjonction des reponses modales ou des phrases ou
unites de contexte caracteristiques ont perfectionne ces approches, et mis <20> la disposition de
beaucoup d'utilisateurs des methodes et des logiciels utiles. dans certains domaines
d'application precis (comme le traitement automatique des reponses aux questions ouvertes,
qui nous interesse ici), l'efficacite de la methode, comme complement des approches
traditionnelles, est reconnue.
parallelement aux travaux relevant de l<>industrie de la langue, que nous avons evoques plus
haut, et qui relevent d<>une ingenierie statistique complexe, il existe donc des applications
textuelles de la statistique qui restent <20> portee de main. elles necessitent certes des logiciels
specifiques, mais la nature familiere et vivante du materiau de base compense en quelque
sorte la relative complexite des traitements et les difficultes d<>interpretation.
proche des bases de donnees, de l<>intelligence artificielle et des reseaux de neurones, de la
theorie de l<>apprentissage, des techniques recentes d<>extraction et de gestion des
connaissances, le domaine textuel illustre bien la polyvalence et la puissance de la
methodologie statistique. m<>me quand les methodes prennent parfois les noms plus exotiques
de fouille de texte ou de text mining, le statisticien est toujours sollicite quand il s<>agit de
conna<EFBFBD>tre la portee reelle des faits observes et des traits structuraux obtenus, de savoir ce que
l<EFBFBD>on a le droit de dire ou le devoir de ne pas dire, c<>est-<2D>-dire finalement de donner un statut
scientifique aux resultats.

758
texte_2.txt Normal file
View File

@@ -0,0 +1,758 @@
text classification using machine learning techniques
m. ikonomakis
department of mathematics
university of patras, greece
ikonomakis@mailbox.gr
s. kotsiantis
department of mathematics
university of patras, greece
sotos@math.upatras.gr
v. tampakas
technological educational
institute of patras, greece
tampakas@teipat.gr
abstract: automated text classification has been considered as a vital method to manage and process a vast
amount of documents in digital forms that are widespread and continuously increasing. in general, text
classification plays an important role in information extraction and summarization, text retrieval, and questionanswering.
this paper illustrates the text classification process using machine learning techniques. the
references cited cover the major theoretical issues and guide the researcher to interesting research directions.
key-words: text mining, learning algorithms, feature selection, text representation
1 introduction
automatic text classification has always been an
important application and research topic since the
inception of digital documents. today, text
classification is a necessity due to the very large
amount of text documents that we have to deal with
daily.
in general, text classification includes topic based
text classification and text genre-based
classification. topic-based text categorization
classifies documents according to their topics [33].
texts can also be written in many genres, for
instance: scientific articles, news reports, movie
reviews, and advertisements. genre is defined on
the way a text was created, the way it was edited,
the register of language it uses, and the kind of
audience to whom it is addressed. previous work on
genre classification recognized that this task differs
from topic-based categorization [13].
typically, most data for genre classification are
collected from the web, through newsgroups,
bulletin boards, and broadcast or printed news.
they are multi-source, and consequently have
different formats, different preferred vocabularies
and often significantly different writing styles even
for documents within one genre. namely, the data
are heterogenous.
intuitively text classification is the task of
classifying a document under a predefined
category. more formally, if $d_i$ is a document of the
entire set of documents $D$ and $\{c_1, c_2, \ldots, c_n\}$ is the
set of all the categories, then text classification
assigns one category $c_j$ to a document $d_i$.
as in every supervised machine learning task, an
initial dataset is needed. a document may be
assigned to more than one category (ranking
classification), but in this paper only researches on
hard categorization (assigning a single category to
each document) are taken into consideration.
moreover, approaches, that take into consideration
other information besides the pure text, such as
hierarchical structure of the texts or date of
publication, are not presented. this is because the
main issue of this paper is to present techniques
that exploit the most of the text of each document
and perform best under this condition.
sebastiani gave an excellent review of text
classification domain [25]. thus, in this work apart
from the brief description of the text classification
we refer to some more recent works than those in
sebastiani's article as well as few articles that were
not referred by sebastiani. in figure 1 is given the
graphical representation of the text classification
process.
.
fig. 1. text classification process
the task of constructing a classifier for
documents does not differ a lot from other tasks of
machine learning. the main issue is the
representation of a document [16]. in section 2 the
document representation is presented. one
particularity of the text categorization problem is
read
document
tokenize
text
stemming
delete
stopwords
vector representation of
text
feature selection and/or
feature transformation
learning
algorithm
that the number of features (unique words or
phrases) can easily reach orders of tens of
thousands. this raises big hurdles in applying many
sophisticated learning algorithms to the text
categorization
thus dimension reduction methods are called for.
two possibilities exist, either selecting a subset of
the original features [3], or transforming the
features into new ones, that is, computing new
features as some functions of the old ones [10]. we
examine both in turn in section 3 and section 4.
after the previous steps a machine learning
algorithm can be applied. some algorithms have
been proven to perform better in text classification
tasks and are more often used; such as support
vector machines. a brief description of recent
modification of learning algorithms in order to be
applied in text classification is given in section 5.
there are a number of methods to evaluate the
performance of a machine learning algorithms in
text classification. most of these methods are
described in section 6. some open problems are
mentioned in the last section.
2 vector space document
representations
a document is a sequence of words [16]. so each
document is usually represented by an array of
words. the set of all the words of a training set is
called vocabulary, or feature set. so a document
can be presented by a binary vector, assigning the
value 1 if the document contains the feature-word
or 0 if the word does not appear in the document.
this can be translated as positioning a document in
a $\mathbb{R}^{|V|}$ space, where $|V|$ denotes the size of the
vocabulary $V$.
not all of the words presented in a document can
be used in order to train the classifier [19]. there
are useless words such as auxiliary verbs,
conjunctions and articles. these words are called
stopwords. there exist many lists of such words
which are removed as a preprocess task. this is
done because these words appear in most of the
documents.
stemming is another common preprocessing step.
in order to reduce the size of the initial feature set
is to remove misspelled or words with the same
stem. a stemmer (an algorithm which performs
stemming), removes words with the same stem and
keeps the stem or the most common of them as
feature. for example, the words "train", "training",
"trainer" and "trains" can be replaced with "train".
although stemming is considered by the text
classification community to amplify the classifiers
performance, there are some doubts on the actual
importance of aggressive stemming, such as
performed by the porter stemmer [25].
an ancillary feature engineering choice is the
representation of the feature value [16]. often a
boolean indicator of whether the word occurred in
the document is sufficient. other possibilities
include the count of the number of times the word
occurred in the document, the frequency of its
occurrence normalized by the length of the
document, the count normalized by the inverse
document frequency of the word. in situations
where the document length varies widely, it may be
important to normalize the counts. further, in short
documents words are unlikely to repeat, making
boolean word indicators nearly as informative as
counts. this yields a great savings in training
resources and in the search space of the induction
algorithm. it may otherwise try to discretize each
feature optimally, searching over the number of
bins and each bin's threshold.
most of the text categorization algorithms in the
literature represent documents as collections of
words. an alternative which has not been
sufficiently explored is the use of word meanings,
also known as senses. kehagias et al. using several
algorithms, they compared the categorization
accuracy of classifiers based on words to that of
classifiers based on senses [12]. the document
collection on which this comparison took place is a
subset of the annotated brown corpus semantic
concordance. a series of experiments indicated that
the use of senses does not result in any significant
categorization improvement.
3 feature selection
the aim of feature-selection methods is the
reduction of the dimensionality of the dataset by
removing features that are considered irrelevant for
the classification [6]. this transformation
procedure has been shown to present a number of
advantages, including smaller dataset size, smaller
computational requirements for the text
categorization algorithms (especially those that do
not scale well with the feature set size) and
considerable shrinking of the search space. the
goal is the reduction of the curse of dimensionality
to yield improved classification accuracy. another
benefit of feature selection is its tendency to reduce
overfitting, i.e. the phenomenon by which a
classifier is tuned also to the contingent
characteristics of the training data rather than the
constitutive characteristics of the categories, and
therefore, to increase generalization.
methods for feature subset selection for text
document classification task use an evaluation
function that is applied to a single word [27].
scoring of individual words (best individual
features) can be performed using some of the
measures, for instance, document frequency, term
frequency, mutual information, information gain,
odds ratio, χ2 statistic and term strength [3], [30],
[6], [28], [27]. what is common to all of these
feature-scoring methods is that they conclude by
ranking the features by their independently
determined scores, and then select the top scoring
features. the most common metrics are presented
in table 1. the symbolisms that are presented in
table 1 are described in table 2.
in contrast with best individual features
(bif) methods, sequential forward selection (sfs)
methods firstly select the best single word
evaluated by given criterion [20]; then, add one
word at a time until the number of selected words
reaches desired k words. sfs methods do not result
in the optimal words subset but they take note of
dependencies between words as opposed to the bif
methods. therefore sfs often give better results
than bif. however, sfs are not usually used in
text classification because of their computation cost
due to large vocabulary size.
forman has presented a benchmark comparison of 12
metrics on well known training sets [6]. according
to forman, bns performed best by wide margin
using 500 to 1000 features, while information gain
outperforms the other metrics when the number of features
varies between 20 and 50. accuracy 2 performed
equally well as information gain. concerning the
performance of chi-square, it was consistently
worse than information gain. since there is no
metric that performs constantly better than all
others, researchers often combine two metrics in
order to benefit from both metrics [6].
novovicova et al. used sfs that took into
account, not only the mutual information between a
class and a word but also between a class and two
words [22]. the results were slightly better.
although machine learning based text
classification is a good method as far as
performance is concerned, it is inefficient for it to
handle the very large training corpus. thus, apart
from feature selection, many times instance
selection is needed.
c a class of the training set
c the set of classes of the training set
d a document of the training set
d or db the set of documents of the training set
t or w a term or word
p(c) or $p(c_i)$ the probability of the class c or $c_i$ respectively, i.e. how often the class appears in the
training set
$p(\neg c)$ or $p(\bar{c})$ the probability of the class not occurring
p(c|t) the probability of the class c given that the term t appears; respectively, $p(\bar{c}|t)$
denotes the probability of class c not occurring, given that the term t appears
p(c,t) the probability of the class c and term t occurring simultaneously
h(c) the entropy of the set c
$df(t_k)$ the document frequency of term $t_k$
$df_n(t)$ the frequency of term t in documents containing t in every one of their n splits
$\widetilde{df}(t)$ the document frequency, taking into consideration only documents in which t appears
more than once
#(c) or #(t ) the number of documents which belong to class or respectively contain the term t
#(c,t) the number of documents containing term t and belong to class c
table 2. symbolisms
guan and zhou proposed a training-corpus
pruning based approach to speedup the process [8].
by using this approach, the size of training corpus
can be reduced significantly while classification
performance can be kept at a level close to that of
without training documents pruning according to
their experiments.
fragoudis et al. [7] integrated feature and
instance selection for text classification with even
better results. their method works in two steps. in
the first step, their method sequentially selects
features that have high precision in predicting the
target class. all documents that do not contain at
least one such feature are dropped from the training
set. in the second step, their method searches
within this subset of the initial dataset for a set of
features that tend to predict the complement of the
target class and these features are also selected. the
sum of the features selected during these two steps
is the new feature set and the documents selected
from the first step comprise the training set.
4 feature transformation
feature transformation varies significantly from
feature selection approaches, but like them its
purpose is to reduce the feature set size [10]. this
approach does not weight terms in order to discard
the lower weighted but compacts the vocabulary
based on feature concurrencies.
principal component analysis is a well known
method for feature transformation [38]. its aim is to
learn a discriminative transformation matrix in
order to reduce the initial feature space into a lower
dimensional feature space in order to reduce the
complexity of the classification task without any
trade-off in accuracy. the transform is derived
from the eigenvectors of the data covariance matrix. the
covariance matrix of data in pca corresponds to
the document term matrix multiplied by its
transpose. entries in the covariance matrix
represent co-occurring terms in the documents.
eigenvectors of this matrix corresponding to the
dominant eigenvalues are now directions related to
dominant combinations, which can be called "topics" or
"semantic concepts". a transform matrix
constructed from these eigenvectors projects a
document onto these "latent semantic concepts",
and the new low dimensional representation
consists of the magnitudes of these projections. the
eigenanalysis can be computed efficiently by a
sparse variant of singular value decomposition of
the document-term matrix [11].
in the information retrieval community this
method has been named latent semantic indexing
(lsi) [23]. this approach is not intuitively
discernible for a human but has good
performance.
qiang et al [37] performed experiments using k-
nn lsi, a new combination of the standard k-nn
method on top of lsi, and applying a new matrix
decomposition algorithm, semi-discrete matrix
decomposition, to decompose the vector matrix.
the experimental results showed that text
categorization effectiveness in this space was better
and it was also computationally less costly, because
it needed a lower dimensional space.
the authors of [4] present a comparison of the
performance of a number of text categorization
methods in two different data sets. in particular,
they evaluate the vector and lsi methods, a
classifier based on support vector machines
(svm) and the k-nearest neighbor variations of
the vector and lsi models. their results show that
overall, svms and k-nn lsi perform better than
the other methods, in a statistically significant way.
5 machine learning algorithms
after feature selection and transformation the
documents can be easily represented in a form that
can be used by a ml algorithm. many text
classifiers have been proposed in the literature
using machine learning techniques, probabilistic
models, etc. they often differ in the approach
adopted: decision trees, naive-bayes, rule
induction, neural networks, nearest neighbors, and
lately, support vector machines. although many
approaches have been proposed, automated text
classification is still a major area of research
primarily because the effectiveness of current
automated text classifiers is not faultless and still
needs improvement.
naive bayes is often used in text classification
applications and experiments because of its
simplicity and effectiveness [14]. however, its
performance is often degraded because it does not
model text well. schneider addressed the problems
and showed that they can be solved by some simple
corrections [24]. klopotek and woch presented
results of empirical evaluation of a bayesian
multinet classifier based on a new method of
learning very large tree-like bayesian networks
[15]. the study suggests that tree-like bayesian
networks are able to handle a text classification
task in one hundred thousand variables with
sufficient speed and accuracy.
support vector machines (svm), applied to
text classification provide excellent precision, but
poor recall. one means of customizing svms to
improve recall, is to adjust the threshold associated
with an svm. shanahan and roma described an
automatic process for adjusting the thresholds of
generic svm [26] with better results.
johnson et al. described a fast decision tree
construction algorithm that takes advantage of the
sparsity of text data, and a rule simplification
method that converts the decision tree into a
logically equivalent rule set [9].
lim proposed a method which improves
performance of knn based text classification by
using well estimated parameters [18]. some
variants of the knn method with different decision
functions, k values, and feature sets were proposed
and evaluated to find out adequate parameters.
corner classification (cc) network is a kind of
feed forward neural network for instant document
classification. a training algorithm, named as
textcc is presented in [34].
the level of difficulty of text classification tasks
naturally varies. as the number of distinct classes
increases, so does the difficulty, and therefore the
size of the training set needed. in any multi-class
text classification task, inevitably some classes will
be more difficult than others to classify. reasons
for this may be: (1) very few positive training
examples for the class, and/or (2) lack of good
predictive features for that class.
when training a binary classifier per category in
text categorization, we use all the documents in the
training corpus that belong to that category as
relevant training data and all the documents in the
training corpus that belong to all the other
categories as non-relevant training data. it is often
the case that there is an overwhelming number of
non-relevant training documents, especially when
there is a large collection of categories with each
assigned to a small number of documents, which is
typically an "imbalanced data problem". this
problem presents a particular challenge to
classification algorithms, which can achieve high
accuracy by simply classifying every example as
negative. to overcome this problem, cost sensitive
learning is needed [5].
a scalability analysis of a number of classifiers
in text categorization is given in [32]. vinciarelli
presents categorization experiments performed over
noisy texts [31]. by noisy it is meant any text
obtained through an extraction process (affected by
errors) from media other than digital texts (e.g.
transcriptions of speech recordings extracted with a
recognition system). the performance of the
categorization system over the clean and noisy
(word error rate between ~10 and ~50 percent)
versions of the same documents is compared. the
noisy texts are obtained through handwriting
recognition and simulation of optical character
recognition. the results show that the performance
loss is acceptable.
other authors [36] also proposed to parallelize
and distribute the process of text classification.
with such a procedure, the performance of
classifiers can be improved in both accuracy and
time complexity.
recently in the area of machine learning the
concept of combining classifiers is proposed as a
new direction for the improvement of the
performance of individual classifiers. numerous
methods have been suggested for the creation of
ensemble of classifiers. mechanisms that are used
to build ensemble of classifiers include: i) using
different subset of training data with a single
learning method, ii) using different training
parameters with a single training method (e.g. using
different initial weights for each neural network in
an ensemble), iii) using different learning methods.
in the context of combining multiple classifiers
for text categorization, a number of researchers
have shown that combining different classifiers can
improve classification accuracy [1], [29].
when comparing the best individual classifier
and the combined method, it is observed that the
performance of the combined method is superior
[2]. nardiello et al. [21] also proposed algorithms
in the family of "boosting"-based learners for
automated text classification with good results.
6 evaluation
there are various methods to determine
effectiveness; however, precision, recall, and
accuracy are most often used. to determine these,
one must first begin by understanding if the
classification of a document was a true positive
(tp), false positive (fp), true negative (tn), or
false negative (fn) (see table 3).
tp determined as a document being classified
correctly as relating to a category.
fp determined as a document that is said to be
related to the category incorrectly.
fn determined as a document that is not marked
as related to a category but should be.
tn documents that should not be marked as being
in a particular category and are not.
table 3. classification of a document
precision (πi) is determined as the conditional
probability that a random document d is classified
under ci, or what would be deemed the correct
category. it represents the classifier's ability to place
a document as being under the correct category as
opposed to all documents placed in that category,
both correct and incorrect:
$$\pi_i = \frac{tp_i}{tp_i + fp_i}$$
recall (ρi) is defined as the probability that, if a
random document dx should be classified under
category (ci), this decision is taken.
$$\rho_i = \frac{tp_i}{tp_i + fn_i}$$
accuracy is commonly used as a measure for
categorization techniques. accuracy values,
however, are much less sensitive to variations in
the number of correct decisions than precision and
recall:
$$a_i = \frac{tp_i + tn_i}{tp_i + tn_i + fp_i + fn_i}$$
many times there are very few instances of the
interesting category in text categorization. this
overrepresentation of the negative class in
information retrieval problems can cause problems
in evaluating classifiers' performances using
accuracy. since accuracy is not a good metric for
skewed datasets, the classification performance of
algorithms in this case is measured by precision
and recall [5].
furthermore, precision and recall are often
combined in order to get a better picture of the
performance of the classifier. this is done by
combining them in the following formula:
$$f_\beta = \frac{(\beta^2 + 1)\,\pi\rho}{\beta^2\pi + \rho},$$
where π and ρ denote precision and recall
respectively. β is a positive parameter, which
represents the goal of the evaluation task. if
precision is considered to be more important than
recall, then the value of β converges to zero. on the
other hand, if recall is more important than
precision then β converges to infinity. usually β is
set to 1, because in this way equal importance is
given to both precision and recall.
reuters corpus volume i (rcv1) is an archive
of over 800,000 manually categorized newswire
stories recently made available by reuters, ltd. for
research purposes [17]. using this collection, we
can compare the learning algorithms.
although research in the past years had shown
that training corpus could impact classification
performance, little work was done to explore the
underlying causes. the authors of [35] try to
propose an approach to build semi-automatically
high-quality training corpuses for better
classification performance by first exploring the
properties of training corpuses, and then giving an
algorithm for constructing training corpuses semiautomatically.
7 conclusion
the text classification problem is an artificial
intelligence research topic, especially given the
vast number of documents available in the form of
web pages and other electronic texts like emails,
discussion forum postings and other electronic
documents.
it has been observed that even for a specified
classification method, classification performances
of the classifiers based on different training text
corpuses are different; and in some cases such
differences are quite substantial. this observation
implies that a) classifier performance is relevant to
its training corpus in some degree, and b) good or
high quality training corpuses may derive
classifiers of good performance. unfortunately, up
to now little research work in the literature has been
seen on how to exploit training text corpuses to
improve classifier's performance.
some important conclusions have not been
reached yet, including:
• which feature selection methods are both
computationally scalable and high-performing
across classifiers and collections? given the
high variability of text collections, do such
methods even exist?
• would combining uncorrelated, but well-performing
methods yield a performance
increase?
• change the thinking from word frequency
based vector space to concepts based vector
space. study the methodology of feature
selection under concepts, to see if these will
help in text categorization.
• make the dimensionality reduction more
efficient over large corpus.
moreover, there are other two open problems in
text mining: polysemy, synonymy. polysemy refers
to the fact that a word can have multiple meanings.
distinguishing between different meanings of a
word (called word sense disambiguation) is not
easy, often requiring the context in which the word
appears. synonymy means that different words can
have the same or similar meaning.
references:
[1] bao y. and ishii n., <20>combining multiple knn
classifiers for text categorization by
reducts<EFBFBD>, lncs 2534, 2002, pp. 340-347
[2] bi y., bell d., wang h., guo g., greer k.,
<EFBFBD>combining multiple classifiers using
dempster's rule of combination for text
categorization<EFBFBD>, mdai, 2004, 127-138.
[3] brank j., grobelnik m., milic-frayling n.,
mladenic d., <20>interaction of feature selection
methods and linear classification models<6C>,
proc. of the 19th international conference on
machine learning, australia, 2002.
[4] ana cardoso-cachopo, arlindo l. oliveira, an
empirical comparison of text categorization
methods, lecture notes in computer science,
volume 2857, jan 2003, pages 183 - 196
[5] chawla, n. v., bowyer, k. w., hall, l. o.,
kegelmeyer, w. p., <20>smote: synthetic
minority over-sampling technique,<2C> journal
of ai research, 16 2002, pp. 321-357.
[6] forman, g., an experimental study of feature
selection metrics for text categorization.
journal of machine learning research, 3 2003,
pp. 1289-1305
[7] fragoudis d., meretakis d., likothanassis s.,
<EFBFBD>integrating feature and instance selection for
text classification<6F>, sigkdd <20>02, july 23-26,
2002, edmonton, alberta, canada.
[8] guan j., zhou s., <20>pruning training corpus to
speedup text classification<6F>, dexa 2002, pp.
831-840
[9] d. e. johnson, f. j. oles, t. zhang, t. goetz,
<EFBFBD>a decision-tree-based symbolic rule induction
system for text categorization<6F>, ibm systems
journal, september 2002.
[10] han x., zu g., ohyama w., wakabayashi
t., kimura f., accuracy improvement of
automatic text classification based on
feature transformation and multi-classifier
combination, lncs, volume 3309, jan 2004,
pp. 463-468
[11] ke h., shaoping m., <20>text categorization
based on concept indexing and principal
component analysis<69>, proc. tencon 2002
conference on computers, communications,
control and power engineering, 2002, pp. 51-
56.
[12] kehagias a., petridis v., kaburlasos v.,
fragkou p., <20>a comparison of word- and
sense-based text categorization using
several classification algorithms<6D>, jiis,
volume 21, issue 3, 2003, pp. 227-247.
[13] b. kessler, g. nunberg, and h. schutze.
automatic detection of text genre. in
proceedings of the thirty-fifth acl and
eacl, pages 32<33>38, 1997.
[14] kim s. b., rim h. c., yook d. s. and lim
h. s., <20>effective methods for improving naive
bayes text classifiers<72>, lnai 2417, 2002, pp.
414-423
[15] klopotek m. and woch m., <20>very large
bayesian networks in text classification<6F>,
iccs 2003, lncs 2657, 2003, pp. 397-406
[16] leopold, edda & kindermann, j<>rg, <20>text
categorization with support vector machines.
how to represent texts in input space?<3F>,
machine learning 46, 2002, pp. 423 - 444.
[17] lewis d., yang y., rose t., li f., <20>rcv1:
a new benchmark collection for text
categorization research<63>, journal of machine
learning research 5, 2004, pp. 361-397.
[18] heui lim, improving knn based text
classification with well estimated parameters,
lncs, vol. 3316, oct 2004, pages 516 - 523.
[19] madsen r. e., sigurdsson s., hansen l. k.
and lansen j., <20>pruning the vocabulary for
better context recognition<6F>, 7th international
conference on pattern recognition, 2004
[20] montanes e., quevedo j. r. and diaz i.,
<EFBFBD>a wrapper approach with support vector
machines for text categorization<6F>, lncs
2686, 2003, pp. 230-237
[21] nardiello p., sebastiani f., sperduti a.,
<EFBFBD>discretizing continuous attributes in
adaboost for text categorization<6F>, lncs,
volume 2633, jan 2003, pp. 320-334
[22] novovicova j., malik a., and pudil p.,
<EFBFBD>feature selection using improved mutual
information for text classification<6F>,
sspr&spr 2004, lncs 3138, pp. 1010<31>
1017, 2004
[23] qiang w., xiaolong w., yi g., <20>a study
of semi-discrete matrix decomposition for lsi
in automated text categorization<6F>, lncs,
volume 3248, jan 2005, pp. 606-615.
[24] schneider, k., techniques for improving
the performance of naive bayes for text
classification, lncs, vol. 3406, 2005, 682-
693.
[25] sebastiani f., <20>machine learning in
automated text categorization<6F>, acm
computing surveys, vol. 34 (1),2002, pp. 1-47.
[26] shanahan j. and roma n., improving svm
text classification performance through
threshold adjustment, lnai 2837, 2003, 361-
372
[27] soucy p. and mineau g., <20>feature
selection strategies for text categorization<6F>,
ai 2003, lnai 2671, 2003, pp. 505-509
[28] sousa p., pimentao j. p., santos b. r. and
moura-pires f., <20>feature selection algorithms
to improve documents classification
performance<EFBFBD>, lnai 2663, 2003, pp. 288-296
[29] sung-bae cho, jee-haeng lee, learning
neural network ensemble for practical text
classification, lecture notes in computer
science, volume 2690, aug 2003, pages 1032
<EFBFBD> 1036.
[30] torkkola k., <20>discriminative features for
text document classification<6F>, proc.
international conference on pattern
recognition, canada, 2002.
[31] vinciarelli a., <20>noisy text categorization,
pattern recognition<6F>, 17th international
conference on (icpr'04) , 2004, pp. 554-557
[32] y. yang, j. zhang and b. kisiel., <20>a
scalability analysis of classifiers in text
categorization<EFBFBD>, acm sigir'03, 2003, pp 96-
103
[33] y. yang. an evaluation of statistical
approaches to text categorization. journal of
information retrieval, 1(1/2):67<36>88, 1999.
[34] zhenya zhang, shuguang zhang, enhong
chen, xufa wang, hongmei cheng, textcc:
new feed forward neural network for
classifying documents instantly, lecture
notes in computer science, volume 3497, jan
2005, pages 232 <20> 237.
[35] shuigeng zhou, jihong guan, evaluation
and construction of training corpuses for text
classification: a preliminary study, lecture
notes in computer science, volume 2553, jan
2002, page 97-108.
[36] verayuth lertnattee, thanaruk
theeramunkong, parallel text categorization
for multi-dimensional data, lecture notes in
computer science, volume 3320, jan 2004,
pages 38 - 41
[37] wang qiang, wang xiaolong, guan yi, a
study of semi-discrete matrix decomposition
for lsi in automated text categorization,
lecture notes in computer science, volume
3248, jan 2005, pages 606 <20> 615.
[38] zu g., ohyama w., wakabayashi t.,
kimura f., "accuracy improvement of
automatic text classification based on feature
transformation": proc: the 2003 acm
symposium on document engineering,
november 20-22, 2003, pp.118-120

30
texte_3.txt Normal file
View File

@@ -0,0 +1,30 @@
Trattamenti statistici dei dati testuali (L. Lebart, CNRS-ENST; lebart@enst.fr)
Il materiale statistico <20> testo <20> <20> onnipresente, quasi banale, sin dallo sviluppo di Internet e del web. Lo studio quantitativo e statistico di questi testi sembra essere apparso di recente, eppure gli studi statistici sui testi risalgono a diversi decenni fa, in particolare in Francia con i lavori di P. Guiraud (<28> Problemi e metodi di statistica linguistica <20>, PUF, 1960), C. Muller (<28> Principi e metodi di statistica lessicale <20>, Hachette, 1977) e successivamente J.P. Benz<6E>cri (<28> Pratica dell'analisi dei dati, vol. 3: Linguistica e lessicologia <20>, Dunod, 1981).
Dopo la <20> stilometria <20>, dedicata allo studio della forma dei testi, al fine di identificare un autore o di datare un'opera, sono apparse le tecniche di documentazione automatica (information retrieval in inglese), che mirano a ricercare in un database di documenti (articoli scientifici, riassunti, brevetti, ecc.) gli elementi pertinenti a partire da una richiesta espressa sotto forma di testo libero. Il campo disciplinare <20> trattamento del linguaggio naturale <20> <20> poi emerso e si <20> sviluppato inizialmente come uno dei campi di applicazione privilegiati dell'intelligenza artificiale. La complessit<69> del materiale, la necessit<69> di assimilare enormi corpus di testi, la rilevanza del concetto di apprendimento hanno naturalmente aperto questo campo ai metodi statistici. La statistica multidimensionale, le catene di Markov nascoste, i metodi di analisi discriminante intervengono quindi per costruire gli strumenti di base che sono i motori di ricerca sul web, gli analizzatori morfosintattici, i correttori ortografici, nonch<63> in campi applicativi pratici come il trattamento delle risposte alle domande aperte nelle indagini socio-economiche.
Le domande aperte
In un certo numero di situazioni di indagine, è utile lasciare aperte alcune domande, le cui risposte si presenteranno quindi sotto forma di testi di lunghezza variabile.
La raccolta dei dati
In almeno tre situazioni comuni, l'uso di domande aperte è necessario:
Per ridurre o ottimizzare la durata dell'intervista di indagine: Sebbene le risposte libere e quelle guidate forniscano informazioni di natura diversa, le prime sono pi<70> economiche in termini di tempo di intervista e generano meno stanchezza. Una semplice domanda aperta (ad esempio: <20> Quali sono state le tue principali attivit<69> domenica scorsa? <20>) pu<70> sostituire lunghe liste di elementi.
Come complemento a domande chiuse: Di solito si tratta della domanda « Perché? ». Le spiegazioni riguardanti una risposta già data devono necessariamente essere spontanee. Un elenco di elementi potrebbe suggerire nuovi argomenti che potrebbero compromettere l'autenticità dell'argomentazione.
Per raccogliere informazioni che devono essere spontanee per loro natura: I questionari delle indagini di marketing abbondano di domande di questo tipo. Esempi includono: <20> Cosa ricordi di questa campagna pubblicitaria? <20> oppure <20> Cosa pensi di questa auto? <20>.
Unità statistiche
I programmi lavorano a partire dal testo grezzo, estraendo automaticamente delle unit<69> statistiche, per lo pi<70> forme grafiche (sequenze di caratteri non separatori). Si usa il termine forma grafica perch<63> la parola <20> parola <20> <20> ambigua. Pu<50> infatti riferirsi all'occorrenza di una parola, al tipo, oppure al lemma (ad esempio, <20> avere <20> <20> il lemma di <20> aveva <20>).
nel caso dell<6C>esempio precedente per 1009 risposte si ottengono 14337 occorrenze di 1394 forme distinte (o tipi) <20> ben noto che la distribuzione di frequenza delle parole <20> molto asimmetrica (legge di zipf, simile alla distribuzione di pareto) cos<6F> selezionando solo le forme che appaiono almeno 20 volte rimane un testo di 10994 forme con solo 97 forme distinte (cos<6F> il 7% delle parole distinte corrisponde al 77% del testo totale) in particolare quasi la met<65> delle forme grafiche distinte appare una sola volta (queste sono gli <20> hapax <20>)
il post-codifica
il pretrattamento empirico chiamato <20> post-codifica <20> permette di chiudere a posteriori le domande aperte questa tecnica comune consiste nel costruire una serie di elementi a partire da un sotto-campione di risposte per poi codificare tutte le risposte in modo da sostituire la domanda aperta con una o pi<70> domande chiuse per l<>esempio sopra la seconda risposta la pi<70> semplice darebbe gli elementi <20> lettura <20> <20> viaggi <20> <20> tempo libero <20> a condizione che questi elementi appaiano con una certa frequenza nel campione di risposte tuttavia la prima risposta <20> pi<70> difficile da post-codificare
gli strumenti statistici di base
gli strumenti di base comprendono la selezione di forme caratteristiche la selezione di risposte modali l'analisi delle corrispondenze e la classificazione delle tabelle lessicali
forme o segmenti caratteristici (o specificità)
le forme caratteristiche sono le forme <20> anormalmente <20> frequenti nelle risposte di un gruppo di individui (tecnica proposta da p lafon nel 1980) un test elementare basato sulla legge ipergeometrica permette di selezionare le parole (forme grafiche o lemmi) la cui frequenza in un gruppo <20> significativamente superiore (o inferiore per le parole anti-caratteristiche) alla frequenza media nel corpus si tratta di test classici di confronto delle frequenze ma la ripetizione di questo test porta a prendere soglie di significativit<69> molto rigide (fenomeno di confronti multipli ben noto agli statistici)
nell<EFBFBD>esempio citato sopra la frequenza media della parola lavoro nel corpus era del 3,4%; per il gruppo delle donne oltre i 55 anni la frequenza <20> solo dell<6C>1,2% questa differenza <20> altamente significativa (si pu<70> esprimere il test di confronto delle frequenze in termini di scarti standard nella ipotesi di omogeneit<69> delle frequenze il valore del 1,2% <20> a 4,5 scarti standard dal valore medio del 3,4%) poich<63> si tratta di una frequenza anormalmente bassa si parler<65> di parole anti-caratteristiche
le selezioni delle risposte modali
per un gruppo di individui e quindi per il raggruppamento delle risposte corrispondenti le risposte modali (o frasi caratteristiche o documenti-tipo la terminologia varia a seconda dei campi di applicazione) sono risposte originali del corpus di base che caratterizzano meglio il gruppo si pu<70> per ogni raggruppamento calcolare la distanza del profilo lessicale di un individuo dal profilo lessicale medio del gruppo poi si possono ordinare le distanze in ordine crescente e selezionare le risposte pi<70> rappresentative in termini di profilo lessicale che corrisponderanno alle distanze minori si ottiene cos<6F> una sorta di sintesi delle risposte di ogni gruppo costituita da risposte originali (l lebart e a salem statistica testuale dunod 1994) sempre nel caso del nostro esempio <20> essere felice avere un buon lavoro successo professionale e familiare <20> <20> una risposta caratteristica dei giovani uomini <20> la salute la famiglia <20> <20> una risposta che caratterizza le persone pi<70> anziane in pratica si utilizzano pi<70> risposte caratteristiche per ogni gruppo
analisi delle corrispondenze e classificazione
il volume dei dati richiede l'uso di potenti strumenti di descrizione i metodi di analisi delle corrispondenze e di classificazione possono descrivere le tabelle di contingenza che incrociano le risposte con le forme grafiche o gruppi di risposte (ad esempio raggruppamenti in base al livello di istruzione dei rispondenti) e le forme grafiche questi strumenti permettono di visualizzare sotto forma di serie di mappe piane (o dendrogrammi nel caso dei metodi di classificazione o mappe auto-organizzate di kohonen metodo <20> neurale <20> di visualizzazione) le associazioni tra parole (forme) e gruppi o modalit<69> cos<6F> una visualizzazione delle prossimit<69> tra parole e categorie socio-professionali pu<70> aiutare a leggere le risposte di ciascuna di queste categorie
conclusioni e prospettive
per risposte semplici e stereotipate come abbiamo visto le procedure di post-codifica possono funzionare tuttavia tra i difetti di questo tipo di trattamento si possono menzionare:
la mediazione del codificatore: le decisioni da prendere sono talvolta difficili
la qualit<69> dell'espressione il registro del vocabolario la tonalit<69> generale dell'intervista sono elementi di analisi persi durante la post-codifica (bisogna codificare in modo diverso <20> non lo so <20> e <20> preferisco non dire nulla <20>?)
le risposte composite complesse e molto diverse sono difficili da post-codificare ed <20> spesso in questi casi che il valore euristico delle risposte libere <20> maggiore
le risposte poco frequenti originali e poco chiare a una prima lettura sono considerate come <20> rumore <20> e assegnate a categorie residuali (<28> altre <20>) che sono quindi molto eterogenee e difficili da gestire senza che sia necessario procedere a una post-codifica attualmente <20> possibile a partire da un insieme di testi e da una soglia di frequenza per le forme grafiche ottenere una visualizzazione delle prossimit<69> tra testi in base ai loro profili lessicali e tra forme grafiche in base alla loro distribuzione nei testi l'arricchimento delle unit<69> statistiche con segmenti ripetuti cf a salem pratica dei segmenti ripetuti klincksieck 1987 i loro raggruppamenti per categorizzazione morfologica l'utilizzo delle forme caratteristiche o specificit<69> l'aggiunta delle risposte modali o delle frasi o unit<69> di contesto caratteristiche hanno perfezionato questi approcci e messo a disposizione di molti utenti metodi e software utili in alcuni specifici ambiti applicativi come il trattamento automatico delle risposte alle domande aperte che ci interessa qui l'efficacia del metodo come complemento alle approcci tradizionali <20> riconosciuta parallelamente ai lavori dell'industria della lingua che abbiamo menzionato in precedenza e che fanno parte di un'ingegneria statistica complessa esistono quindi applicazioni testuali della statistica a portata di mano richiedono sicuramente software specifici ma la natura familiare e viva del materiale di base compensa in qualche modo la relativa complessit<69> dei trattamenti e le difficolt<6C> di interpretazione vicino alle basi di dati all'intelligenza artificiale e alle reti neurali alla teoria dell'apprendimento alle tecniche recenti di estrazione e gestione della conoscenza il dominio testuale illustra bene la polivalenza e la potenza della metodologia statistica anche quando i metodi assumono nomi pi<70> esotici come text mining o text mining il lavoro dello statistico <20> sempre necessario quando si tratta di 
conoscere la portata reale dei fatti osservati e dei tratti strutturali ottenuti di sapere cosa si pu<70> affermare e cosa non si deve dire ovvero di dare uno statuto scientifico ai risultati

8
utils/__init__.py Normal file
View File

@@ -0,0 +1,8 @@
"""
Ce module contient des fonctions utilitaires
"""
def normalize_probabilities(prob_fr: float, prob_en: float, prob_it: float, searched: float) -> float:
    """Return ``searched`` as a fraction of the three probabilities' total.

    Args:
        prob_fr: First term of the normalising total (French, judging by
            the parameter name).
        prob_en: Second term of the normalising total (English, by name).
        prob_it: Third term of the normalising total (Italian, by name).
        searched: The value to normalise. Presumably one of the three
            probabilities above, but this is not enforced here.

    Returns:
        ``searched / (prob_fr + prob_en + prob_it)``, or ``0.0`` when the
        total is zero.
    """
    # Named `total` rather than `sum` so the builtin is not shadowed.
    total = prob_fr + prob_en + prob_it
    if total == 0:
        # All three terms are zero (e.g. after floating-point underflow):
        # there is nothing to normalise against, so report 0.0 instead of
        # raising ZeroDivisionError.
        return 0.0
    return searched / total

Binary file not shown.