Feat: Adds the HMM and detection for one word
This commit is contained in:
10
.idea/.gitignore
generated
vendored
Normal file
10
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Ignored default folder with query files
|
||||
/queries/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
20
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
20
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="N803" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<list>
|
||||
<option value="pandas" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.14 (tp_mapel_1)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.14 (tp_mapel_1)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/tp_mapel_1.iml" filepath="$PROJECT_DIR$/.idea/tp_mapel_1.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
10
.idea/tp_mapel_1.iml
generated
Normal file
10
.idea/tp_mapel_1.iml
generated
Normal file
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.14 (tp_mapel_1)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
119
HMM/__init__.py
Normal file
119
HMM/__init__.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
Ce module contient la classe qui représente un HMM
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class HMM:
    """Hidden Markov model whose hidden states are the 26 lowercase letters.

    The model is made of a uniform initial distribution (pi), a transition
    matrix (A) estimated from a corpus of index-encoded words, and an emission
    matrix (B) loaded from a spreadsheet.  ``forward`` and ``backward`` both
    compute the likelihood of an observation sequence under the model.
    """

    # Number of hidden states: one per letter of the alphabet.
    ALPHABET_SIZE: int = 26

    # S -- language labels.
    # NOTE(review): not read anywhere in this class; the states actually used
    # by the computations below are the 26 letters.
    states: list[str] = ["French", "English", "Italian"]
    # pi -- initial state distribution
    initial_probabilities: np.ndarray
    # A -- transition_matrix[i, j] is the probability of moving from i to j
    transition_matrix: np.ndarray
    # B -- emission_matrix[j, o] is b_j(o): probability of observing o in state j
    emission_matrix: np.ndarray

    def __init__(self, emission_matrix_file_name: str, numeric_text: np.ndarray):
        """
        Build every component of the model.

        Warning: slow, because it reads a spreadsheet and scans the whole corpus.

        :param emission_matrix_file_name: path of the Excel file holding the emission matrix
        :param numeric_text: corpus given as a sequence of words, each word being
            a sequence of alphabet indices
        """
        self.generate_emission_matrix(emission_matrix_file_name)
        self.generate_initial_probabilities()
        self.generate_transition_matrix(numeric_text)

    def generate_initial_probabilities(self):
        """Set pi to the uniform distribution: every letter starts with probability 1/26."""
        self.initial_probabilities = np.full(self.ALPHABET_SIZE, 1 / self.ALPHABET_SIZE)

    def generate_emission_matrix(self, file_name) -> None:
        """
        Read the emission matrix from an Excel file and store it in
        ``self.emission_matrix`` as a float array.

        The first column of the sheet is dropped (presumably row labels --
        TODO confirm against the spreadsheet).

        :param file_name: path of the Excel file
        :return: None, the matrix is stored on the instance
        """
        self.emission_matrix = pd.read_excel(file_name).iloc[:, 1:].to_numpy(dtype=float)

    def generate_transition_matrix(self, numeric_text: np.ndarray) -> None:
        """
        Estimate the transition matrix by counting letter-to-letter transitions
        inside each word, then normalising every row into probabilities.

        Warning: not optimised, this is a plain Python loop over every letter pair.

        :param numeric_text: iterable of words, each word being a sequence of
            alphabet indices (a list of lists works as well as an array)
        :return: None, the matrix is stored on the instance
        """
        size = self.ALPHABET_SIZE
        counts = np.zeros((size, size), dtype=float)

        # Count the occurrences of every transition (one letter followed by another).
        for word in numeric_text:
            for current, following in zip(word[:-1], word[1:]):
                # A rectangular corpus may be padded with NaN to equalise word
                # lengths; padding entries are not real letters.
                if np.isnan(current) or np.isnan(following):
                    continue
                # A negative index (-1 is what str.find returns for an unknown
                # character) would silently wrap around to the last letter.
                if current < 0 or following < 0:
                    continue
                counts[int(current), int(following)] += 1

        # Total number of transitions leaving each letter.
        row_sums = counts.sum(axis=1, keepdims=True)
        # Rows with no outgoing transition stay at zero instead of dividing by zero.
        self.transition_matrix = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0)

    def _parameters(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Return (pi, A, B) as float arrays restricted to the N states defined by pi."""
        pi = np.asarray(self.initial_probabilities, dtype=float)
        n_states = len(pi)
        A = np.asarray(self.transition_matrix, dtype=float)[:n_states, :n_states]
        B = np.asarray(self.emission_matrix, dtype=float)[:n_states]
        return pi, A, B

    @staticmethod
    def _check_observations(O) -> None:
        """Reject observation sequences that are empty or contain a negative index."""
        if len(O) == 0:
            raise ValueError("the observation sequence must contain at least one symbol")
        if any(obs < 0 for obs in O):
            raise ValueError("observation indices must be non-negative")

    def forward(self, O: list[int]) -> tuple[float, np.ndarray]:
        """
        Forward algorithm.

        :param O: the word to identify, as a sequence of alphabet indices
        :return: (likelihood of O under this model, final alpha vector)
        :raises ValueError: if O is empty or contains a negative index
        """
        self._check_observations(O)
        pi, A, B = self._parameters()

        # Initialisation: alpha_1(i) = pi_i * b_i(o_1)
        alpha = pi * B[:, O[0]]

        # Induction: alpha_{t+1}(j) = b_j(o_{t+1}) * sum_i alpha_t(i) * a_ij
        # (alpha @ A) computes the sum over i for every j at once.
        for obs in O[1:]:
            alpha = (alpha @ A) * B[:, obs]

        # Termination: the likelihood is the sum of the last alpha values.
        return float(np.sum(alpha)), alpha

    def backward(self, O: list[int]) -> tuple[float, np.ndarray]:
        """
        Backward algorithm.

        :param O: the word to identify, as a sequence of alphabet indices
        :return: (likelihood of O under this model, beta vector at the first position)
        :raises ValueError: if O is empty or contains a negative index
        """
        self._check_observations(O)
        pi, A, B = self._parameters()

        # Initialisation: beta_T(i) = 1
        beta = np.ones(len(pi))

        # Walk back from the last observation to the second one:
        # beta_t(i) = sum_j a_ij * b_j(o_{t+1}) * beta_{t+1}(j)
        for obs in O[:0:-1]:
            beta = A @ (B[:, obs] * beta)

        # Termination: sum_i pi_i * b_i(o_1) * beta_1(i)
        return float(np.sum(pi * B[:, O[0]] * beta)), beta
|
||||
BIN
HMM/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
HMM/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
Matrice de transition/EN.pdf
Normal file
BIN
Matrice de transition/EN.pdf
Normal file
Binary file not shown.
BIN
Matrice de transition/FR.pdf
Normal file
BIN
Matrice de transition/FR.pdf
Normal file
Binary file not shown.
89
data_preparation/__init__.py
Normal file
89
data_preparation/__init__.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""
|
||||
Ce module contient les fonctions pour préparer les données.
|
||||
|
||||
Cela consiste à:
|
||||
- Lire un fichier
|
||||
- Nettoyer les données
|
||||
- Tout transformer en dataframe d'index de l'alphabet.
|
||||
"""
|
||||
import re
|
||||
|
||||
|
||||
def read_file(file_name: str) -> str:
    """
    Read a text file and return its raw content, without any cleaning.

    The file is decoded as UTF-8 first.  If it is not valid UTF-8 it is
    decoded as Latin-1 instead, so reading no longer depends on the platform's
    default encoding and cannot fail on accented legacy files.

    :param file_name: path of the file to read
    :return: the decoded text
    """
    try:
        with open(file_name, encoding="utf-8") as file:
            return file.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this fallback always succeeds.
        with open(file_name, encoding="latin-1") as file:
            return file.read()
|
||||
|
||||
|
||||
def parse_data(raw_data: str) -> str:
    """
    Normalise a raw text: lower-case it, then replace every character that is
    not one of the letters a-z by a single space.

    :param raw_data: the text to clean
    :return: a string of the same length made only of a-z and spaces
    """
    non_letter = re.compile(r'[^a-z]')
    return non_letter.sub(' ', raw_data.lower())
|
||||
|
||||
|
||||
def prepare_file(file_name) -> str:
    """
    Load a file and clean its content in one step.

    :param file_name: path of the file to read
    :return: the file content after going through ``parse_data``
    """
    return parse_data(read_file(file_name))
|
||||
|
||||
|
||||
def get_alphabet_index_of(letter: str) -> int:
    """
    Return the position of a letter in the alphabet, the alphabet being seen
    as an array (ex: a -> 0, z -> 25).

    :param letter: a single lowercase letter
    :return: the index of the letter, or -1 when the argument is not exactly
        one of the letters a-z
    """
    # The alphabet is also the set of states of the model.
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # str.find does substring search: without this guard '' and 'ab' would
    # both be reported at index 0 as if they were the letter 'a'.
    if len(letter) != 1:
        return -1
    return alphabet.find(letter)
|
||||
|
||||
|
||||
def get_alphabet_index_form_word(word: str) -> list[int]:
    """
    Encode a word as the list of alphabet indices of its characters.

    :param word: the word to encode
    :return: one index per character, in order, as given by ``get_alphabet_index_of``
    """
    return list(map(get_alphabet_index_of, word))
|
||||
|
||||
|
||||
def get_text_in_alphabet_index_form(text: str) -> list[list[int]]:
    """
    Turn a text into a list of words, each word being a list of integers.
    Each integer is the index of the character in the alphabet.

    Words are separated by any run of whitespace (spaces, tabs, newlines), so
    repeated separators never produce empty words.

    :param text: the text to encode
    :return: one list of alphabet indices per word, in reading order
    """
    # str.split() without argument splits on any whitespace run and drops
    # empty strings, which replaces the manual "skip empty word" filter.
    return [get_alphabet_index_form_word(word) for word in text.split()]
|
||||
|
||||
|
||||
def prepare_data(file_name: str) -> list[list[int]]:
    """
    Full preparation pipeline: read the file, clean its content, then encode
    every word as a list of alphabet indices.

    :param file_name: path of the corpus file
    :return: the corpus as a list of index-encoded words
    """
    return get_text_in_alphabet_index_form(prepare_file(file_name))
|
||||
|
||||
BIN
data_preparation/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
data_preparation/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
1
english.txt
Normal file
1
english.txt
Normal file
File diff suppressed because one or more lines are too long
1
french.txt
Normal file
1
french.txt
Normal file
File diff suppressed because one or more lines are too long
1
italian.txt
Normal file
1
italian.txt
Normal file
File diff suppressed because one or more lines are too long
43
main.py
Normal file
43
main.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Note: I code in english but comment in French !
|
||||
"""
|
||||
import data_preparation
|
||||
from HMM import HMM
|
||||
import utils
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Load the three corpora first, then build one model per language.
    # All models share the same emission spreadsheet; only the corpus differs.
    corpus_files = ('french.txt', 'english.txt', 'italian.txt')
    numeric_texts = [data_preparation.prepare_data(path) for path in corpus_files]
    lambda_fr, lambda_en, lambda_it = [HMM('matrice_emission.xls', text) for text in numeric_texts]
    models = (lambda_fr, lambda_en, lambda_it)

    # Encode the word to classify; the text holds a single word, so take the first entry.
    word = data_preparation.get_text_in_alphabet_index_form('probablement')[0]

    # Forward pass: likelihood of the word under each model, then normalisation.
    forward_scores = [model.forward(word)[0] for model in models]
    proba_fr, proba_en, proba_it = [
        utils.normalize_probabilities(*forward_scores, score) for score in forward_scores
    ]

    print('Résultats forward ---------------------------------------------------')
    print(f'FR={proba_fr}, EN={proba_en}, IT={proba_it}, Conclusion={max(proba_fr, proba_en, proba_it)}')

    # Backward pass: same comparison using the backward algorithm.
    backward_scores = [model.backward(word)[0] for model in models]
    proba_back_fr, proba_back_en, proba_back_it = [
        utils.normalize_probabilities(*backward_scores, score) for score in backward_scores
    ]

    print('Résultat backward ---------------------------------------------------')
    print(f'FR={proba_back_fr}, EN={proba_back_en}, IT={proba_back_it}, Conclusion={max(proba_back_fr, proba_back_en, proba_back_it)}')
|
||||
BIN
matrice_emission.xls
Normal file
BIN
matrice_emission.xls
Normal file
Binary file not shown.
168
texte_1.txt
Normal file
168
texte_1.txt
Normal file
@@ -0,0 +1,168 @@
|
||||
les traitements statistiques de donnees textuelles. (l. lebart, cnrs-enst ; lebart@enst.fr)
|
||||
le materiau statistique « texte » est omnipresent, presque banal, depuis le developpement
|
||||
d<EFBFBD>internet et de la toile (web). l<>etude quantitative et statistique de ces textes semble avoir fait
|
||||
irruption recemment, et pourtant les etudes statistiques de textes datent de plusieurs
|
||||
decennies, avec notamment en france les travaux de p. guiraud (problemes et methodes de la
|
||||
statistique linguistique, puf, 1960), c. muller (principes et methodes de statistique lexicale,
|
||||
hachette, 1977) puis de j.p. benzecri (pratique de l’analyse des donnees, tome 3 :
|
||||
linguistique et lexicologie, dunod, 1981).
|
||||
apres la « stylometrie », consacree à l’etude de la forme des textes, en vue d’identifier un
|
||||
auteur ou de dater une oeuvre, sont apparues les techniques de documentation automatique
|
||||
(information retrieval en anglais), visant à rechercher dans une base de documents (articles
|
||||
scientifiques, resumes, brevets, …) le ou les elements pertinents à partir d’une requête
|
||||
exprimee sous forme de textes libres. le champ disciplinaire « traitement du langage
|
||||
naturel » est alors apparu, et s’est developpe, au depart, comme un des domaines
|
||||
d<EFBFBD>application privilegie de l<>intelligence artificielle. la complexite du materiau, le besoin
|
||||
d<EFBFBD>assimiler d<>immenses corpus de textes, la pertinence du concept d<>apprentissage ont
|
||||
naturellement ouvert ce champ aux methodes statistiques. la statistique multidimensionnelle,
|
||||
les chaînes de markov cachees, les methodes d’analyse discriminantes interviennent ainsi
|
||||
pour construire les outils de base que sont les moteurs de recherche sur le web, les analyseurs
|
||||
morphosyntactiques, les correcteurs orthographiques, ainsi que dans des champs d<>application
|
||||
pratiques comme le traitement des reponses aux questions ouvertes dans les enqu<71>tes socioeconomiques.
|
||||
les questions ouvertes
|
||||
il est utile, dans un certain nombre de situations d'enqu<71>te, de laisser ouvertes certaines
|
||||
questions, dont les reponses se presenteront donc sous forme de textes de longueurs variables.
|
||||
le recueil des donnees
|
||||
dans au moins trois situations courantes, l'utilisation d'un questionnement ouvert s'impose :
|
||||
pour diminuer ou optimiser la duree de l<>entrevue d<>enqu<71>te
|
||||
bien que les reponses libres et les reponses guidees fournissent des informations de natures
|
||||
differentes, les premieres sont plus economiques que les secondes en temps d'interview et
|
||||
generent moins de fatigue. une simple question ouverte (par exemple : "quelles furent vos
|
||||
principales activites dimanche dernier ?") peut remplacer de longues listes d'items.
|
||||
comme complement à des questions fermees
|
||||
il s'agit le plus souvent de la question: "pourquoi ?". les explications concernant une reponse
|
||||
dejà donnee doivent necessairement être spontanee. une batterie d'items risquerait de
|
||||
proposer de nouveaux arguments qui pourraient nuire à l'authenticite de l'explication. l'utilite
|
||||
de la question pourquoi ? a ete soulignee par de nombreux auteurs, et ce sont en fait les
|
||||
difficultes et le coût de l'exploitation qui en limitent l'usage. elle seule permet en effet de
|
||||
savoir si les differentes categories de personnes interrogees ont compris la question fermee de
|
||||
la même façon.
|
||||
pour recueillir une information qui doit, par nature, être spontanee
|
||||
les questionnaires des enqu<71>tes de marketing abondent en questions de ce type. citons par
|
||||
exemple : "qu'avez-vous retenu de cette campagne publicitaire ?", "que pensez-vous de cette
|
||||
voiture ?". notons cependant que les questions ouvertes sont considerees comme peu
|
||||
adaptees aux problemes de memorisation de comportement. "quels magazines avez-vous lus
|
||||
la semaine derniere ?", "quelles sont les dernieres emissions de television que vous avez
|
||||
aimees ?". pour ces questions qui font l'objet d'enqu<71>tes periodiques, il a ete prouve maintes
|
||||
fois que les questions fermees donnent des taux d'oubli plus faibles. en revanche, quand la
|
||||
qualite de la memorisation est en jeu, la forme ouverte reste indispensable.
|
||||
voici quatre exemples de reponses <20> la question <20> quelle est pour vous la chose la plus
|
||||
importante dans la vie ? <20> (question posee <20> des echantillons d<>environ mille personnes dans
|
||||
sept pays en 1991).
|
||||
1) la sante, ne pas manquer d'argent, avoir une bonne ambiance familiale, je voudrais
|
||||
pouvoir aider les enfants abandonnes, leur redonner le go<67>t <20> la vie, pouvoir aider les
|
||||
personnes <20>gees handicapees, secourir les gens autour de soi.
|
||||
2) c'est de faire ce qu'on veut. lire, voyager si je pouvais. les loisirs si on pouvait.
|
||||
3) la sante puisqu'il faut toujours travailler quand on est commer<65>ant. une bonne entente en
|
||||
famille. avoir assez d'argent pour vivre.
|
||||
4) la famille, ma famille, mon foyer, vivre avec la societe : mon entourage les voisins, pour
|
||||
faire quelque chose qu'il y ait moins de malheureux, donner du travail aux jeunes surtout.
|
||||
ces exemples illustrent <20> la fois la complexite et la richesse des reponses.
|
||||
les unites statistiques
|
||||
les programmes travaillent <20> partir du texte brut, en extrayant automatiquement des unites
|
||||
statistiques, la plupart du temps des formes graphiques (sequences de caracteres nonseparateurs).
|
||||
on utilise le vocable forme graphique parce que le mot <20> mot <20> lui-m<>me est
|
||||
ambigu. il designe en effet selon les contextes l<>occurrence d<>un mot (quand on dit qu<71>un
|
||||
texte a huit cent mots, on parle bien s<>r d<>occurrences, et non de mots differents), le type (qui
|
||||
correspond <20> la forme graphique) et le lemme (avoir est le lemme de avait, et, dans certains
|
||||
cas seulement, de avions). la premiere reponse de l<>exemple ci-dessus contient 38
|
||||
occurrences, mais la forme graphique <20> les <20> appara<72>t trois fois, <20> pouvoir <20> appara<72>t deux
|
||||
fois. le lemme de <20> bonne <20> est bon (le masculin singulier, selon une convention fran<61>aise),
|
||||
celui de <20> voudrais <20> est <20> vouloir <20>.
|
||||
dans le cas de l<>exemple precedent, pour 1009 reponses, on obtient 14337 occurrences de
|
||||
1394 formes distinctes (ou types). il est bien connu que la distribution de frequence des mots
|
||||
est tres dissymetrique (loi dite de zipf, apparentee <20> la distribution de pareto). ainsi, en ne
|
||||
retenant que les formes apparaissant au moins 20 fois, il reste un texte de 10 994 formes, avec
|
||||
seulement 97 formes distinctes (ainsi 7 % des mots distincts correspondent <20> 77 % du texte
|
||||
global). en particulier, pres de la moitie des formes graphiques distinctes n’apparaissent qu’une
|
||||
fois ( ce sont les « hapax »).
|
||||
le post-codage
|
||||
le pretraitement empirique appele "post-codage" permet de fermer a posteriori les questions
|
||||
ouvertes. cette technique courante consiste <20> construire une batterie d'items <20> partir d'un sousechantillon
|
||||
de reponses, puis <20> codifier l'ensemble des reponses de fa<66>on <20> remplacer la
|
||||
question ouverte par une ou plusieurs questions fermees. pour l<>exemple ci-dessus, la seconde
|
||||
reponse, la plus simple, donnerait les items <20> lecture <20>, voyage <20>, <20> loisirs <20>, sous reserve que
|
||||
ces items apparaissent avec une certaine frequence dans l<>echantillon de reponses. en
|
||||
revanche la premiere reponse est plus delicate <20> post-coder.
|
||||
les outils statistique de base
|
||||
les outils de base sont la selection de formes caracteristiques, la selection de reponses
|
||||
modales, l'analyse des correspondances et la classification des tableaux lexicaux.
|
||||
formes ou segments caracteristiques (ou specificites)
|
||||
les formes caracteristiques sont les formes "anormalement" frequentes dans les reponses d'un
|
||||
groupe d'individus (technique propose par p. lafon en 1980). un test elementaire fonde sur la
|
||||
loi hypergeometrique permet de selectionner les mots (formes graphiques ou lemmes) dont la
|
||||
frequence dans un groupe est notablement superieure (ou inferieure pour les mots anticaracteristiques)
|
||||
<EFBFBD> la frequence moyenne dans le corpus. il s<>agit de test classique de
|
||||
comparaisons de frequences, mais la repetition de ce test conduit à prendre des seuils de
|
||||
signification tres severes (phenomene de comparaisons multiples bien connu des statisticiens).
|
||||
dans l<>exemple evoque plus haut, la frequence moyenne du mot travail dans le corpus etait de
|
||||
3.4 %; pour le groupe des femmes de plus de 55 ans, la frequence n<>est que de 1.2 %. cette
|
||||
difference est en fait hautement significative ( on peut exprimer le test de comparaison de
|
||||
frequences en termes d<>ecart-types : dans l<>hypothese d<>homogeneite des frequences, la
|
||||
valeur1.2% est <20> 4.5 ecart-types de la valeur moyenne 3.4). comme il s<>agit d<>une frequence
|
||||
anormalement faible, on parlera de mots anti-caracteristiques. [l<>individu statistique est ici
|
||||
l<EFBFBD>occurrence de mots. les femmes de plus de 55 ans ont emis 1349 mots dans leurs reponses.
|
||||
la variance de la frequence d<>un mot dont la frequence <20>theorique<75> est de 0.034 est donnee
|
||||
par la formule classique 0.034(1 - 0.034) /1349. on voit dans ces conditions que la frequence
|
||||
observee de 0.012 est <20> 4.5 ecart-types de 0.034].
|
||||
les selections des reponses modales
|
||||
pour un groupe d'individus donne, et donc pour le regroupement de reponses correspondant,
|
||||
les reponses modales (ou encore phrases caracteristiques, ou documents-type, la terminologie
|
||||
variant selon les domaines d'application) sont des reponses originales du corpus de base, ayant
|
||||
la propriete de caracteriser au mieux le groupe. on peut, pour chaque regroupement, calculer
|
||||
la distance du profil lexical d'un individu au profil lexical moyen du groupement. on peut
|
||||
ensuite classer les distances par ordre croissant, et donc selectionner les reponses les plus
|
||||
representatives au sens du profil lexical, qui correspondront aux plus petites distances. on
|
||||
obtient ainsi une sorte de resume des reponses de chaque regroupement, forme de reponses
|
||||
originales (l. lebart et a. salem, statistique textuelle, dunod, 1994). toujours dans le cas de
|
||||
notre exemple, <20>etre heureux, avoir un bon travail, reussite professionnelle et familiale<6C> est
|
||||
ainsi une reponse caracteristique des jeunes hommes; <20>la sante, la famille<6C> est une reponse
|
||||
caracterisant les plus âges. on utilise en pratique plusieurs reponses caracteristiques par
|
||||
groupe.
|
||||
analyse des correspondances et classification
|
||||
le volume des donnees demande que l<>on fasse appel <20> de puissants outils de description. les
|
||||
methodes d<>analyses des correspondances et de classification peuvent decrire les tables de contingence
|
||||
croisant les reponses et les formes graphiques, ou des groupes de reponses (par exemple regroupement
|
||||
selon le niveau d'instruction des repondants) et les formes graphiques. elles permettent de visualiser
|
||||
sous forme de series de cartes planes (ou de dendrogrammes dans le cas des methodes de
|
||||
classification, ou de cartes auto-associatives de kohonen, methode <20>neuronale<6C> de visualisation) les
|
||||
associations entre mots (formes) et groupes ou modalites. ainsi, une visualisation des proximites entre
|
||||
mots et categories socioprofessionnelles pourra aider la lecture des reponses de chacune de ces
|
||||
categories.
|
||||
conclusions et ouvertures
|
||||
pour des reponses simples et stereotypees, nous l<>avons vu, les procedures de post-codage
|
||||
peuvent fonctionner. mentionnons cependant parmi les defauts de ce type de traitement :
|
||||
la mediation du chiffreur: les decisions <20> prendre sont parfois difficiles.
|
||||
la qualite de l'expression, le registre du vocabulaire, la tonalite generale de l'entretien sont
|
||||
des elements d'analyse perdus lors d'un post-codage (doit-on coder differemment <20> je ne sais
|
||||
pas<EFBFBD> et <20>je prefere ne rien dire<72> ?.
|
||||
les reponses composites, complexes, d'une grande diversite, sont tres difficile <20> post-coder,
|
||||
et c'est souvent dans ce cas que la valeur heuristique des reponses libres est la plus grande.
|
||||
les reponses peu frequentes, originales, peu claires en premiere lecture sont considerees
|
||||
comme du <20>bruit<69>, et affectees <20> des items residuels (<28>autres<65>) qui sont donc tres
|
||||
heterogenes et sont difficiles <20> manipuler.
|
||||
sans qu<71>il soit necessaire de proceder <20> un post codage, on peut, actuellement, <20> partir d'une
|
||||
ensemble de textes, et d'un seuil de frequence pour les formes graphiques, obtenir une
|
||||
visualisation des proximites entre textes (vis-<2D>-vis de leurs profils lexicaux) et entre formes
|
||||
graphiques (vis-<2D>-vis de leur repartition dans les textes). l'enrichissement des unites
|
||||
statistiques par les segments repetes,(cf. a. salem, pratique des segments repetes,
|
||||
klincksieck, 1987), leurs regroupements par categorisation morphologique, l'utilisation des
|
||||
formes caracteristiques ou specificites, l'adjonction des reponses modales ou des phrases ou
|
||||
unites de contexte caracteristiques ont perfectionne ces approches, et mis <20> la disposition de
|
||||
beaucoup d'utilisateurs des methodes et des logiciels utiles. dans certains domaines
|
||||
d'application precis (comme le traitement automatique des reponses aux questions ouvertes,
|
||||
qui nous interesse ici), l'efficacite de la methode, comme complement des approches
|
||||
traditionnelles, est reconnue.
|
||||
parallelement aux travaux relevant de l<>industrie de la langue, que nous avons evoques plus
|
||||
haut, et qui relevent d<>une ingenierie statistique complexe, il existe donc des applications
|
||||
textuelles de la statistique qui restent <20> portee de main. elles necessitent certes des logiciels
|
||||
specifiques, mais la nature familiere et vivante du materiau de base compense en quelque
|
||||
sorte la relative complexite des traitements et les difficultes d<>interpretation.
|
||||
proche des bases de donnees, de l<>intelligence artificielle et des reseaux de neurones, de la
|
||||
theorie de l<>apprentissage, des techniques recentes d<>extraction et de gestion des
|
||||
connaissances, le domaine textuel illustre bien la polyvalence et la puissance de la
|
||||
methodologie statistique. m<>me quand les methodes prennent parfois les noms plus exotiques
|
||||
de fouille de texte ou de text mining, le statisticien est toujours sollicite quand il s<>agit de
|
||||
connaître la portee reelle des faits observes et des traits structuraux obtenus, de savoir ce que
|
||||
l<EFBFBD>on a le droit de dire ou le devoir de ne pas dire, c<>est-<2D>-dire finalement de donner un statut
|
||||
scientifique aux resultats.
|
||||
758
texte_2.txt
Normal file
758
texte_2.txt
Normal file
@@ -0,0 +1,758 @@
|
||||
text classification using machine learning techniques
|
||||
m. ikonomakis
|
||||
department of mathematics
|
||||
university of patras, greece
|
||||
ikonomakis@mailbox.gr
|
||||
s. kotsiantis
|
||||
department of mathematics
|
||||
university of patras, greece
|
||||
sotos@math.upatras.gr
|
||||
v. tampakas
|
||||
technological educational
|
||||
institute of patras, greece
|
||||
tampakas@teipat.gr
|
||||
abstract: automated text classification has been considered as a vital method to manage and process a vast
|
||||
amount of documents in digital forms that are widespread and continuously increasing. in general, text
|
||||
classification plays an important role in information extraction and summarization, text retrieval, and questionanswering.
|
||||
this paper illustrates the text classification process using machine learning techniques. the
|
||||
references cited cover the major theoretical issues and guide the researcher to interesting research directions.
|
||||
key-words: text mining, learning algorithms, feature selection, text representation
|
||||
1 introduction
|
||||
automatic text classification has always been an
|
||||
important application and research topic since the
|
||||
inception of digital documents. today, text
|
||||
classification is a necessity due to the very large
|
||||
amount of text documents that we have to deal with
|
||||
daily.
|
||||
in general, text classification includes topic based
|
||||
text classification and text genre-based
|
||||
classification. topic-based text categorization
|
||||
classifies documents according to their topics [33].
|
||||
texts can also be written in many genres, for
|
||||
instance: scientific articles, news reports, movie
|
||||
reviews, and advertisements. genre is defined on
|
||||
the way a text was created, the way it was edited,
|
||||
the register of language it uses, and the kind of
|
||||
audience to it is addressed. previous work on
|
||||
genre classification recognized that this task differs
|
||||
from topic-based categorization [13].
|
||||
typically, most data for genre classification are
|
||||
collected from the web, through newsgroups,
|
||||
bulletin boards, and broadcast or printed news.
|
||||
they are multi-source, and consequently have
|
||||
different formats, different preferred vocabularies
|
||||
and often significantly different writing styles even
|
||||
for documents within one genre. namely, the data
|
||||
are heterogeneous.
|
||||
intuitively text classification is the task of
|
||||
classifying a document under a predefined
|
||||
category. more formally, if $d_i$ is a document of the
|
||||
entire set of documents $D$ and $\{c_1, c_2, \ldots, c_n\}$ is the
|
||||
set of all the categories, then text classification
|
||||
assigns one category $c_j$ to a document $d_i$.
|
||||
as in every supervised machine learning task, an
|
||||
initial dataset is needed. a document may be
|
||||
assigned to more than one category (ranking
|
||||
classification), but in this paper only researches on
|
||||
hard categorization (assigning a single category to
|
||||
each document) are taken into consideration.
|
||||
moreover, approaches, that take into consideration
|
||||
other information besides the pure text, such as
|
||||
hierarchical structure of the texts or date of
|
||||
publication, are not presented. this is because the
|
||||
main issue of this paper is to present techniques
|
||||
that exploit the most of the text of each document
|
||||
and perform best under this condition.
|
||||
sebastiani gave an excellent review of text
|
||||
classification domain [25]. thus, in this work apart
|
||||
from the brief description of the text classification
|
||||
we refer to some more recent works than those in
|
||||
sebastiani's article as well as a few articles that were
|
||||
not referred by sebastiani. in figure 1 is given the
|
||||
graphical representation of the text classification
|
||||
process.
|
||||
.
|
||||
fig. 1. text classification process
|
||||
the task of constructing a classifier for
|
||||
documents does not differ a lot from other tasks of
|
||||
machine learning. the main issue is the
|
||||
representation of a document [16]. in section 2 the
|
||||
document representation is presented. one
|
||||
particularity of the text categorization problem is
|
||||
read
|
||||
document
|
||||
tokenize
|
||||
text
|
||||
stemming
|
||||
delete
|
||||
stopwords
|
||||
vector representation of
|
||||
text
|
||||
feature selection and/or
|
||||
feature transformation
|
||||
learning
|
||||
algorithm
|
||||
that the number of features (unique words or
|
||||
phrases) can easily reach orders of tens of
|
||||
thousands. this raises big hurdles in applying many
|
||||
sophisticated learning algorithms to the text
|
||||
categorization.
|
||||
thus dimension reduction methods are called for.
|
||||
two possibilities exist, either selecting a subset of
|
||||
the original features [3], or transforming the
|
||||
features into new ones, that is, computing new
|
||||
features as some functions of the old ones [10]. we
|
||||
examine both in turn in section 3 and section 4.
|
||||
after the previous steps a machine learning
|
||||
algorithm can be applied. some algorithms have
|
||||
been proven to perform better in text classification
|
||||
tasks and are more often used; such as support
|
||||
vector machines. a brief description of recent
|
||||
modification of learning algorithms in order to be
|
||||
applied in text classification is given in section 5.
|
||||
there are a number of methods to evaluate the
|
||||
performance of machine learning algorithms in
|
||||
text classification. most of these methods are
|
||||
described in section 6. some open problems are
|
||||
mentioned in the last section.
|
||||
2 vector space document
|
||||
representations
|
||||
a document is a sequence of words [16]. so each
|
||||
document is usually represented by an array of
|
||||
words. the set of all the words of a training set is
|
||||
called vocabulary, or feature set. so a document
|
||||
can be presented by a binary vector, assigning the
|
||||
value 1 if the document contains the feature-word
|
||||
or 0 if the word does not appear in the document.
|
||||
this can be translated as positioning a document in
|
||||
a $\mathbb{R}^{|V|}$ space, where $|V|$ denotes the size of the
|
||||
vocabulary $V$.
|
||||
not all of the words presented in a document can
|
||||
be used in order to train the classifier [19]. there
|
||||
are useless words such as auxiliary verbs,
|
||||
conjunctions and articles. these words are called
|
||||
stopwords. there exist many lists of such words
|
||||
which are removed as a preprocess task. this is
|
||||
done because these words appear in most of the
|
||||
documents.
|
||||
stemming is another common preprocessing step.
|
||||
one way to reduce the size of the initial feature set
|
||||
is to remove misspelled words or words with the same
|
||||
stem. a stemmer (an algorithm which performs
|
||||
stemming), removes words with the same stem and
|
||||
keeps the stem or the most common of them as
|
||||
feature. for example, the words "train", "training",
|
||||
"trainer" and "trains" can be replaced with "train".
|
||||
although stemming is considered by the text
|
||||
classification community to amplify the classifiers'
|
||||
performance, there are some doubts on the actual
|
||||
importance of aggressive stemming, such as
|
||||
performed by the porter stemmer [25].
|
||||
an ancillary feature engineering choice is the
|
||||
representation of the feature value [16]. often a
|
||||
boolean indicator of whether the word occurred in
|
||||
the document is sufficient. other possibilities
|
||||
include the count of the number of times the word
|
||||
occurred in the document, the frequency of its
|
||||
occurrence normalized by the length of the
|
||||
document, the count normalized by the inverse
|
||||
document frequency of the word. in situations
|
||||
where the document length varies widely, it may be
|
||||
important to normalize the counts. further, in short
|
||||
documents words are unlikely to repeat, making
|
||||
boolean word indicators nearly as informative as
|
||||
counts. this yields a great savings in training
|
||||
resources and in the search space of the induction
|
||||
algorithm. it may otherwise try to discretize each
|
||||
feature optimally, searching over the number of
|
||||
bins and each bin's threshold.
|
||||
most of the text categorization algorithms in the
|
||||
literature represent documents as collections of
|
||||
words. an alternative which has not been
|
||||
sufficiently explored is the use of word meanings,
|
||||
also known as senses. kehagias et al. using several
|
||||
algorithms, compared the categorization
|
||||
accuracy of classifiers based on words to that of
|
||||
classifiers based on senses [12]. the document
|
||||
collection on which this comparison took place is a
|
||||
subset of the annotated brown corpus semantic
|
||||
concordance. a series of experiments indicated that
|
||||
the use of senses does not result in any significant
|
||||
categorization improvement.
|
||||
3 feature selection
|
||||
the aim of feature-selection methods is the
|
||||
reduction of the dimensionality of the dataset by
|
||||
removing features that are considered irrelevant for
|
||||
the classification [6]. this transformation
|
||||
procedure has been shown to present a number of
|
||||
advantages, including smaller dataset size, smaller
|
||||
computational requirements for the text
|
||||
categorization algorithms (especially those that do
|
||||
not scale well with the feature set size) and
|
||||
considerable shrinking of the search space. the
|
||||
goal is the reduction of the curse of dimensionality
|
||||
to yield improved classification accuracy. another
|
||||
benefit of feature selection is its tendency to reduce
|
||||
overfitting, i.e. the phenomenon by which a
|
||||
classifier is tuned also to the contingent
|
||||
characteristics of the training data rather than the
|
||||
constitutive characteristics of the categories, and
|
||||
therefore, to increase generalization.
|
||||
methods for feature subset selection for text
|
||||
document classification task use an evaluation
|
||||
function that is applied to a single word [27].
|
||||
scoring of individual words (best individual
|
||||
features) can be performed using some of the
|
||||
measures, for instance, document frequency, term
|
||||
frequency, mutual information, information gain,
|
||||
odds ratio, $\chi^2$ statistic and term strength [3], [30],
|
||||
[6], [28], [27]. what is common to all of these
|
||||
feature-scoring methods is that they conclude by
|
||||
ranking the features by their independently
|
||||
determined scores, and then select the top scoring
|
||||
features. the most common metrics are presented
|
||||
in table 1. the symbolisms that are presented in
|
||||
table 1 are described in table 2.
|
||||
in contrast to best individual features
|
||||
(bif) methods, sequential forward selection (sfs)
|
||||
methods firstly select the best single word
|
||||
evaluated by given criterion [20]; then, add one
|
||||
word at a time until the number of selected words
|
||||
reaches desired k words. sfs methods do not result
|
||||
in the optimal words subset but they take note of
|
||||
dependencies between words as opposed to the bif
|
||||
methods. therefore sfs often give better results
|
||||
than bif. however, sfs are not usually used in
|
||||
text classification because of their computation cost
|
||||
due to large vocabulary size.
|
||||
forman has presented a benchmark comparison of 12
|
||||
metrics on well known training sets [6]. according
|
||||
to forman, bns performed best by wide margin
|
||||
using 500 to 1000 features, while information gain
|
||||
outperforms the other metrics when the features
|
||||
vary between 20 and 50. accuracy 2 performed
|
||||
equally well as information gain. concerning the
|
||||
performance of chi-square, it was consistently
|
||||
worse than information gain. since there is no
|
||||
metric that performs constantly better than all
|
||||
others, researchers often combine two metrics in
|
||||
order to benefit from both metrics [6].
|
||||
novovicova et al. used sfs that took into
|
||||
account, not only the mutual information between a
|
||||
class and a word but also between a class and two
|
||||
words [22]. the results were slightly better.
|
||||
although machine learning based text
|
||||
classification is a good method as far as
|
||||
performance is concerned, it is inefficient for it to
|
||||
handle the very large training corpus. thus, apart
|
||||
from feature selection, many times instance
|
||||
selection is needed.
|
||||
c a class of the training set
|
||||
c the set of classes of the training set
|
||||
d a document of the training set
|
||||
d or db the set of documents of the training set
|
||||
t or w a term or word
|
||||
$p(c)$ or $p(c_i)$ the probability of the class $c$ or $c_i$ respectively, i.e. how often the class appears in the
|
||||
training set
|
||||
$p(\neg c)$ or $p(\bar{c})$ the probability of the class not occurring
|
||||
$p(c|t)$ the probability of the class $c$ given that the term $t$ appears; respectively, $p(\bar{c}|t)$
|
||||
denotes the probability of class c not occurring, given that the term t appears
|
||||
p(c,t) the probability of the class c and term t occurring simultaneously
|
||||
h(c) the entropy of the set c
|
||||
$df(t_k)$ the document frequency of term $t_k$
|
||||
$df_n(t)$ the frequency of term $t$ in documents containing $t$ in every one of their $n$ splits
|
||||
$\widetilde{df}(t)$ the document frequency, taking into consideration only documents in which $t$ appears
|
||||
more than once
|
||||
$\#(c)$ or $\#(t)$ the number of documents which belong to class $c$ or, respectively, contain the term $t$
|
||||
#(c,t) the number of documents containing term t and belong to class c
|
||||
table 2. symbolisms
|
||||
guan and zhou proposed a training-corpus
|
||||
pruning based approach to speedup the process [8].
|
||||
by using this approach, the size of training corpus
|
||||
can be reduced significantly while classification
|
||||
performance can be kept at a level close to that of
|
||||
without training documents pruning according to
|
||||
their experiments.
|
||||
fragoudis et al. [7] integrated feature and
|
||||
instance selection for text classification with even
|
||||
better results. their method works in two steps. in
|
||||
the first step, their method sequentially selects
|
||||
features that have high precision in predicting the
|
||||
target class. all documents that do not contain at
|
||||
least one such feature are dropped from the training
|
||||
set. in the second step, their method searches
|
||||
within this subset of the initial dataset for a set of
|
||||
features that tend to predict the complement of the
|
||||
target class and these features are also selected. the
|
||||
sum of the features selected during these two steps
|
||||
is the new feature set and the documents selected
|
||||
from the first step comprise the training set.
|
||||
4 feature transformation
|
||||
feature transformation varies significantly from
|
||||
feature selection approaches, but like them its
|
||||
purpose is to reduce the feature set size [10]. this
|
||||
approach does not weight terms in order to discard
|
||||
the lower weighted but compacts the vocabulary
|
||||
based on feature concurrencies.
|
||||
principal component analysis is a well known
|
||||
method for feature transformation [38]. its aim is to
|
||||
learn a discriminative transformation matrix in
|
||||
order to reduce the initial feature space into a lower
|
||||
dimensional feature space in order to reduce the
|
||||
complexity of the classification task without any
|
||||
trade-off in accuracy. the transform is derived
|
||||
from the eigenvectors corresponding to the dominant eigenvalues. the
|
||||
covariance matrix of data in pca corresponds to
|
||||
the document term matrix multiplied by its
|
||||
transpose. entries in the covariance matrix
|
||||
represent co-occurring terms in the documents.
|
||||
eigenvectors of this matrix corresponding to the
|
||||
dominant eigenvalues are now directions related to
|
||||
dominant combinations, which can be called "topics" or
|
||||
"semantic concepts". a transform matrix
|
||||
constructed from these eigenvectors projects a
|
||||
document onto these "latent semantic concepts",
|
||||
and the new low dimensional representation
|
||||
consists of the magnitudes of these projections. the
|
||||
eigenanalysis can be computed efficiently by a
|
||||
sparse variant of singular value decomposition of
|
||||
the document-term matrix [11].
|
||||
in the information retrieval community this
|
||||
method has been named latent semantic indexing
|
||||
(lsi) [23]. this approach is not intuitively
|
||||
discernible for a human but has a good
|
||||
performance.
|
||||
qiang et al [37] performed experiments using k-
|
||||
nn lsi, a new combination of the standard k-nn
|
||||
method on top of lsi, and applying a new matrix
|
||||
decomposition algorithm, semi-discrete matrix
|
||||
decomposition, to decompose the vector matrix.
|
||||
the experimental results showed that text
|
||||
categorization effectiveness in this space was better
|
||||
and it was also computationally less costly, because
|
||||
it needed a lower dimensional space.
|
||||
the authors of [4] present a comparison of the
|
||||
performance of a number of text categorization
|
||||
methods in two different data sets. in particular,
|
||||
they evaluate the vector and lsi methods, a
|
||||
classifier based on support vector machines
|
||||
(svm) and the k-nearest neighbor variations of
|
||||
the vector and lsi models. their results show that
|
||||
overall, svms and k-nn lsi perform better than
|
||||
the other methods, in a statistically significant way.
|
||||
5 machine learning algorithms
|
||||
after feature selection and transformation the
|
||||
documents can be easily represented in a form that
|
||||
can be used by a ml algorithm. many text
|
||||
classifiers have been proposed in the literature
|
||||
using machine learning techniques, probabilistic
|
||||
models, etc. they often differ in the approach
|
||||
adopted: decision trees, naive-bayes, rule
|
||||
induction, neural networks, nearest neighbors, and
|
||||
lately, support vector machines. although many
|
||||
approaches have been proposed, automated text
|
||||
classification is still a major area of research
|
||||
primarily because the effectiveness of current
|
||||
automated text classifiers is not faultless and still
|
||||
needs improvement.
|
||||
naive bayes is often used in text classification
|
||||
applications and experiments because of its
|
||||
simplicity and effectiveness [14]. however, its
|
||||
performance is often degraded because it does not
|
||||
model text well. schneider addressed the problems
|
||||
and showed that they can be solved by some simple
|
||||
corrections [24]. klopotek and woch presented
|
||||
results of empirical evaluation of a bayesian
|
||||
multinet classifier based on a new method of
|
||||
learning very large tree-like bayesian networks
|
||||
[15]. the study suggests that tree-like bayesian
|
||||
networks are able to handle a text classification
|
||||
task in one hundred thousand variables with
|
||||
sufficient speed and accuracy.
|
||||
support vector machines (svm), applied to
|
||||
text classification provide excellent precision, but
|
||||
poor recall. one means of customizing svms to
|
||||
improve recall, is to adjust the threshold associated
|
||||
with an svm. shanahan and roma described an
|
||||
automatic process for adjusting the thresholds of
|
||||
generic svm [26] with better results.
|
||||
johnson et al. described a fast decision tree
|
||||
construction algorithm that takes advantage of the
|
||||
sparsity of text data, and a rule simplification
|
||||
method that converts the decision tree into a
|
||||
logically equivalent rule set [9].
|
||||
lim proposed a method which improves
|
||||
performance of knn based text classification by
|
||||
using well estimated parameters [18]. some
|
||||
variants of the knn method with different decision
|
||||
functions, k values, and feature sets were proposed
|
||||
and evaluated to find out adequate parameters.
|
||||
corner classification (cc) network is a kind of
|
||||
feed forward neural network for instant document
|
||||
classification. a training algorithm, named as
|
||||
textcc is presented in [34].
|
||||
the level of difficulty of text classification tasks
|
||||
naturally varies. as the number of distinct classes
|
||||
increases, so does the difficulty, and therefore the
|
||||
size of the training set needed. in any multi-class
|
||||
text classification task, inevitably some classes will
|
||||
be more difficult than others to classify. reasons
|
||||
for this may be: (1) very few positive training
|
||||
examples for the class, and/or (2) lack of good
|
||||
predictive features for that class.
|
||||
when training a binary classifier per category in
|
||||
text categorization, we use all the documents in the
|
||||
training corpus that belong to that category as
|
||||
relevant training data and all the documents in the
|
||||
training corpus that belong to all the other
|
||||
categories as non-relevant training data. it is often
|
||||
the case that there is an overwhelming number of
|
||||
non relevant training documents especially when
|
||||
there is a large collection of categories with each
|
||||
assigned to a small number of documents, which is
|
||||
typically an "imbalanced data problem". this
|
||||
problem presents a particular challenge to
|
||||
classification algorithms, which can achieve high
|
||||
accuracy by simply classifying every example as
|
||||
negative. to overcome this problem, cost sensitive
|
||||
learning is needed [5].
|
||||
a scalability analysis of a number of classifiers
|
||||
in text categorization is given in [32]. vinciarelli
|
||||
presents categorization experiments performed over
|
||||
noisy texts [31]. by noisy it is meant any text
|
||||
obtained through an extraction process (affected by
|
||||
errors) from media other than digital texts (e.g.
|
||||
transcriptions of speech recordings extracted with a
|
||||
recognition system). the performance of the
|
||||
categorization system over the clean and noisy
|
||||
(word error rate between ~10 and ~50 percent)
|
||||
versions of the same documents is compared. the
|
||||
noisy texts are obtained through handwriting
|
||||
recognition and simulation of optical character
|
||||
recognition. the results show that the performance
|
||||
loss is acceptable.
|
||||
other authors [36] also proposed to parallelize
|
||||
and distribute the process of text classification.
|
||||
with such a procedure, the performance of
|
||||
classifiers can be improved in both accuracy and
|
||||
time complexity.
|
||||
recently in the area of machine learning the
|
||||
concept of combining classifiers is proposed as a
|
||||
new direction for the improvement of the
|
||||
performance of individual classifiers. numerous
|
||||
methods have been suggested for the creation of
|
||||
ensemble of classifiers. mechanisms that are used
|
||||
to build ensemble of classifiers include: i) using
|
||||
different subset of training data with a single
|
||||
learning method, ii) using different training
|
||||
parameters with a single training method (e.g. using
|
||||
different initial weights for each neural network in
|
||||
an ensemble), iii) using different learning methods.
|
||||
in the context of combining multiple classifiers
|
||||
for text categorization, a number of researchers
|
||||
have shown that combining different classifiers can
|
||||
improve classification accuracy [1], [29].
|
||||
comparing the best individual classifier
|
||||
and the combined method, it is observed that the
|
||||
performance of the combined method is superior
|
||||
[2]. nardiello et al. [21] also proposed algorithms
|
||||
in the family of "boosting"-based learners for
|
||||
automated text classification with good results.
|
||||
6 evaluation
|
||||
there are various methods to determine
|
||||
effectiveness; however, precision, recall, and
|
||||
accuracy are most often used. to determine these,
|
||||
one must first begin by understanding if the
|
||||
classification of a document was a true positive
|
||||
(tp), false positive (fp), true negative (tn), or
|
||||
false negative (fn) (see table 3).
|
||||
tp determined as a document being classified
|
||||
correctly as relating to a category.
|
||||
fp determined as a document that is said to be
|
||||
related to the category incorrectly.
|
||||
fn determined as a document that is not marked
|
||||
as related to a category but should be.
|
||||
tn documents that should not be marked as being
|
||||
in a particular category and are not.
|
||||
table 3. classification of a document
|
||||
precision ($\pi_i$) is determined as the conditional
|
||||
probability that a random document d is classified
|
||||
under $c_i$, or what would be deemed the correct
|
||||
category. it represents the classifier's ability to place
|
||||
a document as being under the correct category as
|
||||
opposed to all documents placed in that category,
|
||||
both correct and incorrect:
|
||||
$$\pi_i = \frac{tp_i}{tp_i + fp_i}$$
|
||||
recall ($\rho_i$) is defined as the probability that, if a
|
||||
random document $d_x$ should be classified under
|
||||
category ($c_i$), this decision is taken.
|
||||
$$\rho_i = \frac{tp_i}{tp_i + fn_i}$$
|
||||
accuracy is commonly used as a measure for
|
||||
categorization techniques. accuracy values,
|
||||
however, are much less sensitive to variations in
|
||||
the number of correct decisions than precision and
|
||||
recall:
|
||||
$$a_i = \frac{tp_i + tn_i}{tp_i + tn_i + fp_i + fn_i}$$
|
||||
many times there are very few instances of the
|
||||
interesting category in text categorization. this
|
||||
overrepresentation of the negative class in
|
||||
information retrieval problems can cause problems
|
||||
in evaluating classifiers' performances using
|
||||
accuracy. since accuracy is not a good metric for
|
||||
skewed datasets, the classification performance of
|
||||
algorithms in this case is measured by precision
|
||||
and recall [5].
|
||||
furthermore, precision and recall are often
|
||||
combined in order to get a better picture of the
|
||||
performance of the classifier. this is done by
|
||||
combining them in the following formula:
|
||||
$$f_\beta = \frac{(\beta^2 + 1)\,\pi\rho}{\beta^2\pi + \rho},$$
|
||||
where $\pi$ and $\rho$ denote precision and recall
|
||||
respectively. $\beta$ is a positive parameter, which
|
||||
represents the goal of the evaluation task. if
|
||||
precision is considered to be more important than
|
||||
recall, then the value of $\beta$ converges to zero. on the
|
||||
other hand, if recall is more important than
|
||||
precision then $\beta$ converges to infinity. usually $\beta$ is
|
||||
set to 1, because in this way equal importance is
|
||||
given to both precision and recall.
|
||||
reuters corpus volume i (rcv1) is an archive
|
||||
of over 800,000 manually categorized newswire
|
||||
stories recently made available by reuters, ltd. for
|
||||
research purposes [17]. using this collection, we
|
||||
can compare the learning algorithms.
|
||||
although research in the past years has shown
|
||||
that training corpus could impact classification
|
||||
performance, little work was done to explore the
|
||||
underlying causes. the authors of [35] try to
|
||||
propose an approach to build semi-automatically
|
||||
high-quality training corpuses for better
|
||||
classification performance by first exploring the
|
||||
properties of training corpuses, and then giving an
|
||||
algorithm for constructing training corpuses semi-automatically.
|
||||
7 conclusion
|
||||
the text classification problem is an artificial
|
||||
intelligence research topic, especially given the
|
||||
vast number of documents available in the form of
|
||||
web pages and other electronic texts like emails,
|
||||
discussion forum postings and other electronic
|
||||
documents.
|
||||
it has been observed that even for a specified
|
||||
classification method, classification performances
|
||||
of the classifiers based on different training text
|
||||
corpuses are different; and in some cases such
|
||||
differences are quite substantial. this observation
|
||||
implies that a) classifier performance is relevant to
|
||||
its training corpus in some degree, and b) good or
|
||||
high quality training corpuses may derive
|
||||
classifiers of good performance. unfortunately, up
|
||||
to now little research work in the literature has been
|
||||
seen on how to exploit training text corpuses to
|
||||
improve classifier's performance.
|
||||
some important conclusions have not been
|
||||
reached yet, including:
|
||||
• which feature selection methods are both
|
||||
computationally scalable and high-performing
|
||||
across classifiers and collections? given the
|
||||
high variability of text collections, do such
|
||||
methods even exist?
|
||||
• would combining uncorrelated, but well-performing
|
||||
methods yield a performance
|
||||
increase?
|
||||
• change the thinking from word frequency
|
||||
based vector space to concepts based vector
|
||||
space. study the methodology of feature
|
||||
selection under concepts, to see if these will
|
||||
help in text categorization.
|
||||
• make the dimensionality reduction more
|
||||
efficient over large corpus.
|
||||
moreover, there are two other open problems in
|
||||
text mining: polysemy, synonymy. polysemy refers
|
||||
to the fact that a word can have multiple meanings.
|
||||
distinguishing between different meanings of a
|
||||
word (called word sense disambiguation) is not
|
||||
easy, often requiring the context in which the word
|
||||
appears. synonymy means that different words can
|
||||
have the same or similar meaning.
|
||||
references:
|
||||
[1] bao y. and ishii n., "combining multiple knn
|
||||
classifiers for text categorization by
|
||||
reducts", lncs 2534, 2002, pp. 340-347
|
||||
[2] bi y., bell d., wang h., guo g., greer k.,
|
||||
"combining multiple classifiers using
|
||||
dempster's rule of combination for text
|
||||
categorization", mdai, 2004, 127-138.
|
||||
[3] brank j., grobelnik m., milic-frayling n.,
|
||||
mladenic d., "interaction of feature selection
|
||||
methods and linear classification models",
|
||||
proc. of the 19th international conference on
|
||||
machine learning, australia, 2002.
|
||||
[4] ana cardoso-cachopo, arlindo l. oliveira, an
|
||||
empirical comparison of text categorization
|
||||
methods, lecture notes in computer science,
|
||||
volume 2857, jan 2003, pages 183 - 196
|
||||
[5] chawla, n. v., bowyer, k. w., hall, l. o.,
|
||||
kegelmeyer, w. p., "smote: synthetic
|
||||
minority over-sampling technique," journal
|
||||
of ai research, 16 2002, pp. 321-357.
|
||||
[6] forman, g., an experimental study of feature
|
||||
selection metrics for text categorization.
|
||||
journal of machine learning research, 3 2003,
|
||||
pp. 1289-1305
|
||||
[7] fragoudis d., meretakis d., likothanassis s.,
|
||||
"integrating feature and instance selection for
|
||||
text classification", sigkdd '02, july 23-26,
|
||||
2002, edmonton, alberta, canada.
|
||||
[8] guan j., zhou s., "pruning training corpus to
|
||||
speedup text classification", dexa 2002, pp.
|
||||
831-840
|
||||
[9] d. e. johnson, f. j. oles, t. zhang, t. goetz,
|
||||
"a decision-tree-based symbolic rule induction
|
||||
system for text categorization", ibm systems
|
||||
journal, september 2002.
|
||||
[10] han x., zu g., ohyama w., wakabayashi
|
||||
t., kimura f., accuracy improvement of
|
||||
automatic text classification based on
|
||||
feature transformation and multi-classifier
|
||||
combination, lncs, volume 3309, jan 2004,
|
||||
pp. 463-468
|
||||
[11] ke h., shaoping m., "text categorization
|
||||
based on concept indexing and principal
|
||||
component analysis", proc. tencon 2002
|
||||
conference on computers, communications,
|
||||
control and power engineering, 2002, pp. 51-
|
||||
56.
|
||||
[12] kehagias a., petridis v., kaburlasos v.,
|
||||
fragkou p., "a comparison of word- and
|
||||
sense-based text categorization using
|
||||
several classification algorithms", jiis,
|
||||
volume 21, issue 3, 2003, pp. 227-247.
|
||||
[13] b. kessler, g. nunberg, and h. schutze.
|
||||
automatic detection of text genre. in
|
||||
proceedings of the thirty-fifth acl and
|
||||
eacl, pages 32-38, 1997.
|
||||
[14] kim s. b., rim h. c., yook d. s. and lim
|
||||
h. s., "effective methods for improving naive
|
||||
bayes text classifiers", lnai 2417, 2002, pp.
|
||||
414-423
|
||||
[15] klopotek m. and woch m., "very large
|
||||
bayesian networks in text classification",
|
||||
iccs 2003, lncs 2657, 2003, pp. 397-406
|
||||
[16] leopold, edda & kindermann, jörg, "text
|
||||
categorization with support vector machines.
|
||||
how to represent texts in input space?",
|
||||
machine learning 46, 2002, pp. 423 - 444.
|
||||
[17] lewis d., yang y., rose t., li f., "rcv1:
|
||||
a new benchmark collection for text
|
||||
categorization research", journal of machine
|
||||
learning research 5, 2004, pp. 361-397.
|
||||
[18] heui lim, improving knn based text
|
||||
classification with well estimated parameters,
|
||||
lncs, vol. 3316, oct 2004, pages 516 - 523.
|
||||
[19] madsen r. e., sigurdsson s., hansen l. k.
|
||||
and lansen j., <20>pruning the vocabulary for
|
||||
better context recognition<6F>, 7th international
|
||||
conference on pattern recognition, 2004
|
||||
[20] montanes e., quevedo j. r. and diaz i.,
|
||||
<EFBFBD>a wrapper approach with support vector
|
||||
machines for text categorization<6F>, lncs
|
||||
2686, 2003, pp. 230-237
|
||||
[21] nardiello p., sebastiani f., sperduti a.,
|
||||
<EFBFBD>discretizing continuous attributes in
|
||||
adaboost for text categorization<6F>, lncs,
|
||||
volume 2633, jan 2003, pp. 320-334
|
||||
[22] novovicova j., malik a., and pudil p.,
|
||||
<EFBFBD>feature selection using improved mutual
|
||||
information for text classification<6F>,
|
||||
sspr&spr 2004, lncs 3138, pp. 1010<31>
|
||||
1017, 2004
|
||||
[23] qiang w., xiaolong w., yi g., <20>a study
|
||||
of semi-discrete matrix decomposition for lsi
|
||||
in automated text categorization<6F>, lncs,
|
||||
volume 3248, jan 2005, pp. 606-615.
|
||||
[24] schneider, k., techniques for improving
|
||||
the performance of naive bayes for text
|
||||
classification, lncs, vol. 3406, 2005, 682-
|
||||
693.
|
||||
[25] sebastiani f., <20>machine learning in
|
||||
automated text categorization<6F>, acm
|
||||
computing surveys, vol. 34 (1),2002, pp. 1-47.
|
||||
[26] shanahan j. and roma n., improving svm
|
||||
text classification performance through
|
||||
threshold adjustment, lnai 2837, 2003, 361-
|
||||
372
|
||||
[27] soucy p. and mineau g., <20>feature
|
||||
selection strategies for text categorization<6F>,
|
||||
ai 2003, lnai 2671, 2003, pp. 505-509
|
||||
[28] sousa p., pimentao j. p., santos b. r. and
|
||||
moura-pires f., <20>feature selection algorithms
|
||||
to improve documents classification
|
||||
performance<EFBFBD>, lnai 2663, 2003, pp. 288-296
|
||||
[29] sung-bae cho, jee-haeng lee, learning
|
||||
neural network ensemble for practical text
|
||||
classification, lecture notes in computer
|
||||
science, volume 2690, aug 2003, pages 1032
|
||||
<EFBFBD> 1036.
|
||||
[30] torkkola k., <20>discriminative features for
|
||||
text document classification<6F>, proc.
|
||||
international conference on pattern
|
||||
recognition, canada, 2002.
|
||||
[31] vinciarelli a., <20>noisy text categorization,
|
||||
pattern recognition<6F>, 17th international
|
||||
conference on (icpr'04) , 2004, pp. 554-557
|
||||
[32] y. yang, j. zhang and b. kisiel., <20>a
|
||||
scalability analysis of classifiers in text
|
||||
categorization<EFBFBD>, acm sigir'03, 2003, pp 96-
|
||||
103
|
||||
[33] y. yang. an evaluation of statistical
|
||||
approaches to text categorization. journal of
|
||||
information retrieval, 1(1/2):67–88, 1999.
|
||||
[34] zhenya zhang, shuguang zhang, enhong
|
||||
chen, xufa wang, hongmei cheng, textcc:
|
||||
new feed forward neural network for
|
||||
classifying documents instantly, lecture
|
||||
notes in computer science, volume 3497, jan
|
||||
2005, pages 232 <20> 237.
|
||||
[35] shuigeng zhou, jihong guan, evaluation
|
||||
and construction of training corpuses for text
|
||||
classification: a preliminary study, lecture
|
||||
notes in computer science, volume 2553, jan
|
||||
2002, page 97-108.
|
||||
[36] verayuth lertnattee, thanaruk
|
||||
theeramunkong, parallel text categorization
|
||||
for multi-dimensional data, lecture notes in
|
||||
computer science, volume 3320, jan 2004,
|
||||
pages 38 - 41
|
||||
[37] wang qiang, wang xiaolong, guan yi, a
|
||||
study of semi-discrete matrix decomposition
|
||||
for lsi in automated text categorization,
|
||||
lecture notes in computer science, volume
|
||||
3248, jan 2005, pages 606 <20> 615.
|
||||
[38] zu g., ohyama w., wakabayashi t.,
|
||||
kimura f., "accuracy improvement of
|
||||
automatic text classification based on feature
|
||||
transformation": proc: the 2003 acm
|
||||
symposium on document engineering,
|
||||
november 20-22, 2003, pp.118-120
|
||||
30
texte_3.txt
Normal file
30
texte_3.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
Trattamenti statistici dei dati testuali (L. Lebart, CNRS-ENST; lebart@enst.fr)
|
||||
Il materiale statistico « testo » è onnipresente, quasi banale, sin dallo sviluppo di Internet e del web. Lo studio quantitativo e statistico di questi testi sembra essere apparso di recente, eppure gli studi statistici sui testi risalgono a diversi decenni fa, in particolare in Francia con i lavori di P. Guiraud (« Problemi e metodi di statistica linguistica », PUF, 1960), C. Muller (« Principi e metodi di statistica lessicale », Hachette, 1977) e successivamente J.P. Benzécri (« Pratica dell'analisi dei dati, vol. 3: Linguistica e lessicologia », Dunod, 1981).
|
||||
Dopo la <20> stilometria <20>, dedicata allo studio della forma dei testi, al fine di identificare un autore o di datare un'opera, sono apparse le tecniche di documentazione automatica (information retrieval in inglese), che mirano a ricercare in un database di documenti (articoli scientifici, riassunti, brevetti, ecc.) gli elementi pertinenti a partire da una richiesta espressa sotto forma di testo libero. Il campo disciplinare <20> trattamento del linguaggio naturale <20> <20> poi emerso e si <20> sviluppato inizialmente come uno dei campi di applicazione privilegiati dell'intelligenza artificiale. La complessit<69> del materiale, la necessit<69> di assimilare enormi corpus di testi, la rilevanza del concetto di apprendimento hanno naturalmente aperto questo campo ai metodi statistici. La statistica multidimensionale, le catene di Markov nascoste, i metodi di analisi discriminante intervengono quindi per costruire gli strumenti di base che sono i motori di ricerca sul web, gli analizzatori morfosintattici, i correttori ortografici, nonch<63> in campi applicativi pratici come il trattamento delle risposte alle domande aperte nelle indagini socio-economiche.
|
||||
Le domande aperte
|
||||
In un certo numero di situazioni di indagine, è utile lasciare aperte alcune domande, le cui risposte si presenteranno quindi sotto forma di testi di lunghezza variabile.
|
||||
La raccolta dei dati
|
||||
In almeno tre situazioni comuni, l'uso di domande aperte è necessario:
|
||||
Per ridurre o ottimizzare la durata dell'intervista di indagine: Sebbene le risposte libere e quelle guidate forniscano informazioni di natura diversa, le prime sono più economiche in termini di tempo di intervista e generano meno stanchezza. Una semplice domanda aperta (ad esempio: « Quali sono state le tue principali attività domenica scorsa? ») può sostituire lunghe liste di elementi.
|
||||
Come complemento a domande chiuse: Di solito si tratta della domanda « Perché? ». Le spiegazioni riguardanti una risposta già data devono necessariamente essere spontanee. Un elenco di elementi potrebbe suggerire nuovi argomenti che potrebbero compromettere l'autenticità dell'argomentazione.
|
||||
Per raccogliere informazioni che devono essere spontanee per loro natura: I questionari delle indagini di marketing abbondano di domande di questo tipo. Esempi includono: « Cosa ricordi di questa campagna pubblicitaria? » oppure « Cosa pensi di questa auto? ».
|
||||
Unità statistiche
|
||||
I programmi lavorano a partire dal testo grezzo, estraendo automaticamente delle unità statistiche, per lo più forme grafiche (sequenze di caratteri non separatori). Si usa il termine forma grafica perché la parola « parola » è ambigua. Può infatti riferirsi all'occorrenza di una parola, al tipo, oppure al lemma (ad esempio, « avere » è il lemma di « aveva »).
|
||||
nel caso dell<6C>esempio precedente per 1009 risposte si ottengono 14337 occorrenze di 1394 forme distinte (o tipi) <20> ben noto che la distribuzione di frequenza delle parole <20> molto asimmetrica (legge di zipf, simile alla distribuzione di pareto) cos<6F> selezionando solo le forme che appaiono almeno 20 volte rimane un testo di 10994 forme con solo 97 forme distinte (cos<6F> il 7% delle parole distinte corrisponde al 77% del testo totale) in particolare quasi la met<65> delle forme grafiche distinte appare una sola volta (queste sono gli <20> hapax <20>)
|
||||
il post-codifica
|
||||
il pretrattamento empirico chiamato <20> post-codifica <20> permette di chiudere a posteriori le domande aperte questa tecnica comune consiste nel costruire una serie di elementi a partire da un sotto-campione di risposte per poi codificare tutte le risposte in modo da sostituire la domanda aperta con una o pi<70> domande chiuse per l<>esempio sopra la seconda risposta la pi<70> semplice darebbe gli elementi <20> lettura <20> <20> viaggi <20> <20> tempo libero <20> a condizione che questi elementi appaiano con una certa frequenza nel campione di risposte tuttavia la prima risposta <20> pi<70> difficile da post-codificare
|
||||
gli strumenti statistici di base
|
||||
gli strumenti di base comprendono la selezione di forme caratteristiche la selezione di risposte modali l'analisi delle corrispondenze e la classificazione delle tabelle lessicali
|
||||
forme o segmenti caratteristici (o specificità)
|
||||
le forme caratteristiche sono le forme <20> anormalmente <20> frequenti nelle risposte di un gruppo di individui (tecnica proposta da p lafon nel 1980) un test elementare basato sulla legge ipergeometrica permette di selezionare le parole (forme grafiche o lemmi) la cui frequenza in un gruppo <20> significativamente superiore (o inferiore per le parole anti-caratteristiche) alla frequenza media nel corpus si tratta di test classici di confronto delle frequenze ma la ripetizione di questo test porta a prendere soglie di significativit<69> molto rigide (fenomeno di confronti multipli ben noto agli statistici)
|
||||
nell<EFBFBD>esempio citato sopra la frequenza media della parola lavoro nel corpus era del 3,4%; per il gruppo delle donne oltre i 55 anni la frequenza <20> solo dell<6C>1,2% questa differenza <20> altamente significativa (si pu<70> esprimere il test di confronto delle frequenze in termini di scarti standard nella ipotesi di omogeneit<69> delle frequenze il valore del 1,2% <20> a 4,5 scarti standard dal valore medio del 3,4%) poich<63> si tratta di una frequenza anormalmente bassa si parler<65> di parole anti-caratteristiche
|
||||
le selezioni delle risposte modali
|
||||
per un gruppo di individui e quindi per il raggruppamento delle risposte corrispondenti le risposte modali (o frasi caratteristiche o documenti-tipo la terminologia varia a seconda dei campi di applicazione) sono risposte originali del corpus di base che caratterizzano meglio il gruppo si pu<70> per ogni raggruppamento calcolare la distanza del profilo lessicale di un individuo dal profilo lessicale medio del gruppo poi si possono ordinare le distanze in ordine crescente e selezionare le risposte pi<70> rappresentative in termini di profilo lessicale che corrisponderanno alle distanze minori si ottiene cos<6F> una sorta di sintesi delle risposte di ogni gruppo costituita da risposte originali (l lebart e a salem statistica testuale dunod 1994) sempre nel caso del nostro esempio <20> essere felice avere un buon lavoro successo professionale e familiare <20> <20> una risposta caratteristica dei giovani uomini <20> la salute la famiglia <20> <20> una risposta che caratterizza le persone pi<70> anziane in pratica si utilizzano pi<70> risposte caratteristiche per ogni gruppo
|
||||
analisi delle corrispondenze e classificazione
|
||||
il volume dei dati richiede l'uso di potenti strumenti di descrizione i metodi di analisi delle corrispondenze e di classificazione possono descrivere le tabelle di contingenza che incrociano le risposte con le forme grafiche o gruppi di risposte (ad esempio raggruppamenti in base al livello di istruzione dei rispondenti) e le forme grafiche questi strumenti permettono di visualizzare sotto forma di serie di mappe piane (o dendrogrammi nel caso dei metodi di classificazione o mappe auto-organizzate di kohonen metodo <20> neurale <20> di visualizzazione) le associazioni tra parole (forme) e gruppi o modalit<69> cos<6F> una visualizzazione delle prossimit<69> tra parole e categorie socio-professionali pu<70> aiutare a leggere le risposte di ciascuna di queste categorie
|
||||
conclusioni e prospettive
|
||||
per risposte semplici e stereotipate come abbiamo visto le procedure di post-codifica possono funzionare tuttavia tra i difetti di questo tipo di trattamento si possono menzionare:
|
||||
la mediazione del codificatore: le decisioni da prendere sono talvolta difficili
|
||||
la qualità dell'espressione il registro del vocabolario la tonalità generale dell'intervista sono elementi di analisi persi durante la post-codifica (bisogna codificare in modo diverso « non lo so » e « preferisco non dire nulla »?)
|
||||
le risposte composite complesse e molto diverse sono difficili da post-codificare ed è spesso in questi casi che il valore euristico delle risposte libere è maggiore
|
||||
le risposte poco frequenti originali e poco chiare a una prima lettura sono considerate come <20> rumore <20> e assegnate a categorie residuali (<28> altre <20>) che sono quindi molto eterogenee e difficili da gestire senza che sia necessario procedere a una post-codifica attualmente <20> possibile a partire da un insieme di testi e da una soglia di frequenza per le forme grafiche ottenere una visualizzazione delle prossimit<69> tra testi in base ai loro profili lessicali e tra forme grafiche in base alla loro distribuzione nei testi l'arricchimento delle unit<69> statistiche con segmenti ripetuti cf a salem pratica dei segmenti ripetuti klincksieck 1987 i loro raggruppamenti per categorizzazione morfologica l'utilizzo delle forme caratteristiche o specificit<69> l'aggiunta delle risposte modali o delle frasi o unit<69> di contesto caratteristiche hanno perfezionato questi approcci e messo a disposizione di molti utenti metodi e software utili in alcuni specifici ambiti applicativi come il trattamento automatico delle risposte alle domande aperte che ci interessa qui l'efficacia del metodo come complemento alle approcci tradizionali <20> riconosciuta parallelamente ai lavori dell'industria della lingua che abbiamo menzionato in precedenza e che fanno parte di un'ingegneria statistica complessa esistono quindi applicazioni testuali della statistica a portata di mano richiedono sicuramente software specifici ma la natura familiare e viva del materiale di base compensa in qualche modo la relativa complessit<69> dei trattamenti e le difficolt<6C> di interpretazione vicino alle basi di dati all'intelligenza artificiale e alle reti neurali alla teoria dell'apprendimento alle tecniche recenti di estrazione e gestione della conoscenza il dominio testuale illustra bene la polivalenza e la potenza della metodologia statistica anche quando i metodi assumono nomi pi<70> esotici come text mining o text mining il lavoro dello statistico <20> sempre necessario quando si tratta di 
conoscere la portata reale dei fatti osservati e dei tratti strutturali ottenuti di sapere cosa si pu<70> affermare e cosa non si deve dire ovvero di dare uno statuto scientifico ai risultati
|
||||
8
utils/__init__.py
Normal file
8
utils/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
This module contains utility functions.
|
||||
"""
|
||||
|
||||
|
||||
def normalize_probabilities(prob_fr: float, prob_en: float, prob_it: float, searched: float) -> float:
    """Return ``searched`` as a fraction of the sum of the three probabilities.

    Args:
        prob_fr: First score entering the normalization total (presumably the
            French-language probability, judging by the parameter name).
        prob_en: Second score entering the total (presumably English).
        prob_it: Third score entering the total (presumably Italian).
        searched: The value to normalize; typically one of the three scores
            above, though the function does not enforce this.

    Returns:
        ``searched / (prob_fr + prob_en + prob_it)``, or ``0.0`` when the
        total is zero.
    """
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = prob_fr + prob_en + prob_it
    if total == 0:
        # With every score at zero (e.g. after floating-point underflow) no
        # ratio is defined; report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    return searched / total
|
||||
BIN
utils/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
utils/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
Reference in New Issue
Block a user