Continue TP2

This commit is contained in:
flyingscorpio@clevo 2023-01-26 16:04:33 +01:00
parent b10b55ec7e
commit 0260edcdf0
3 changed files with 497 additions and 4654 deletions

File diff suppressed because one or more lines are too long

View file

@ -1,9 +1,17 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8ec58a20",
"metadata": {},
"source": [
"# TP2 - Construction d'un modèle de Machine Learning pour la détection de Spam"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a515623",
"id": "6caf133b",
"metadata": {},
"outputs": [],
"source": [
@ -11,19 +19,17 @@
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import plotly.express as px\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Activation, Dropout\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeRegressor"
"from sklearn.preprocessing import StandardScaler, LabelEncoder"
]
},
{
"cell_type": "markdown",
"id": "50e0e80b",
"metadata": {},
"source": [
"## 1 Importer le dataset Spam"
]
},
{
@ -33,25 +39,24 @@
"metadata": {},
"outputs": [],
"source": [
"# Lecture du fichier\n",
"train = pd.read_csv('Spam/training.csv',delimiter=';')"
"# The two files use different separators: ';' for training, ',' for testing.\n",
"training_set = pd.read_csv('Spam/training.csv', sep=';')\n",
"testing_set = pd.read_csv('Spam/testing.csv', sep=',')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b53d403d",
"cell_type": "markdown",
"id": "b2e9a3e4",
"metadata": {},
"outputs": [],
"source": [
"# Affichage d'informations sur le fichier\n",
"\n",
"#print(train.columns.values) \n",
"#print(train.describe())\n",
"#print(train.head())\n",
"#print(train.tail())\n",
"#print(train.info())\n",
"#print(train.isnull().sum())"
"## 2 Préparer les données afin de les exploiter par des modèles de Machine Learning de type classifieur."
]
},
{
"cell_type": "markdown",
"id": "3633d598",
"metadata": {},
"source": [
"#### Renommage de la colonne sur laquelle va porter la prédiction"
]
},
{
@ -61,9 +66,196 @@
"metadata": {},
"outputs": [],
"source": [
"# On renomme la colonne sur laquelle va porter la prédiction\n",
"train.rename(columns={\"GOAL-Spam\": \"GOAL_Spam\"}, inplace=True)\n",
"print(train.head())"
"# Give the target column the same name in both sets\n",
"# (it is called 'GOAL-Spam' in one file and 'Spam' in the other).\n",
"training_set = training_set.rename(columns={\"GOAL-Spam\": \"is_spam\"})\n",
"testing_set = testing_set.rename(columns={\"Spam\": \"is_spam\"})"
]
},
{
"cell_type": "markdown",
"id": "9fa65851",
"metadata": {},
"source": [
"#### Encodage des labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac4057a3",
"metadata": {},
"outputs": [],
"source": [
"# Encode the target labels of both sets as integers.\n",
"# NOTE(review): the encoder is re-fitted on the test labels, so the integer codes only\n",
"# line up between the two sets if both columns hold the same label values - TODO confirm.\n",
"label_quality = LabelEncoder()\n",
"for dataset in (training_set, testing_set):\n",
"    dataset['is_spam'] = label_quality.fit_transform(dataset['is_spam'])"
]
},
{
"cell_type": "markdown",
"id": "e010ab77",
"metadata": {},
"source": [
"#### Séparation des données : données d'entrée et labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d87bcc11",
"metadata": {},
"outputs": [],
"source": [
"# Features are every column except 'is_spam'; the target is the 'is_spam' column itself.\n",
"X_train, y_train = training_set.drop(columns='is_spam'), training_set['is_spam']\n",
"X_test, y_test = testing_set.drop(columns='is_spam'), testing_set['is_spam']"
]
},
{
"cell_type": "markdown",
"id": "fe23db53",
"metadata": {},
"source": [
"#### Application d'une fonction d'optimisation : standardisation des données (StandardScaler)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a1734cb",
"metadata": {},
"outputs": [],
"source": [
"# Standardise the features. The scaler is fitted on the training data only and the same\n",
"# mean/variance is then applied to the test data: re-fitting on the test set would scale it\n",
"# with different statistics than the ones the model is trained on.\n",
"# transform() expects X_test to have the same columns, in the same order, as X_train.\n",
"sc = StandardScaler()\n",
"X_train = sc.fit_transform(X_train)\n",
"X_test = sc.transform(X_test)"
]
},
{
"cell_type": "markdown",
"id": "6020a8ec",
"metadata": {},
"source": [
"## 3 - Construire des modèles à partir des méthodes suivantes :"
]
},
{
"cell_type": "markdown",
"id": "eb17b81c",
"metadata": {},
"source": [
"### Régression logistique"
]
},
{
"cell_type": "markdown",
"id": "8b499bf0",
"metadata": {},
"source": [
"#### Entraînement"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "867c6287",
"metadata": {},
"outputs": [],
"source": [
"# Fit a logistic regression with default hyper-parameters on X_train / y_train.\n",
"model = LogisticRegression()\n",
"model.fit(X=X_train, y=y_train)"
]
},
{
"cell_type": "markdown",
"id": "622f398f",
"metadata": {},
"source": [
"#### Prédiction sur les données de test"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ab10386",
"metadata": {},
"outputs": [],
"source": [
"# Predicted class for every row of the test features.\n",
"y_predict = model.predict(X=X_test)"
]
},
{
"cell_type": "markdown",
"id": "4c7ed9ea",
"metadata": {},
"source": [
"#### Qualité de la prédiction"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a4dc2ea",
"metadata": {},
"outputs": [],
"source": [
"# Overall accuracy as a percentage, followed by the per-class classification report.\n",
"acc = 100 * accuracy_score(y_true=y_test, y_pred=y_predict)\n",
"print(f\"Précision de la régression logistique : {acc:.2f}%\")\n",
"rapport_classification = classification_report(y_true=y_test, y_pred=y_predict)\n",
"print(rapport_classification)"
]
},
{
"cell_type": "markdown",
"id": "a855b4e7",
"metadata": {},
"source": [
"#### Matrice de confusion"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57980c5e",
"metadata": {},
"outputs": [],
"source": [
"# Rows of the matrix are the true classes, columns are the predicted classes.\n",
"matrice_confusion = confusion_matrix(y_test, y_predict)\n",
"print(f\"Confusion de la classe 0 avec la classe 1 : {matrice_confusion[0, 1]} fois\")\n",
"print(f\"Confusion de la classe 1 avec la classe 0 : {matrice_confusion[1, 0]} fois\")\n",
"# fmt='d' prints the counts as plain integers; the default format switches to\n",
"# scientific notation for larger values.\n",
"sns.heatmap(\n",
"    matrice_confusion,\n",
"    annot=True,\n",
"    fmt='d',\n",
"    xticklabels=['No', 'Yes'],\n",
"    yticklabels=['No', 'Yes'],\n",
")\n",
"plt.xlabel('Classe prédite')\n",
"plt.ylabel('Classe réelle')\n",
"# To change the figure size, call plt.figure(figsize=(3, 3)) BEFORE sns.heatmap();\n",
"# calling it afterwards only opens a second, empty figure.\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "603b2d0b",
"metadata": {},
"source": [
"### Réseaux de neurones"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a515623",
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Activation, Dropout\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeRegressor"
]
},
{
@ -134,115 +326,6 @@
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac4057a3",
"metadata": {},
"outputs": [],
"source": [
"# transformation des labels par encodage\n",
"c = pd.Categorical(train['GOAL_Spam'])\n",
"print(c)\n",
"label_quality = LabelEncoder() \n",
"train['GOAL_Spam'] = label_quality.fit_transform(train['GOAL_Spam'])\n",
"print(train['GOAL_Spam'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d87bcc11",
"metadata": {},
"outputs": [],
"source": [
"# Séparation des données: Données d'entrainement et Labels\n",
"X = train.drop('GOAL_Spam', axis=1)\n",
"y = train['GOAL_Spam']\n",
"print(X)\n",
"print(y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2368680f",
"metadata": {},
"outputs": [],
"source": [
"# Séparer les données d'entraînement (80%) et les données de test (20%)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.2, random_state=42\n",
")\n",
"\n",
"#Appliquer une fonction d'optimisation de cette séparation\n",
"sc = StandardScaler()\n",
"X_train = sc.fit_transform(X_train)\n",
"X_test = sc.fit_transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "867c6287",
"metadata": {},
"outputs": [],
"source": [
"# 1 - Entrainement par une Regression Logistique\n",
"model = LogisticRegression()\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ab10386",
"metadata": {},
"outputs": [],
"source": [
"# Mesure de la précision du modèle\n",
"\n",
"# Prédictions sur les données de test\n",
"y_predict = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a4dc2ea",
"metadata": {},
"outputs": [],
"source": [
"# Edition d'un rapport sur la qualité de la prédiction\n",
"acc = format(accuracy_score(y_test, y_predict * 100))\n",
"print(acc)\n",
"rapport_classification = classification_report(y_test, y_predict)\n",
"#print(rapport_classification)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57980c5e",
"metadata": {},
"outputs": [],
"source": [
"# Affichage de la matrice de confusion\n",
"matrice_confusion = confusion_matrix(y_test, y_predict)\n",
"print(\n",
" \"Confusion de la classe 0 avec la classe 1 :\",\n",
" matrice_confusion[0, 1]\n",
")\n",
"sns.heatmap(\n",
" matrice_confusion,\n",
" annot=True,\n",
" xticklabels=['No', 'Yes'],\n",
" yticklabels=['No', 'Yes'],\n",
")\n",
"plt.figure(figsize=(3, 3))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,