Continue TP2
This commit is contained in:
parent
b10b55ec7e
commit
0260edcdf0
3 changed files with 497 additions and 4654 deletions
File diff suppressed because one or more lines are too long
|
@ -1,9 +1,17 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8ec58a20",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# TP2 - Construction d'un modèle de Machine Learning pour la détection de Spam"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a515623",
|
||||
"id": "6caf133b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -11,19 +19,17 @@
|
|||
"import matplotlib.pyplot as plt\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"import plotly.express as px\n",
|
||||
"from keras.models import Sequential\n",
|
||||
"from keras.layers import Dense, Activation, Dropout\n",
|
||||
"from sklearn.linear_model import SGDClassifier\n",
|
||||
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.svm import SVC\n",
|
||||
"from sklearn.tree import DecisionTreeRegressor"
|
||||
"from sklearn.preprocessing import StandardScaler, LabelEncoder"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "50e0e80b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1 – Importer le dataset Spam"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -33,25 +39,24 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Lecture du fichier\n",
|
||||
"train = pd.read_csv('Spam/training.csv',delimiter=';')"
|
||||
"training_set = pd.read_csv('Spam/training.csv', delimiter=';')\n",
|
||||
"testing_set = pd.read_csv('Spam/testing.csv', delimiter=',')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b53d403d",
|
||||
"cell_type": "markdown",
|
||||
"id": "b2e9a3e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Affichage d'informations sur le fichier\n",
|
||||
"\n",
|
||||
"#print(train.columns.values) \n",
|
||||
"#print(train.describe())\n",
|
||||
"#print(train.head())\n",
|
||||
"#print(train.tail())\n",
|
||||
"#print(train.info())\n",
|
||||
"#print(train.isnull().sum())"
|
||||
"## 2 – Préparer les données afin de les exploiter par des modèles de Machines learning de type classifieur."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3633d598",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Renommage de la colonne sur laquelle va porter la prédiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -61,9 +66,196 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# On renomme la colonne sur laquelle va porter la prédiction\n",
|
||||
"train.rename(columns={\"GOAL-Spam\": \"GOAL_Spam\"}, inplace=True)\n",
|
||||
"print(train.head())"
|
||||
"training_set.rename(columns={\"GOAL-Spam\": \"is_spam\"}, inplace=True)\n",
|
||||
"testing_set.rename(columns={\"Spam\": \"is_spam\"}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fa65851",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Encodage des labels"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac4057a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"label_quality = LabelEncoder()\n",
|
||||
"training_set['is_spam'] = label_quality.fit_transform(training_set['is_spam'])\n",
|
||||
"testing_set['is_spam'] = label_quality.fit_transform(testing_set['is_spam'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e010ab77",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Séparation des données : données d'entrée et labels"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d87bcc11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train = training_set.drop('is_spam', axis=1)\n",
|
||||
"y_train = training_set['is_spam']\n",
|
||||
"X_test = testing_set.drop('is_spam', axis=1)\n",
|
||||
"y_test = testing_set['is_spam']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fe23db53",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Application d'une fonction d'optimisation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a1734cb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sc = StandardScaler()\n",
|
||||
"X_train = sc.fit_transform(X_train)\n",
|
||||
"X_test = sc.fit_transform(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6020a8ec",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3 - Construire des modèles à base des méthodes suivantes:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb17b81c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regression Logistique"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8b499bf0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Entraînement"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "867c6287",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = LogisticRegression()\n",
|
||||
"model.fit(X_train, y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "622f398f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Prédiction sur les données de test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ab10386",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_predict = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4c7ed9ea",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Qualité de la prédiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a4dc2ea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc = accuracy_score(y_test, y_predict) * 100\n",
|
||||
"print(f\"Précision de la régression logistique : {acc:.2f}%\")\n",
|
||||
"rapport_classification = classification_report(y_test, y_predict)\n",
|
||||
"print(rapport_classification)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a855b4e7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Matrice de confusion"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57980c5e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"matrice_confusion = confusion_matrix(y_test, y_predict)\n",
|
||||
"print(f\"Confusion de la classe 0 avec la classe 1 : {matrice_confusion[0, 1]} fois\")\n",
|
||||
"print(f\"Confusion de la classe 0 avec la classe 1 : {matrice_confusion[1, 0]} fois\")\n",
|
||||
"sns.heatmap(\n",
|
||||
" matrice_confusion,\n",
|
||||
" annot=True,\n",
|
||||
" xticklabels=['No', 'Yes'],\n",
|
||||
" yticklabels=['No', 'Yes'],\n",
|
||||
")\n",
|
||||
"#plt.figure(figsize=(3, 3)) # TODO: what's this?\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "603b2d0b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Réseaux de neurones"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a515623",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import plotly.express as px\n",
|
||||
"from keras.models import Sequential\n",
|
||||
"from keras.layers import Dense, Activation, Dropout\n",
|
||||
"from sklearn.linear_model import SGDClassifier\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.svm import SVC\n",
|
||||
"from sklearn.tree import DecisionTreeRegressor"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -134,115 +326,6 @@
|
|||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac4057a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# transformation des labels par encodage\n",
|
||||
"c = pd.Categorical(train['GOAL_Spam'])\n",
|
||||
"print(c)\n",
|
||||
"label_quality = LabelEncoder() \n",
|
||||
"train['GOAL_Spam'] = label_quality.fit_transform(train['GOAL_Spam'])\n",
|
||||
"print(train['GOAL_Spam'].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d87bcc11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Séparation des données: Données d'entrainement et Labels\n",
|
||||
"X = train.drop('GOAL_Spam', axis=1)\n",
|
||||
"y = train['GOAL_Spam']\n",
|
||||
"print(X)\n",
|
||||
"print(y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2368680f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Séparer les données d'entraînement (80%) et les données de test (20%)\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, y, test_size=0.2, random_state=42\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"#Appliquer une fonction d'optimisation de cette séparation\n",
|
||||
"sc = StandardScaler()\n",
|
||||
"X_train = sc.fit_transform(X_train)\n",
|
||||
"X_test = sc.fit_transform(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "867c6287",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 1 - Entrainement par une Regression Logistique\n",
|
||||
"model = LogisticRegression()\n",
|
||||
"model.fit(X_train, y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ab10386",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Mesure de la précision du modèle\n",
|
||||
"\n",
|
||||
"# Prédictions sur les données de test\n",
|
||||
"y_predict = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a4dc2ea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Edition d'un rapport sur la qualité de la prédiction\n",
|
||||
"acc = format(accuracy_score(y_test, y_predict * 100))\n",
|
||||
"print(acc)\n",
|
||||
"rapport_classification = classification_report(y_test, y_predict)\n",
|
||||
"#print(rapport_classification)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "57980c5e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Affichage de la matrice de confusion\n",
|
||||
"matrice_confusion = confusion_matrix(y_test, y_predict)\n",
|
||||
"print(\n",
|
||||
" \"Confusion de la classe 0 avec la classe 1 :\",\n",
|
||||
" matrice_confusion[0, 1]\n",
|
||||
")\n",
|
||||
"sns.heatmap(\n",
|
||||
" matrice_confusion,\n",
|
||||
" annot=True,\n",
|
||||
" xticklabels=['No', 'Yes'],\n",
|
||||
" yticklabels=['No', 'Yes'],\n",
|
||||
")\n",
|
||||
"plt.figure(figsize=(3, 3))\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
Loading…
Reference in a new issue