README¶
In this project, a neural network is applied to predict whether the student will pass or fail the exam based on the previeus exam score and the study hours.
Source: https://www.kaggle.com/datasets/mrsimple07/student-exam-performance-prediction/data
Exploratory Data Analysis¶
Libraries¶
In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split
Load Data¶
In [3]:
df = pd.read_csv("student_exam_data.csv")
df.head()
Out[3]:
| Study Hours | Previous Exam Score | Pass/Fail | |
|---|---|---|---|
| 0 | 4.370861 | 81.889703 | 0 |
| 1 | 9.556429 | 72.165782 | 1 |
| 2 | 7.587945 | 58.571657 | 0 |
| 3 | 6.387926 | 88.827701 | 1 |
| 4 | 2.404168 | 81.083870 | 0 |
Data Analysis¶
Correlation
In [4]:
df.corr()
Out[4]:
| Study Hours | Previous Exam Score | Pass/Fail | |
|---|---|---|---|
| Study Hours | 1.000000 | 0.010354 | 0.583505 |
| Previous Exam Score | 0.010354 | 1.000000 | 0.443706 |
| Pass/Fail | 0.583505 | 0.443706 | 1.000000 |
Visualize data
In [5]:
plt.figure(figsize=(8,6))
# Pass
pass_mask = df["Pass/Fail"] == 1
plt.scatter(df["Study Hours"][pass_mask],
df["Previous Exam Score"][pass_mask],
color='green', label='Pass', alpha=0.7)
# Fail
fail_mask = df["Pass/Fail"] == 0
plt.scatter(df["Study Hours"][fail_mask],
df["Previous Exam Score"][fail_mask],
color='red', label='Fail', alpha=0.7)
plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Students: Pass (green) vs Fail (red)")
plt.legend()
plt.show()
Training Neural Network¶
Preprocessing¶
Copy dataframe
In [6]:
df_model = df.copy()
print(df_model.shape)
(500, 3)
Split input and output data
In [7]:
X = df_model.drop("Pass/Fail", axis=1)
y = df_model["Pass/Fail"]
print(X.shape)
print(y.shape)
(500, 2) (500,)
Split training and test data
In [8]:
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2, # 20% test, 80% train
random_state=42, # same split everytime
)
print(f"Input training data:",X_train.shape, "Input test data:",X_test.shape)
print(y_train.shape, y_test.shape)
Input training data: (400, 2) Input test data: (100, 2) (400,) (100,)
Neural Network Model¶
Train data
In [9]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32) #.values in pytroch array,
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1,1)
print(X_train_tensor.shape)
print(y_train_tensor.shape)
torch.Size([400, 2]) torch.Size([400, 1])
Model
In [10]:
model = nn.Sequential(
nn.Linear(2,10),
nn.ReLU(),
nn.Linear(10,1)
)
In [11]:
loss_fn = torch.nn.BCEWithLogitsLoss() #catergorical loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) #Optimizer = Stochastic Gradient Descent
Training loop
In [12]:
# Training loop
losses_list = []
for i in range(0,20000):
optimizer.zero_grad()
# model
outputs = model(X_train_tensor)
# loss
loss = loss_fn(outputs, y_train_tensor)
loss.backward()
optimizer.step()
# loss fucntion
losses_list.append(loss.item())
if i % 5000 == 0:
print(loss)
tensor(1.3062, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>) tensor(0.0010, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>) tensor(6.0514e-05, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>) tensor(3.9126e-06, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Visualized the loss function
In [13]:
plt.figure(figsize=(8,5))
plt.plot(losses_list)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Loss vs Iterations")
plt.grid(True)
plt.show()
Validation¶
In [14]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32) #.values in pytroch array,
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1,1)
print(X_test_tensor.shape)
print(y_test_tensor.shape)
torch.Size([100, 2]) torch.Size([100, 1])
In [15]:
#prediction model
model.eval()
with torch.no_grad(): #disable the gradients
#input layer
outputs = model(X_test_tensor)
prediction = nn.functional.sigmoid(outputs) > 0.5
prediction_compared = prediction.type(torch.float32) == y_test_tensor
# compare prediction with true data
print(f"Percentage correct: ",prediction_compared.type(torch.float32).mean())
Percentage correct: tensor(0.9800)
Confusion matrix
In [16]:
# Convert tensors to numpy
y_true = y_test_tensor.numpy()
y_pred = prediction.numpy()
# Set confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Plot
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()
New dataframes
In [17]:
X_test_tensor
df_X_test = pd.DataFrame(X_test_tensor, columns=["Study Hours", "Previous Exam Score"])
y_true
df_y_true = pd.DataFrame(y_true, columns=["Pass/Fail"])
y_pred
df_y_pred = pd.DataFrame(y_pred, columns=["Pass/Fail"])
df_y_pred = df_y_pred["Pass/Fail"].astype(float)
df_true = pd.concat([df_X_test, df_y_true], axis=1)
df_prediction = pd.concat([df_X_test, df_y_pred], axis=1)
Compare true vs prediction
In [18]:
plt.figure(figsize=(8,4))
plt.suptitle("Validation", fontsize=16)
plt.subplot(1,2,1)
# Pass
pass_mask_true = df_true["Pass/Fail"] == 1
plt.scatter(df_true["Study Hours"][pass_mask_true],
df_true["Previous Exam Score"][pass_mask_true],
color='green', label='Pass', alpha=0.7)
# Fail
fail_mask_true = df_true["Pass/Fail"] == 0
plt.scatter(df_true["Study Hours"][fail_mask_true],
df_true["Previous Exam Score"][fail_mask_true],
color='red', label='Fail', alpha=0.7)
plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Original Test Data: X_test vs y_true")
plt.legend()
plt.subplot(1,2,2)
# Pass
pass_mask_pred = df_prediction["Pass/Fail"] == 1
plt.scatter(df_prediction["Study Hours"][pass_mask_pred],
df_prediction["Previous Exam Score"][pass_mask_pred],
color='green', label='Pass', alpha=0.7)
# Fail
fail_mask_pred = df_prediction["Pass/Fail"] == 0
plt.scatter(df_prediction["Study Hours"][fail_mask_pred],
df_prediction["Previous Exam Score"][fail_mask_pred],
color='red', label='Fail', alpha=0.7)
plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Prediction: X_test vs y_prediction")
plt.legend()
plt.tight_layout()
plt.show()