README¶

In this project, a neural network is applied to predict whether the student will pass or fail the exam based on the previeus exam score and the study hours.

Source: https://www.kaggle.com/datasets/mrsimple07/student-exam-performance-prediction/data

Exploratory Data Analysis¶

Libraries¶

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import train_test_split

Load Data¶

In [3]:
df = pd.read_csv("student_exam_data.csv")
df.head()
Out[3]:
Study Hours Previous Exam Score Pass/Fail
0 4.370861 81.889703 0
1 9.556429 72.165782 1
2 7.587945 58.571657 0
3 6.387926 88.827701 1
4 2.404168 81.083870 0

Data Analysis¶

Correlation

In [4]:
df.corr()
Out[4]:
Study Hours Previous Exam Score Pass/Fail
Study Hours 1.000000 0.010354 0.583505
Previous Exam Score 0.010354 1.000000 0.443706
Pass/Fail 0.583505 0.443706 1.000000

Visualize data

In [5]:
plt.figure(figsize=(8,6))

# Pass 
pass_mask = df["Pass/Fail"] == 1
plt.scatter(df["Study Hours"][pass_mask], 
            df["Previous Exam Score"][pass_mask], 
            color='green', label='Pass', alpha=0.7)

# Fail 
fail_mask = df["Pass/Fail"] == 0
plt.scatter(df["Study Hours"][fail_mask], 
            df["Previous Exam Score"][fail_mask], 
            color='red', label='Fail', alpha=0.7)

plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Students: Pass (green) vs Fail (red)")
plt.legend()
plt.show()
No description has been provided for this image

Training Neural Network¶

Preprocessing¶

Copy dataframe

In [6]:
df_model = df.copy()

print(df_model.shape)
(500, 3)

Split input and output data

In [7]:
X = df_model.drop("Pass/Fail", axis=1)
y = df_model["Pass/Fail"]

print(X.shape)
print(y.shape)
(500, 2)
(500,)

Split training and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,     # 20% test, 80% train
    random_state=42,   # same split everytime
)

print(f"Input training data:",X_train.shape, "Input test data:",X_test.shape)
print(y_train.shape, y_test.shape)
Input training data: (400, 2) Input test data: (100, 2)
(400,) (100,)

Neural Network Model¶

Train data

In [9]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32) #.values in pytroch array,
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1,1)

print(X_train_tensor.shape)
print(y_train_tensor.shape)
torch.Size([400, 2])
torch.Size([400, 1])

Model

In [10]:
model = nn.Sequential(
    nn.Linear(2,10),
    nn.ReLU(),
    nn.Linear(10,1)
)
In [11]:
loss_fn = torch.nn.BCEWithLogitsLoss() #catergorical loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) #Optimizer = Stochastic Gradient Descent

Training loop

In [12]:
# Training loop
losses_list = []
for i in range(0,20000):  
    optimizer.zero_grad() 
    # model
    outputs = model(X_train_tensor)
    # loss
    loss = loss_fn(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # loss fucntion
    losses_list.append(loss.item())

    if i % 5000 == 0:
        print(loss)
tensor(1.3062, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.0010, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(6.0514e-05, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(3.9126e-06, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

Visualized the loss function

In [13]:
plt.figure(figsize=(8,5))
plt.plot(losses_list)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Loss vs Iterations")
plt.grid(True)
plt.show()
No description has been provided for this image

Validation¶

In [14]:
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32) #.values in pytroch array,
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1,1)

print(X_test_tensor.shape)
print(y_test_tensor.shape)
torch.Size([100, 2])
torch.Size([100, 1])
In [15]:
#prediction model
model.eval() 

with torch.no_grad():  #disable the gradients

    #input layer
    outputs = model(X_test_tensor)
    
    prediction = nn.functional.sigmoid(outputs) > 0.5
    prediction_compared = prediction.type(torch.float32) == y_test_tensor
    # compare prediction with true data
    print(f"Percentage correct: ",prediction_compared.type(torch.float32).mean())
Percentage correct:  tensor(0.9800)

Confusion matrix

In [16]:
# Convert tensors to numpy
y_true = y_test_tensor.numpy()
y_pred = prediction.numpy()

# Set confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()
No description has been provided for this image

New dataframes

In [17]:
X_test_tensor
df_X_test = pd.DataFrame(X_test_tensor, columns=["Study Hours", "Previous Exam Score"])

y_true
df_y_true = pd.DataFrame(y_true, columns=["Pass/Fail"])

y_pred
df_y_pred = pd.DataFrame(y_pred, columns=["Pass/Fail"])
df_y_pred = df_y_pred["Pass/Fail"].astype(float)


df_true = pd.concat([df_X_test, df_y_true], axis=1)
df_prediction = pd.concat([df_X_test, df_y_pred], axis=1)

Compare true vs prediction

In [18]:
plt.figure(figsize=(8,4))
plt.suptitle("Validation", fontsize=16) 

plt.subplot(1,2,1)
# Pass 
pass_mask_true = df_true["Pass/Fail"] == 1
plt.scatter(df_true["Study Hours"][pass_mask_true], 
            df_true["Previous Exam Score"][pass_mask_true], 
            color='green', label='Pass', alpha=0.7)
# Fail 
fail_mask_true = df_true["Pass/Fail"] == 0
plt.scatter(df_true["Study Hours"][fail_mask_true], 
            df_true["Previous Exam Score"][fail_mask_true], 
            color='red', label='Fail', alpha=0.7)

plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Original Test Data: X_test vs y_true")
plt.legend()

plt.subplot(1,2,2)
# Pass 
pass_mask_pred = df_prediction["Pass/Fail"] == 1
plt.scatter(df_prediction["Study Hours"][pass_mask_pred], 
            df_prediction["Previous Exam Score"][pass_mask_pred], 
            color='green', label='Pass', alpha=0.7)
# Fail 
fail_mask_pred = df_prediction["Pass/Fail"] == 0
plt.scatter(df_prediction["Study Hours"][fail_mask_pred], 
            df_prediction["Previous Exam Score"][fail_mask_pred], 
            color='red', label='Fail', alpha=0.7)

plt.xlabel("Study Hours")
plt.ylabel("Previous Exam Score")
plt.title("Prediction: X_test vs y_prediction")
plt.legend()

plt.tight_layout()
plt.show()
No description has been provided for this image