# Give Access to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import pandas as pd

# create the data path
grade_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/AI/A1.3 Calificaciones.csv")

# print to confirm the import and get a main idea on the data structure
grade_df.head()

rows = grade_df.shape[0]
cols = grade_df.shape[1]

print(f"Rows: {rows} \nColumns: {cols}")

Rows: 395 
Columns: 10

# get the info of the colums
grade_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Escuela         395 non-null    object
 1   Sexo            395 non-null    object
 2   Edad            395 non-null    int64 
 3   HorasDeEstudio  395 non-null    int64 
 4   Reprobadas      395 non-null    int64 
 5   Internet        395 non-null    object
 6   Faltas          395 non-null    int64 
 7   G1              395 non-null    int64 
 8   G2              395 non-null    int64 
 9   G3              395 non-null    int64 
dtypes: int64(7), object(3)
memory usage: 31.0+ KB

# Save categorcal object cols
cat_cols = grade_df.select_dtypes(include=['object']).columns

# get the unique vals
print(grade_df[cat_cols].nunique())

Escuela     2
Sexo        2
Internet    2
dtype: int64

# loop through categorical cols
for c in cat_cols:
  # print the amount of observations per category
  print(f"{grade_df[c].value_counts()}\n")

Escuela
GP    349
MS     46
Name: count, dtype: int64

Sexo
F    208
M    187
Name: count, dtype: int64

Internet
yes    329
no      66
Name: count, dtype: int64

# Check the mean of the target variable for each school
print(grade_df.groupby('Escuela')['G3'].mean())

Escuela
GP    10.489971
MS     9.847826
Name: G3, dtype: float64

# show mathematical summary of the data
grade_df.describe()

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))

sns.boxplot(data=grade_df.select_dtypes(include='number'), palette="Set2", orient='h')

plt.title("Comparison of All Numerical Feature Distributions")
plt.xticks(rotation=45)
plt.show()

import numpy as np

# lets see the before histogram
sns.histplot(grade_df['Faltas'], kde=True)
plt.title("Faltas before Log Transformation")

Text(0.5, 1.0, 'Faltas before Log Transformation')

# do the log tarsnformation
grade_df['Faltas'] = np.log1p(grade_df['Faltas'])

# lets see if it improved
sns.histplot(grade_df['Faltas'], kde=True)
plt.title("Faltas after Log Transformation")

Text(0.5, 1.0, 'Faltas after Log Transformation')

# generate the dummy varaibles, concat the new columns and drop the initial and the double
grades_df_dummies = pd.get_dummies(grade_df, columns=cat_cols, drop_first=True, dtype=int)

# print head to check if its right
grades_df_dummies.head()

# generate the matrix, we dont need just positives or negatives
corrs_matrix = grades_df_dummies.corr()

# fill the diagonal with ceros
np.fill_diagonal(corrs_matrix.values, 0)

# import seaborn
import seaborn as sns

#visualize with a heatmap
sns.heatmap(corrs_matrix, annot=True, cmap='coolwarm', fmt=".2f")

<Axes: >

# combined of correlated G1 and G2 variable to not increase the size
grades_df_dummies['G1_G2'] = (grades_df_dummies['G1'] * grades_df_dummies['G2']) / 2

# drop the combined columns
grades_df_dummies = grades_df_dummies.drop(columns=['G1', 'G2'])

# import sklearn
from sklearn.model_selection import train_test_split

# create the subsets
X = grades_df_dummies.drop('G3', axis=1)
y = grades_df_dummies['G3']

# split the data 80/20
X_train_main, X_test, y_train_main, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# split the train data into train and validation
X_train, X_train_val, y_train, y_train_val = train_test_split(X_train_main, y_train_main, test_size=0.2, random_state=42)

# print the size of the objects
print(f"Train Sample\n {X_train.shape}, {y_train.shape}")

print(f"\nTrain Validation Sample\n {X_train_val.shape}, {y_train_val.shape}")

print(f"\nTest Sample")
X_test.shape, y_test.shape

Train Sample
 (252, 8), (252,)

Train Validation Sample
 (64, 8), (64,)

Test Sample

((79, 8), (79,))

# import pearson r
import scipy.stats as stats

# get the amount of variables (n)
n = X_train.shape[1]

# initialize with zeros
corr_vector = np.zeros(n)

# get corr between y and each X trai column
for i in range(n):
  # pearsonr returns (statistic, pvalue)
  corr_vector[i] = abs(stats.pearsonr(X_train.iloc[:, i], y_train)[0])

# create a Pandas Series: Data=Your Vector, Index=Column Names
corr_results = pd.Series(corr_vector, index=X_train.columns)

# order the results from biggest to smallest
corr_results = corr_results.sort_values(ascending=False)

# reults of the array
print(corr_results)

G1_G2             0.867706
Reprobadas        0.395822
Faltas            0.203502
Edad              0.158143
Sexo_M            0.135934
HorasDeEstudio    0.115188
Internet_yes      0.073236
Escuela_MS        0.036210
dtype: float64

# import library
import statsmodels.api as sm

sorted_features = corr_results.index.tolist()

# initialize with zeros the array
n = len(sorted_features)
r_adjusted = np.zeros(n)

# loop to get the r adjusteds for the models
for i in range(n):
  current_features = sorted_features[:i+1]

  # get the temporary features
  X_temp = X_train[current_features]

  # define the model
  temp_model = sm.OLS(y_train, sm.add_constant(X_temp))

  # fit the model
  temp_results = temp_model.fit()

  # save the radjusted variable for this
  r_adjusted[i] = round(temp_results.rsquared_adj, 5)

# print resils
print(f"{r_adjusted}\n")

# generate a visualization for it
plt.plot(r_adjusted,'o-', c='purple')

[0.75193 0.76263 0.79354 0.80211 0.80368 0.80296 0.80287 0.80261]

[<matplotlib.lines.Line2D at 0x7d3a74db5b20>]

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

model = LinearRegression()

# do the forward selection
sfs = SequentialFeatureSelector(model, n_features_to_select=4, direction='forward')

# fit the model
sfs.fit(X_train, y_train)

selected_columns = X_train.columns[sfs.get_support()].tolist()

# Create the clean datasets with ONLY the selected features
X_train_selected = X_train[selected_columns]
X_validation_selected = X_train_val[selected_columns]

print(selected_columns)

['Edad', 'Reprobadas', 'Faltas', 'G1_G2']

import statsmodels.api as sm

# define the variable now (this helps s get the statistics)

X_train_stats = sm.add_constant(X_train_selected)

# define the model
model = sm.OLS(y_train, X_train_stats)

# fit the model
ols_model = model.fit()

# lets print the summary with the variables both we and the forward selection function gave
print(ols_model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     G3   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.802
Method:                 Least Squares   F-statistic:                     255.3
Date:                Sun, 01 Feb 2026   Prob (F-statistic):           1.78e-86
Time:                        14:50:22   Log-Likelihood:                -542.86
No. Observations:                 252   AIC:                             1096.
Df Residuals:                     247   BIC:                             1113.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.3517      1.873      4.994      0.000       5.663      13.040
Edad          -0.3822      0.112     -3.427      0.001      -0.602      -0.163
Reprobadas    -0.5512      0.191     -2.880      0.004      -0.928      -0.174
Faltas         0.8266      0.123      6.693      0.000       0.583       1.070
G1_G2          0.1005      0.004     27.657      0.000       0.093       0.108
==============================================================================
Omnibus:                       34.259   Durbin-Watson:                   1.947
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               47.390
Skew:                          -0.866   Prob(JB):                     5.12e-11
Kurtosis:                       4.230   Cond. No.                     1.11e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.11e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# check fo the multicollinearity because of the coefficient of Faltas
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["feature"] = X_train_selected.columns

vif_data["VIF"] = [variance_inflation_factor(X_train_selected.values, i)
                          for i in range(len(X_train_selected.columns))]
print(vif_data)

      feature       VIF
0        Edad  6.871791
1  Reprobadas  1.409975
2      Faltas  2.709557
3       G1_G2  4.321990

import statsmodels.api as sm

# define the variable now (this helps s get the statistics)
X_train_stats_2 = sm.add_constant(X_train[['Reprobadas', 'Faltas', 'G1_G2']])

# define the model
model_2 = sm.OLS(y_train, X_train_stats_2)

# fit the model
ols_model_2 = model_2.fit()

# lets print the summary with the variables both we and the forward selection function gave
print(ols_model_2.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     G3   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     322.6
Date:                Sun, 01 Feb 2026   Prob (F-statistic):           2.78e-85
Time:                        14:50:22   Log-Likelihood:                -548.71
No. Observations:                 252   AIC:                             1105.
Df Residuals:                     248   BIC:                             1120.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.0467      0.355      8.571      0.000       2.347       3.747
Reprobadas    -0.6610      0.193     -3.429      0.001      -1.041      -0.281
Faltas         0.7745      0.125      6.187      0.000       0.528       1.021
G1_G2          0.1008      0.004     27.153      0.000       0.093       0.108
==============================================================================
Omnibus:                       42.281   Durbin-Watson:                   1.987
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               71.537
Skew:                          -0.924   Prob(JB):                     2.92e-16
Kurtosis:                       4.844   Cond. No.                         212.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# add constant to se the stats
X_validation_selected = sm.add_constant(X_validation_selected)

# make the precictions with the previously done model
y_val_predictions = ols_model.predict(X_validation_selected)

# import the library for the MAE RMSE metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(y_train_val, y_val_predictions))

# get the mae, nicer as it avgs the mistakes
mae = mean_absolute_error(y_train_val, y_val_predictions)

# get the data range to see hwo big it is
data_range = y_train_val.max() - y_train_val.min()

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Data range: {data_range}")
print(f"Error %: {mae / data_range * 100:.2f}")

RMSE: 1.9419
MAE: 1.5126
Data range: 19
Error %: 7.96

# add constant to se the stats
X_validation_selected_2 = sm.add_constant(X_train_val[['Reprobadas', 'Faltas', 'G1_G2']])

# make the precictions with the previously done model
y_val_predictions_2 = ols_model_2.predict(X_validation_selected_2)

# import the library for the MAE RMSE metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(y_train_val, y_val_predictions_2))

# get the mae, nicer as it avgs the mistakes
mae = mean_absolute_error(y_train_val, y_val_predictions_2)

# get the data range to see hwo big it is
data_range = y_train_val.max() - y_train_val.min()

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Data range: {data_range}")
print(f"Error %: {mae / data_range * 100:.2f}")

RMSE: 1.9635
MAE: 1.5037
Data range: 19
Error %: 7.91

# final testing with test data
features = ['Reprobadas', 'Faltas', 'G1_G2']

# The Final Exam
X_test_final = sm.add_constant(X_test[features])
y_test_pred = ols_model_2.predict(X_test_final)

# data range
data_range = y_test.max() - y_test.min()

# mae caulcation
mae = mean_absolute_error(y_test, y_test_pred)

from sklearn.metrics import r2_score, mean_squared_error

print(f"R2: {r2_score(y_test, y_test_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"MAE: {mae:.4f}")
print(f"Data range: {data_range}")
print(f"Error %: {mae / data_range * 100:.2f}")

R2: 0.7359
RMSE: 2.3269
MAE: 1.7059
Data range: 19
Error %: 8.98

	Edad	HorasDeEstudio	Reprobadas	Faltas	G1	G2	G3
count	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000
mean	16.696203	2.035443	0.334177	5.708861	10.908861	10.713924	10.415190
std	1.276043	0.839240	0.743651	8.003096	3.319195	3.761505	4.581443
min	15.000000	1.000000	0.000000	0.000000	3.000000	0.000000	0.000000
25%	16.000000	1.000000	0.000000	0.000000	8.000000	9.000000	8.000000
50%	17.000000	2.000000	0.000000	4.000000	11.000000	11.000000	11.000000
75%	18.000000	2.000000	0.000000	8.000000	13.000000	13.000000	14.000000
max	22.000000	4.000000	3.000000	75.000000	19.000000	19.000000	20.000000

Introduction¶

Data Exploration and Comprehension¶

Data cleaning and Tranformation¶

Feature Transformation¶

Correlation Analysis¶

Dealing with Multicollinarity¶

Training and Testing of Model¶

Automated Revision¶

Results:¶

Results:¶

Second Model Evaluations¶

Validation & Comaprison¶

Testing¶

Conclusion¶

Limitations¶

Further Study¶

	Escuela	Sexo	Edad	HorasDeEstudio	Reprobadas	Internet	Faltas	G1	G2	G3
0	GP	F	18	2	0	no	6	5	6	6
1	GP	F	17	2	0	yes	4	5	5	6
2	GP	F	15	2	3	yes	10	7	8	10
3	GP	F	15	3	0	yes	2	15	14	15
4	GP	F	16	2	0	no	4	6	10	10

	Edad	HorasDeEstudio	Reprobadas	Faltas	G1	G2	G3	Internet_yes
0	18	2	0	1.945910	5	6	6	0
1	17	2	0	1.609438	5	5	6	1
2	15	2	3	2.397895	7	8	10	1
3	15	3	0	1.098612	15	14	15	1
4	16	2	0	1.609438	6	10	10	0

	Escuela	Sexo	Edad	HorasDeEstudio	Reprobadas	Internet	Faltas	G1	G2	G3
0	GP	F	18	2	0	no	6	5	6	6
1	GP	F	17	2	0	yes	4	5	5	6
2	GP	F	15	2	3	yes	10	7	8	10
3	GP	F	15	3	0	yes	2	15	14	15
4	GP	F	16	2	0	no	4	6	10	10

	Escuela	Sexo	Edad	HorasDeEstudio	Reprobadas	Internet	Faltas	G1	G2	G3
0	GP	F	18	2	0	no	6	5	6	6
1	GP	F	17	2	0	yes	4	5	5	6
2	GP	F	15	2	3	yes	10	7	8	10
3	GP	F	15	3	0	yes	2	15	14	15
4	GP	F	16	2	0	no	4	6	10	10