from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Mounted at /content/drive

# path to read from
# Please adjust the path if your file is in a different subfolder within your Google Drive.
path = '/content/drive/MyDrive/Colab Notebooks/AI/classified_deforestation.csv'
# read csv
df = pd.read_csv(path)

# sanity check
print("Dataset Shape:", df.shape)

Dataset Shape: (103, 21)

# get a list of all the nulls and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 21 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Density
(P/Km2)                            103 non-null    float64
 1   Agricultural Land( %)                      103 non-null    float64
 2   Armed Forces size                          103 non-null    float64
 3   Co2-Emissions                              103 non-null    float64
 4   CPI                                        103 non-null    float64
 5   Gasoline Price                             103 non-null    float64
 6   GDP                                        103 non-null    float64
 7   Gross primary education enrollment (%)     103 non-null    float64
 8   Gross tertiary education enrollment (%)    103 non-null    float64
 9   Infant mortality                           103 non-null    float64
 10  Minimum wage                               103 non-null    float64
 11  Out of pocket health expenditure           103 non-null    float64
 12  Physicians per thousand                    103 non-null    float64
 13  Population: Labor force participation (%)  103 non-null    float64
 14  Tax revenue (%)                            103 non-null    float64
 15  Total tax rate                             103 non-null    float64
 16  Unemployment rate                          103 non-null    float64
 17  Urban_population                           103 non-null    float64
 18  Latitude                                   103 non-null    float64
 19  Longitude                                  103 non-null    float64
 20  Deforestation_Critical                     103 non-null    int64  
dtypes: float64(20), int64(1)
memory usage: 17.0 KB

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="white")
plt.figure(figsize=(8, 4))
ax = sns.countplot(data=df, x='Deforestation_Critical', palette=['#1A4734', '#870903'])

# Annotate the bars with exact counts + percentages
total = len(df)
for p in ax.patches:
    count = int(p.get_height())
    percentage = f'{100 * count / total:.1f}%'
    # Place text above the bar
    ax.annotate(f'{count} ({percentage})',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom', fontsize=11, color='#333333', xytext=(0, 5), textcoords='offset points')

# Polish the labels and borders
plt.title('Target Variable Distribution: Deforestation Risk', fontsize=13, pad=15)
plt.ylabel('Number of Countries', fontsize=11)
plt.xlabel('') # Removing the x-label because the ticks are self-explanatory
plt.xticks(ticks=[0, 1], labels=['Low/Moderate Risk (0)', 'Critical Risk (1)'], fontsize=11)
sns.despine()

plt.show()

/tmp/ipykernel_13731/2165957095.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.countplot(data=df, x='Deforestation_Critical', palette=['#1A4734', '#870903'])

from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
# All columns except 'Deforestation_Critical' are features
X = df.drop('Deforestation_Critical', axis=1)
# The 'Deforestation_Critical' column is the target
y = df['Deforestation_Critical']

# Perform 80/20 train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (82, 20)
X_test shape: (21, 20)
y_train shape: (82,)
y_test shape: (21,)

import numpy as np

# create the corr matrix
correlation_matrix = X_train.corr()

# Get the absolute values of the correlation matrix
abs_correlation_matrix = correlation_matrix.abs()

# Stack the matrix to get pairs and their correlation values
stacked_corr = abs_correlation_matrix.stack()

# Filter for correlations greater than 0.80 and not self-correlations
high_corr = stacked_corr[(stacked_corr > 0.80) & (stacked_corr < 1.0)]

# Sort the correlations for better readability
high_corr = high_corr.sort_values(ascending=False)

print("Feature pairs with absolute Pearson correlation > 0.80 (excluding self-correlation):\n")

# Print unique pairs
reported_pairs = set()
for (feature1, feature2), corr_value in high_corr.items():
    # Ensure each pair is reported only once, regardless of order
    if (feature2, feature1) not in reported_pairs:
        print(f"('{feature1}', '{feature2}'): {corr_value:.3f}")
        reported_pairs.add((feature1, feature2))

Feature pairs with absolute Pearson correlation > 0.80 (excluding self-correlation):

('Co2-Emissions', 'GDP'): 0.918
('Infant mortality', 'Gross tertiary education enrollment (%)'): 0.857
('Co2-Emissions', 'Urban_population'): 0.823

columns_to_drop = [
    'Total tax rate',
    'Gross tertiary education enrollment (%)',
    'Co2-Emissions'
]

# Drop them from both sets
X_train_final = X_train.drop(columns=columns_to_drop)
X_test_final = X_test.drop(columns=columns_to_drop)

# Print the final shape to confirm
print(f"Original X_train shape: {X_train.shape}")
print(f"Final X_train shape: {X_train_final.shape}")

Original X_train shape: (82, 20)
Final X_train shape: (82, 17)

# Apply the absolute value transformation to collapse the latitude axis
X_train_final['Latitude'] = X_train_final['Latitude'].abs()
X_test_final['Latitude'] = X_test_final['Latitude'].abs()

# Rename the column so we know exactly what it represents now
X_train_final = X_train_final.rename(columns={'Latitude': 'Abs_Latitude'})
X_test_final = X_test_final.rename(columns={'Latitude': 'Abs_Latitude'})

print("Latitude successfully transformed into absolute distance from the equator.")

Latitude successfully transformed into absolute distance from the equator.

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Define the Pipeline
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    )
)

# Define the Hyperparameter Grid
param_grid_lr = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100]
}

# Setup Stratified 5-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearchCV on the training data
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=cv, scoring='recall', n_jobs=-1)
grid_lr.fit(X_train_final, y_train)

print(f"Best Regularization (C): {grid_lr.best_params_['logisticregression__C']}")

Best Regularization (C): 0.1

from sklearn.metrics import precision_score, recall_score, f1_score

# Extract the best model found by the GridSearch
best_lr = grid_lr.best_estimator_

# Generate out-of-fold probabilities using the best model
y_train_cv_prob = cross_val_predict(best_lr, X_train_final, y_train, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]

# Test multiple thresholds to find the ecological sweet spot
print("--- Threshold Trade-off Analysis (Class 1: High Risk) ---")
thresholds_to_test = [0.3, 0.4, 0.5]

for t in thresholds_to_test:
    # Apply the custom threshold
    y_pred_custom = (y_train_cv_prob >= t).astype(int)

    # Calculate metrics specifically for the minority class (Class 1)
    sens = recall_score(y_train, y_pred_custom)
    prec = precision_score(y_train, y_pred_custom, zero_division=0)
    f1 = f1_score(y_train, y_pred_custom)

    # Print a clean summary for each threshold
    print(f"Threshold: {t:.1f} | Sensitivity (Recall): {sens:.3f} | Precision: {prec:.3f} | F1-Score: {f1:.3f}")

--- Threshold Trade-off Analysis (Class 1: High Risk) ---
Threshold: 0.3 | Sensitivity (Recall): 1.000 | Precision: 0.305 | F1-Score: 0.467
Threshold: 0.4 | Sensitivity (Recall): 0.880 | Precision: 0.297 | F1-Score: 0.444
Threshold: 0.5 | Sensitivity (Recall): 0.800 | Precision: 0.465 | F1-Score: 0.588

from sklearn.preprocessing import StandardScaler

# Initialize the scaler for baseline LDA visualization
scaler = StandardScaler()

# FIT only on the training data, and transform the training data
X_train_scaled = scaler.fit_transform(X_train_final)

# Transform the testing data using training parameters
X_test_scaled = scaler.transform(X_test_final)

print("Data scaled for baseline LDA exploration.")

Data scaled for baseline LDA exploration.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.colors as mcolors

# create a base LDA
lda_base = LinearDiscriminantAnalysis()
lda_base.fit(X_train_scaled, y_train)

# Define custom colors (Hex codes for a Light Blue and Light Brown/Tan)
# Class 0 will be Light Blue, Class 1 will be Light Brown
custom_cmap = mcolors.ListedColormap(['#87CEEB', '#714423'])

# visualize the discriminants
X_lda = lda_base.transform(X_train_scaled)
plt.scatter(X_lda[:, 0], np.zeros_like(X_lda[:, 0]), c=y_train, alpha=0.5, cmap=custom_cmap)
plt.title('LDA Projection')
plt.xlabel('LD1')
plt.show()

# Extract scalings and put them in a DataFrame
scalings = pd.DataFrame(lda_base.scalings_, index=X_train_final.columns, columns=['LD1'])

# Sort by absolute value
scalings['Abs_Weight'] = scalings['LD1'].abs()
scalings_sorted = scalings.sort_values(by='Abs_Weight', ascending=False)
scalings_sorted_down = scalings.sort_values(by='Abs_Weight', ascending=True)

print("Top Discriminatory Variables:")
print(scalings_sorted[['LD1']].head(5))

print("\nLeast Discriminatory Variables:")
print(scalings_sorted_down[['LD1']].head(5))

Top Discriminatory Variables:
                       LD1
Infant mortality  1.755451
Urban_population  0.598710
Longitude        -0.598242
Minimum wage      0.572328
Density\n(P/Km2)  0.535959

Least Discriminatory Variables:
                                             LD1
Gross primary education enrollment (%) -0.020659
Unemployment rate                      -0.072354
Agricultural Land( %)                   0.128566
GDP                                    -0.139179
Out of pocket health expenditure        0.186595

# cols to drop only for the LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Define the specific columns that LDA found useless
cols_to_drop_lda = [
    'Gross primary education enrollment (%)',
    'Unemployment rate',
    'Agricultural Land( %)',
    'GDP'
]

# Create the specific unscaled datasets for LDA
X_train_lda = X_train_final.drop(columns=cols_to_drop_lda)
X_test_lda = X_test_final.drop(columns=cols_to_drop_lda)

print("New feature set for LDA created")

New feature set for LDA created

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, confusion_matrix

# Define the Pipeline
pipe_lda_opt = make_pipeline(
    StandardScaler(),
    LinearDiscriminantAnalysis(solver='lsqr')
)

# Define the Hyperparameter Grid
param_grid_lda = {
    'lineardiscriminantanalysis__shrinkage': [None, 'auto', 0.1, 0.5, 0.9]
}

# Setup Stratified 5-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearchCV optimizing
grid_lda = GridSearchCV(pipe_lda_opt, param_grid_lda, cv=cv, scoring='recall', n_jobs=-1)

# Fit strictly on the unscaled, pruned data
grid_lda.fit(X_train_lda, y_train)

# Print the winning parameter!
print(f"Best Shrinkage Parameter: {grid_lda.best_params_['lineardiscriminantanalysis__shrinkage']}")

# Extract the best model and evaluate with CROSS-VALIDATION on training data
best_lda = grid_lda.best_estimator_

# Get CV scores on train data only
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1': 'f1'}
cv_scores_lda = cross_validate(best_lda, X_train_lda, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# Display CV metrics
print("\n--- LDA Results (Cross-Validation on Training Data) ---")
print(f"CV Accuracy:  {cv_scores_lda['test_accuracy'].mean():.3f} (±{cv_scores_lda['test_accuracy'].std():.3f})")
print(f"CV Recall:    {cv_scores_lda['test_recall'].mean():.3f} (±{cv_scores_lda['test_recall'].std():.3f})")
print(f"CV Precision: {cv_scores_lda['test_precision'].mean():.3f} (±{cv_scores_lda['test_precision'].std():.3f})")
print(f"CV F1-Score:  {cv_scores_lda['test_f1'].mean():.3f} (±{cv_scores_lda['test_f1'].std():.3f})")

Best Shrinkage Parameter: 0.9

--- LDA Results (Cross-Validation on Training Data) ---
CV Accuracy:  0.791 (±0.096)
CV Recall:    0.800 (±0.126)
CV Precision: 0.654 (±0.134)
CV F1-Score:  0.708 (±0.101)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

# Define a targeted, regularization-heavy Grid
param_grid_rf = {
    'n_estimators': [50, 100, 150, 200],  # Number of trees
    'max_depth': [3, 4, 5],               # Strict depth limits to prevent overfitting
    'min_samples_leaf': [2, 4, 6],        # Forces broader, more general rules
    'max_features': ['sqrt', 'log2']      # Standard feature subsetting
}

# Setup 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearchCV
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid_rf,
    cv=cv,
    scoring='recall',
    n_jobs=-1
)

# Fit on the unscaled, clean data
rf_grid.fit(X_train_final, y_train)

# Extract Best Model and evaluate with CV
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")

best_rf = rf_grid.best_estimator_

# Get CV scores on train data only
from sklearn.model_selection import cross_validate
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1': 'f1'}
cv_scores_rf = cross_validate(best_rf, X_train_final, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# Display CV metrics
print("\n--- Random Forest Results (Cross-Validation on Training Data) ---")
print(f"CV Accuracy:  {cv_scores_rf['test_accuracy'].mean():.3f} (±{cv_scores_rf['test_accuracy'].std():.3f})")
print(f"CV Recall:    {cv_scores_rf['test_recall'].mean():.3f} (±{cv_scores_rf['test_recall'].std():.3f})")
print(f"CV Precision: {cv_scores_rf['test_precision'].mean():.3f} (±{cv_scores_rf['test_precision'].std():.3f})")
print(f"CV F1-Score:  {cv_scores_rf['test_f1'].mean():.3f} (±{cv_scores_rf['test_f1'].std():.3f})")

Best Random Forest Parameters: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'n_estimators': 200}

--- Random Forest Results (Cross-Validation on Training Data) ---
CV Accuracy:  0.755 (±0.088)
CV Recall:    0.560 (±0.196)
CV Precision: 0.673 (±0.213)
CV F1-Score:  0.577 (±0.133)

from sklearn.svm import SVC

# Define the Pipeline (Scaler + Model)
# NOTE: class_weight removed - will be optimized as hyperparameter
pipe_svm = make_pipeline(
    StandardScaler(),
    SVC(random_state=42)
)

# Expanded hyperparameters + class_weight testing
param_grid_svm = {
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [1, 10, 100, 1000],  # Higher values for better minority class capture
    'svc__gamma': ['scale', 'auto', 0.01, 0.001],  # More granular smoothness options
    'svc__class_weight': [None, 'balanced', {0: 1, 1: 2}]  # Test different weighting strategies
}

# Setup 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run GridSearchCV optimizing for 'recall'
grid_svm = GridSearchCV(
    pipe_svm,
    param_grid_svm,
    cv=cv,
    scoring='recall',
    n_jobs=-1,
    verbose=1  # Show progress since this can take time
)

# Fit strictly on the training data
grid_svm.fit(X_train_final, y_train)

# Extract Best Model and evaluate with CV
print(f"Best SVM Parameters: {grid_svm.best_params_}")
best_svm = grid_svm.best_estimator_

# Get CV scores on TRAINING data only
from sklearn.model_selection import cross_validate
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1': 'f1'}
cv_scores_svm = cross_validate(best_svm, X_train_final, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# Display CV metrics (TRAINING DATA ONLY)
print("\n--- SVM Results (Cross-Validation on Training Data) ---")
print(f"CV Accuracy:  {cv_scores_svm['test_accuracy'].mean():.3f} (±{cv_scores_svm['test_accuracy'].std():.3f})")
print(f"CV Recall:    {cv_scores_svm['test_recall'].mean():.3f} (±{cv_scores_svm['test_recall'].std():.3f})")
print(f"CV Precision: {cv_scores_svm['test_precision'].mean():.3f} (±{cv_scores_svm['test_precision'].std():.3f})")
print(f"CV F1-Score:  {cv_scores_svm['test_f1'].mean():.3f} (±{cv_scores_svm['test_f1'].std():.3f})")

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best SVM Parameters: {'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale', 'svc__kernel': 'linear'}

--- SVM Results (Cross-Validation on Training Data) ---
CV Accuracy:  0.804 (±0.092)
CV Recall:    0.800 (±0.126)
CV Precision: 0.685 (±0.188)
CV F1-Score:  0.723 (±0.117)

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# Setup
tf.random.set_seed(42)
np.random.seed(42)

def build_baseline_model():
    model = Sequential([
        Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Cross-Validation Loop
print("Training Baseline Neural Network (5-Fold CV)... Please wait.")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {'recall': [], 'precision': [], 'f1': [], 'accuracy': []}

for train_idx, val_idx in cv.split(X_train_scaled, y_train):
    X_f_train, X_f_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_f_train, y_f_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = build_baseline_model()
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_f_train, y_f_train, epochs=100, batch_size=16,
              validation_data=(X_f_val, y_f_val), callbacks=[early_stop], verbose=0)

    y_pred = (model.predict(X_f_val, verbose=0) > 0.5).astype(int)

    scores['recall'].append(recall_score(y_f_val, y_pred))
    scores['precision'].append(precision_score(y_f_val, y_pred, zero_division=0))
    scores['f1'].append(f1_score(y_f_val, y_pred))
    scores['accuracy'].append(accuracy_score(y_f_val, y_pred))

# Print Final Results
print("MODEL 1: Cross-Validation Results (5-Fold)")
print(f"CV Recall:    {np.mean(scores['recall']):.3f} (±{np.std(scores['recall']):.3f})")
print(f"CV Precision: {np.mean(scores['precision']):.3f} (±{np.std(scores['precision']):.3f})")
print(f"CV F1-Score:  {np.mean(scores['f1']):.3f} (±{np.std(scores['f1']):.3f})")
print(f"CV Accuracy:  {np.mean(scores['accuracy']):.3f} (±{np.std(scores['accuracy']):.3f})")

Training Baseline Neural Network (5-Fold CV)... Please wait.

/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

MODEL 1: Cross-Validation Results (5-Fold)
CV Recall:    0.440 (±0.233)
CV Precision: 0.565 (±0.294)
CV F1-Score:  0.451 (±0.211)
CV Accuracy:  0.668 (±0.182)

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# 1. Setup & Configuration
tf.random.set_seed(42)
np.random.seed(42)
keras_weights = {0: 1.0, 1: 3.0}

# Reusable Functions (This is what saves so much space!)
def build_model():
    model = Sequential([
        Dense(16, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Recall(name='recall')]
    )
    return model

def get_callbacks():
    return [
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, mode='min', verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001, verbose=0)
    ]

# Cross-Validation Loop
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {'recall': [], 'precision': [], 'f1': [], 'accuracy': []}

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train)):
    X_f_train, X_f_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_f_train, y_f_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Build and train a fresh model for this fold
    model = build_model()
    model.fit(X_f_train, y_f_train, epochs=100, batch_size=8, validation_data=(X_f_val, y_f_val),
              class_weight=keras_weights, callbacks=get_callbacks(), verbose=0)

    # Evaluate
    y_pred = (model.predict(X_f_val, verbose=0) > 0.5).astype(int)
    scores['recall'].append(recall_score(y_f_val, y_pred))
    scores['precision'].append(precision_score(y_f_val, y_pred, zero_division=0))
    scores['f1'].append(f1_score(y_f_val, y_pred))
    scores['accuracy'].append(accuracy_score(y_f_val, y_pred))

# Print clean results
print(f"CV Recall:    {np.mean(scores['recall']):.3f} (±{np.std(scores['recall']):.3f})")
print(f"CV Precision: {np.mean(scores['precision']):.3f} (±{np.std(scores['precision']):.3f})")
print(f"CV F1-Score:  {np.mean(scores['f1']):.3f} (±{np.std(scores['f1']):.3f})")
print(f"CV Accuracy:  {np.mean(scores['accuracy']):.3f} (±{np.std(scores['accuracy']):.3f})")

# Train Final Production Model
print("\n--- Training Final Model on Full Training Set ---")
final_model = build_model()
final_model.fit(X_train_scaled, y_train, epochs=100, batch_size=8, validation_split=0.2,
                class_weight=keras_weights, callbacks=get_callbacks(), verbose=1)

final_model.summary()

/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

CV Recall:    0.720 (±0.204)
CV Precision: 0.472 (±0.079)
CV F1-Score:  0.556 (±0.099)
CV Accuracy:  0.657 (±0.087)

--- Training Final Model on Full Training Set ---

/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)

Epoch 1/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 4s 66ms/step - accuracy: 0.4923 - loss: 1.1251 - recall: 0.7143 - val_accuracy: 0.5294 - val_loss: 0.7172 - val_recall: 0.7500 - learning_rate: 0.0010
Epoch 2/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step - accuracy: 0.5077 - loss: 1.1039 - recall: 0.8095 - val_accuracy: 0.5294 - val_loss: 0.7184 - val_recall: 0.7500 - learning_rate: 0.0010
Epoch 3/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 23ms/step - accuracy: 0.5077 - loss: 1.0872 - recall: 0.8095 - val_accuracy: 0.5294 - val_loss: 0.7190 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 4/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step - accuracy: 0.5538 - loss: 1.0727 - recall: 0.9048 - val_accuracy: 0.4118 - val_loss: 0.7189 - val_recall: 0.7500 - learning_rate: 0.0010
Epoch 5/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step - accuracy: 0.5538 - loss: 1.0602 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7189 - val_recall: 0.7500 - learning_rate: 0.0010
Epoch 6/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step - accuracy: 0.5538 - loss: 1.0473 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7188 - val_recall: 0.7500 - learning_rate: 0.0010
Epoch 7/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.5538 - loss: 1.0370 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7185 - val_recall: 0.7500 - learning_rate: 5.0000e-04
Epoch 8/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step - accuracy: 0.5538 - loss: 1.0303 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7182 - val_recall: 0.7500 - learning_rate: 5.0000e-04
Epoch 9/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step - accuracy: 0.5538 - loss: 1.0234 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7180 - val_recall: 0.7500 - learning_rate: 5.0000e-04
Epoch 10/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.5692 - loss: 1.0166 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7180 - val_recall: 0.7500 - learning_rate: 5.0000e-04
Epoch 11/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step - accuracy: 0.5692 - loss: 1.0095 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7179 - val_recall: 0.7500 - learning_rate: 5.0000e-04
Epoch 12/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.5692 - loss: 1.0038 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7178 - val_recall: 0.7500 - learning_rate: 2.5000e-04
Epoch 13/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.5692 - loss: 1.0002 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7178 - val_recall: 0.7500 - learning_rate: 2.5000e-04
Epoch 14/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.5692 - loss: 0.9965 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7178 - val_recall: 0.7500 - learning_rate: 2.5000e-04
Epoch 15/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.5692 - loss: 0.9927 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7178 - val_recall: 0.7500 - learning_rate: 2.5000e-04
Epoch 16/100
9/9 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.5692 - loss: 0.9890 - recall: 0.9524 - val_accuracy: 0.4118 - val_loss: 0.7177 - val_recall: 0.7500 - learning_rate: 2.5000e-04

Model: "sequential_17"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense_51 (Dense)                │ (None, 16)             │           288 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_52 (Dense)                │ (None, 8)              │           136 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_53 (Dense)                │ (None, 1)              │             9 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 1,301 (5.09 KB)

 Trainable params: 433 (1.69 KB)

 Non-trainable params: 0 (0.00 B)

 Optimizer params: 868 (3.39 KB)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

y_pred_final = best_svm.predict(X_test_final)

print("--- FINAL TEST SET EVALUATION (SVM Linear) ---")
print(f"Accuracy:  {accuracy_score(y_test, y_pred_final):.3f}")
print(f"Recall:    {recall_score(y_test, y_pred_final):.3f} (Sensitivity)")
print(f"Precision: {precision_score(y_test, y_pred_final):.3f}")
print(f"F1-Score:  {f1_score(y_test, y_pred_final):.3f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))

--- FINAL TEST SET EVALUATION (SVM Linear) ---
Accuracy:  0.762
Recall:    0.333 (Sensitivity)
Precision: 0.667
F1-Score:  0.444

Confusion Matrix:
[[14  1]
 [ 4  2]]

Data Contextualization¶

Introduction¶

The problem¶

The Target Variable¶

The goal of this study¶

Data Exploration and Understanding (a refresher)¶

Data Dictionary¶

Data Split¶

Feature Selection¶

Final Data Preparation¶

Logistic Regression¶

Linear Discriminant Analysis¶

Random Forest¶

Support Vector Machines¶

Final Decision¶

Conclusion¶

Learnings¶

Limitations¶

Future Work¶

Connection to Real Life Problem¶

References¶