from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
classification_report, confusion_matrix,
roc_auc_score, precision_score, recall_score,
f1_score, roc_curve, auc
)
Deforestation: a new categorical problem¶
In this project, we will transition from simple linear regression to logistic regression. Here we will use a pre-processed and analyzed CSV (the one you can see in the deforestation data insights project in my GitHub repo). Using the deforestation + country data CSV, constructed and cleaned previously, we will now take a categorical approach to it.
Refreshment & Context on the Dataset
For this project, the dataset combines environmental, social, and economic indicators for a set of countries, previously cleaned and analyzed in the linear regression phase of this course. During target variable selection, the data was cleaned, especially considering possible nulls, redundant values, and data-entry mistakes. Thus the dataset contains 35 variables, including:
- Deforestation: hectares of forest lost per year
- Urban_population: people living in urban areas
- Physicians per thousand: doctors per 1,000 people (proxy for institutional quality)
- Density(P/Km²): population density
- Total tax rate: tax burden on businesses (% of commercial profits)
- Labor force participation(%): share of working-age population employed
- Gasoline Price: average retail price in USD/liter
- Latitude / Longitude: geographic position of the country centroid
- CPI Change(%): annual inflation rate
- Infant mortality: deaths per 1,000 live births
- Agricultural Land(%): share of land area that is arable, under permanent crops, or under permanent pasture
- Armed Forces size: number of armed forces personnel, reflecting state investment in military infrastructure
- Co2-Emissions: carbon dioxide emissions
- CPI (Consumer Price Index): a measure of the average change over time in the prices paid by urban consumers for a market basket of consumer goods and services
- GDP: Gross Domestic Product, serving as the core indicator of the country's overall economic size and health
- Gross primary education enrollment(%): ratio of total primary enrollment to the population of the age group that officially corresponds to primary education
- Gross tertiary education enrollment(%): ratio of total tertiary enrollment to the population of the age group that officially corresponds to university-level education
- Minimum wage: lowest legally permissible compensation for workers
- Out of pocket health expenditure: share of healthcare costs paid directly by households
- Tax revenue (%): transfers to the central government for public purposes, expressed as a percentage of GDP
- Unemployment rate: share of the labor force that is without work but seeking employment
- Land Area(Km2): total land area of the country
First Steps
In the previous project (Deforestation, from which the CSV was extracted), the primary objective was to model deforestation as a continuous numerical variable using regression techniques. For this project, the focus shifts toward a classification approach. To achieve this, the original continuous target (annual deforestation rate) must be transformed into a discrete, binary variable representing the risk level. Thus we will create a variable Deforestation_Risk (0 = Low/Moderate Risk, 1 = High Risk). However, one of the first things to study and analyze is how to establish an accurate threshold for this new categorical variable. Two distinct approaches will be evaluated:
The Median: Utilizing the median of the dataset ensures a perfectly balanced distribution between the two classes (50% Low Risk, 50% High Risk). While machine learning models (particularly Logistic Regression) perform optimally with balanced data, an arbitrary statistical cutoff may lack real-world significance.
Literature Review: This involves defining the threshold based on established ecological alarms from environmental organizations, such as the UN or FAO. While this method aligns the model with real-world ecological contexts, it is highly likely to introduce class imbalance, requiring specialized techniques during the modelling.
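The trade-off between the two candidate thresholds can be sketched before touching the real data. The rates below are made-up illustrative values, not the dataset's: a median cutoff is balanced by construction, while a fixed literature cutoff lets the class balance fall wherever the data dictates.

```python
import pandas as pd

# Hypothetical deforestation rates (% of forest lost per year), illustration only
rates = pd.Series([0.01, 0.05, 0.13, 0.30, 0.45, 0.80, 1.20, 2.60])

# Approach 1: median cutoff -> a 50/50 class split by construction
median_labels = (rates > rates.median()).astype(int)

# Approach 2: literature cutoff (0.501%) -> balance depends on the data
literature_labels = (rates > 0.501).astype(int)

print(median_labels.value_counts(normalize=True))   # exactly half in each class
print(literature_labels.value_counts(normalize=True))  # imbalanced split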
# csv path
path = '/content/drive/My Drive/Colab Notebooks/AI/simplified_df.csv'
# convert to a dataframe
df = pd.read_csv(path)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
df.head()
| Unnamed: 0 | Density\n(P/Km2) | Agricultural Land( %) | Land Area(Km2) | Armed Forces size | Co2-Emissions | CPI | CPI Change (%) | Forested Area (%) | Gasoline Price | GDP | Gross primary education enrollment (%) | Gross tertiary education enrollment (%) | Infant mortality | Minimum wage | Out of pocket health expenditure | Physicians per thousand | Population: Labor force participation (%) | Tax revenue (%) | Total tax rate | Unemployment rate | Urban_population | Latitude | Longitude | Deforestation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2.944439 | 17.4 | 14.683343 | 12.666660 | 150006.0 | 151.36 | 2.000 | 0.8 | 0.28 | 25.858995 | 109.9 | 51.4 | 3.049273 | 0.667829 | 28.1 | 1.72 | 41.2 | 37.2 | 66.100 | 11.70 | 17.265819 | 28.033886 | 1.659626 | 9.067739 |
| 1 | 1 | 3.295837 | 47.5 | 14.036011 | 11.669938 | 34693.0 | 261.73 | 17.100 | 46.3 | 0.97 | 25.273298 | 113.5 | 9.3 | 3.962716 | 0.536493 | 33.4 | 0.21 | 77.5 | 9.2 | 49.100 | 6.89 | 16.862935 | -11.202692 | 17.873887 | 13.143030 |
| 2 | 2 | 2.890372 | 54.3 | 14.838106 | 11.561725 | 201348.0 | 232.75 | 53.375 | 9.8 | 1.10 | 26.831765 | 109.7 | 90.0 | 2.282382 | 1.470176 | 17.6 | 3.96 | 61.3 | 10.1 | 104.955 | 9.79 | 17.537331 | -38.416097 | -63.616672 | 12.182297 |
| 3 | 3 | 4.700480 | 32.4 | 11.337047 | 9.952325 | 61448.0 | 118.06 | 1.500 | 46.9 | 1.20 | 26.824290 | 103.1 | 85.1 | 1.360977 | 2.311148 | 17.9 | 5.17 | 60.7 | 25.4 | 51.400 | 4.67 | 15.463095 | 47.516231 | 14.550072 | 8.543056 |
| 4 | 4 | 7.714231 | 11.1 | 6.641182 | 9.852247 | 31694.0 | 117.59 | 2.100 | 0.8 | 0.43 | 24.375846 | 99.4 | 50.5 | 1.960095 | 1.213725 | 25.1 | 0.93 | 73.4 | 4.2 | 13.800 | 0.71 | 14.198805 | 26.066700 | 50.557700 | 0.000000 |
To transition our target variable into a categorical classification (High Risk vs. Low Risk), we first researched real-world ecological baselines. According to recent studies [1], a country is considered at "critical risk" when its annual deforestation rate exceeds 0.501% of its land. The research also notes that applying this real-world threshold will naturally result in class imbalance. This is because global deforestation is highly unequal: the millions of hectares lost annually are driven by a small concentration of countries (due to economic drivers, lax legislation, and massive existing forest areas), compared to desert or highly developed nations with minimal forest loss. Based on this, we need to calculate the actual deforestation percentages for our dataset and compare them against statistical thresholds. To achieve this, we will follow these methodological steps:
- As seen in our earlier .head() exploration, the Deforestation variable and Land Area (Km2) are log-transformed. We must first apply the exponential function to revert these values back to their absolute "normal" scale.
- We have to express the forested area (Km²) and the deforestation (hectares) in the same unit, so we will convert Km² to hectares by multiplying by 100.
- We will generate our new target variable by dividing the true hectares of deforestation by the total forested land area in hectares for each country (obtained by multiplying Forested Area (%) by the total Land Area (Km2)), then multiplying by 100 to get the percentage.
- Finally, we will run the .describe() function on this new percentage variable. By comparing our dataset's empirical median and 75th percentile against the 0.501% literature threshold, we will make an informed, data-driven decision on the final cutoff for our Logistic Regression model.
# Convert Forested Area to a true decimal (e.g., 48.0 becomes 0.48)
df['Forested_Area_Decimal'] = df['Forested Area (%)'].astype(float) / 100
# Revert the log transformation for Deforestation and Land Area!
df['Deforestation_Hectares'] = np.exp(df['Deforestation'])
df['Land_Area_Real_Km2'] = np.exp(df['Land Area(Km2)'])
# Transform real Km2 to Hectares (1 Km2 = 100 Hectares)
df['Land_Area_Hectares'] = df['Land_Area_Real_Km2'] * 100
# Calculate the actual Forest Area in hectares
df['Total_Forest_Hectares'] = df['Land_Area_Hectares'] * df['Forested_Area_Decimal']
# Create the new variable of % forested area deforested
df['Deforestation_Rate_%'] = (df['Deforestation_Hectares'] / df['Total_Forest_Hectares']) * 100
display(df[['Deforestation_Hectares', 'Total_Forest_Hectares', 'Deforestation_Rate_%']].head())
| Deforestation_Hectares | Total_Forest_Hectares | Deforestation_Rate_% | |
|---|---|---|---|
| 0 | 8671.0 | 1905393.6 | 0.455077 |
| 1 | 510441.0 | 57722256.3 | 0.884305 |
| 2 | 195301.0 | 27247929.8 | 0.716755 |
| 3 | 5131.0 | 3933596.8 | 0.130440 |
| 4 | 1.0 | 612.8 | 0.163185 |
df['Deforestation_Rate_%'].describe().round(5)
| Deforestation_Rate_% | |
|---|---|
| count | 106.00000 |
| mean | inf |
| std | NaN |
| min | 0.00000 |
| 25% | 0.01994 |
| 50% | 0.13507 |
| 75% | 0.78236 |
| max | inf |
Upon calculating the new Deforestation_Rate_% column, the initial statistical summary returned a mean of inf (infinity) and a standard deviation of NaN. This anomaly indicates a division-by-zero error, which naturally occurs because certain countries (e.g., desertic nations) can have a 0% forest cover.
Since it is ecologically impossible to measure the deforestation rate of a country with no forests, these mathematical edge cases must be handled before continuing. In the following cell, we will remove these countries from the analysis (which eliminates the infinite values at their source) and generate a clean, rounded statistical summary to help us define our final threshold.
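The filtering in the next cell removes these rows at the source; an equivalent pandas idiom replaces the infinities with NaN and then drops them. A minimal sketch on a toy column (the values are illustrative, not from the dataset):

```python
import numpy as np
import pandas as pd

# Toy stand-in for the Deforestation_Rate_% column, with two division-by-zero infinities
toy = pd.DataFrame({'Deforestation_Rate_%': [0.45, np.inf, 0.13, np.inf, 0.70]})

# Replace +/-inf with NaN, then drop the now-null rows
cleaned = toy.replace([np.inf, -np.inf], np.nan).dropna(subset=['Deforestation_Rate_%'])
print(len(cleaned))  # the two infinite rows are gone
```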
# Drop the rows representing countries with no forest to deforest
df = df[df['Forested Area (%)'] > 0.000]
# Output the cleaned and rounded statistical summary
df['Deforestation_Rate_%'].describe().round(5)
| Deforestation_Rate_% | |
|---|---|
| count | 103.00000 |
| mean | 0.45276 |
| std | 0.64163 |
| min | 0.00000 |
| 25% | 0.01814 |
| 50% | 0.13044 |
| 75% | 0.70634 |
| max | 2.62770 |
To prevent data leakage, it's crucial to remove variables that are either direct components of the target variable (Deforestation_Rate_%) or the ones highly correlated and could provide the model with extra information about the target. We will thus drop:
- Land Area(Km2)
- Deforestation
- Deforestation_Hectares
- Land_Area_Real_Km2
- Land_Area_Hectares
- Forested Area (%)
- Forested_Area_Decimal
- Total_Forest_Hectares
Dropping these columns ensures that our model learns patterns from independent features rather than directly from components of the target, leading to a more robust and generalizable model.
columns_to_drop = [
'Land Area(Km2)',
'Deforestation',
'Deforestation_Hectares',
'Land_Area_Real_Km2',
'Land_Area_Hectares',
'Forested Area (%)',
'Forested_Area_Decimal',
'Total_Forest_Hectares'
]
df = df.drop(columns=columns_to_drop)
# Display the remaining columns to verify
print(df.columns.tolist())
['Unnamed: 0', 'Density\n(P/Km2)', 'Agricultural Land( %)', 'Armed Forces size', 'Co2-Emissions', 'CPI', 'CPI Change (%)', 'Gasoline Price', 'GDP', 'Gross primary education enrollment (%)', 'Gross tertiary education enrollment (%)', 'Infant mortality', 'Minimum wage', 'Out of pocket health expenditure', 'Physicians per thousand', 'Population: Labor force participation (%)', 'Tax revenue (%)', 'Total tax rate', 'Unemployment rate', 'Urban_population', 'Latitude', 'Longitude', 'Deforestation_Rate_%']
Following the removal of 3 rows with 0% forested area (e.g., desertic nations where deforestation cannot be measured), our dataset was refined to 103 countries. The .describe() function reveals critical insights into the distribution of our new Deforestation_Rate_% variable.
Observations:
- Right-Skewed Distribution: The data is heavily right-skewed. The 50th percentile is very low at 0.13%, while the mean is pulled much higher to 0.45% by extreme outlier nations reaching up to a maximum deforestation rate of 2.62%.
- Threshold: The literature critical threshold of 0.501% sits between the median (0.13%) and the 75th percentile (0.71%), roughly around the 70th percentile of the data.
While using a statistical cutoff like the median would guarantee a perfectly balanced 50/50 class distribution, doing so would classify countries with benign deforestation rates (e.g., 0.15%) as "High Risk," which is ecologically inaccurate. Alternatively, splitting at the mean (0.45%) or 75th percentile (0.70%) provides statistical convenience but lacks external scientific validation. Therefore, we will adopt the rigorous 0.501% literature threshold. Because this value falls roughly around the 70th percentile of our dataset, it will naturally yield a moderately imbalanced target variable (approximately 70% Class 0 vs. 30% Class 1). This imbalance accurately reflects the real-world reality that severe deforestation is driven by a minority of high-impact nations. To ensure our Logistic Regression model learns effectively from this minority class, we will counter the imbalance algorithmically using the class_weight='balanced' parameter.
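What class_weight='balanced' does internally can be sketched numerically. scikit-learn assigns each class the weight n_samples / (n_classes * count_c), so the minority class gets a proportionally larger weight. The counts below mirror the training split reported later in the notebook (57 low-risk vs. 25 high-risk) and are used only for illustration:

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Illustrative labels mirroring the training split (57 class 0, 25 class 1)
y_demo = np.array([0] * 57 + [1] * 25)

# sklearn's balanced weights vs. the manual formula n / (k * count_c)
weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_demo)
manual = len(y_demo) / (2 * np.bincount(y_demo))

print(weights.round(3))  # minority class weight is 82 / (2 * 25) = 1.64
```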
Now, it is imperative to finally create the binary target using the literature threshold of 0.501% and drop Deforestation_Rate_% (our helper variable).
df['Deforestation_Critical'] = (df['Deforestation_Rate_%'] > 0.501).astype(int)
df.drop(columns=['Deforestation_Rate_%'], inplace=True)
df['Deforestation_Critical'].head()
| Deforestation_Critical | |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 1 |
| 3 | 0 |
| 4 | 0 |
Data Splitting¶
While a 70/30 split is the common convention, an 80/20 ratio (test_size=0.2) was deliberately chosen for this analysis due to the small size of the dataset ($n=103$ countries). In small-sample scenarios, particularly those with class imbalance, a larger training volume is critical to ensure the Logistic Regression algorithm gets sufficient examples of the minority class (High Risk) to converge accurately. An unstratified random split could also inadvertently place a disproportionate share of the minority class samples into the test set, leaving the model with few critical examples to learn from during training.
We will begin by defining the target classification variable (y) as Deforestation_Critical along with the features as the rest of the data (X).
# We drop the target AND all the temporary math columns we created so the model doesn't cheat
y = df['Deforestation_Critical']
X = df.drop(columns=['Deforestation_Critical'])
Now, in order to do the best train/test split, a Stratified Split (stratify=y) preserves the class proportions in both subsets. In other words, stratification ensures that the proportion of High-Risk to Low-Risk countries is preserved across the original dataset, the training set, and the testing set. Maintaining this consistency is critical in classification problems, as it ensures the model is trained on a representative sample of reality and evaluated on a scaled-down version of that same reality.
This will be done using the train_test_split function from sklearn, feeding it the parameters of random_state = 42, test_size=0.2 and the stratify=y. After splitting the dataset, it is important to verify that the stratification successfully preserved our class distribution. We accomplish this by calculating the value_counts() for the target variable ($y$) across the Original, Training, and Testing sets. By using the normalize=True, we convert absolute row counts into relative percentages, allowing for a direct, apples-to-apples comparison across datasets of different sizes. The output below confirms that the split was consistent, the moderate-risk class (0) maintains a stable proportion of approximately 69% - 72% across all sets, while the critical-risk class (1) is perfectly preserved at approximately 28% - 31%.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# revision of class balance
print("Original Data :", y.value_counts(normalize=True).round(3))
print("\n2. Training Set :", y_train.value_counts(normalize=True).round(3))
print("\n3. Testing Set :", y_test.value_counts(normalize=True).round(3))
Original Data : Deforestation_Critical
0    0.699
1    0.301
Name: proportion, dtype: float64

2. Training Set : Deforestation_Critical
0    0.695
1    0.305
Name: proportion, dtype: float64

3. Testing Set : Deforestation_Critical
0    0.714
1    0.286
Name: proportion, dtype: float64
Logistic Regression on Training Data & Cross Fold Validation¶
A fundamental requirement for Logistic Regression is feature scaling. StandardScaler() standardizes each feature's magnitude so that the logistic sigmoid function maps probabilities smoothly, complementing the log-transformations already applied to normalize the distribution shape.
To prevent data leakage during Cross-Validation, both the scaler and the model are wrapped into a single Pipeline using make_pipeline(). This is critical: if we were to scale all of X_train upfront and then run CV on the already-scaled data, each validation fold's statistics would have already been "seen" by the scaler when it computed the mean and standard deviation — making CV results artificially optimistic. With our small dataset (n≈82 training samples), this effect is especially pronounced.
By using a Pipeline, sklearn's CV engine automatically fits the scaler only on each fold's training portion and transforms the validation portion using those parameters, true out-of-fold evaluation. For the final model evaluation, the Pipeline is simply fitted on the full X_train and applied to X_test, maintaining the same strict separation. The model is initialized with class_weight='balanced' to handle the natural class imbalance.
# Wrap scaler + model into a Pipeline so CV fits the scaler inside each fold (no leakage)
pipe = make_pipeline(
StandardScaler(),
LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
)
Before testing the model on the unseen Test set, we must estimate its expected performance using only the Training data. Relying on a single Train-Test split can be deceptive because if the split happens to be "lucky" or "unlucky," the resulting accuracy metric will not represent the model's true capability. To obtain a better estimator, we implemented Stratified K-Fold Cross-Validation ($k=5$) on the training data. This methodology divides the training set into 5 equal folds (maintaining class proportions in each). The Logistic Regression model is then trained on 4 folds and validated on the 1 remaining fold, iterating this process 5 times.
The key utility of Cross-Validation is that it gives us high statistical confidence in how the model will generalize to completely unseen data.
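The fold-by-fold evaluation described above can be illustrated with cross_val_score, which returns one score per fold so both the mean and the spread are visible. The sketch below uses synthetic data from make_classification as a stand-in for the real X_train/y_train used in the actual cells:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic imbalanced dataset mimicking our training set's size and 70/30 balance
X_demo, y_demo = make_classification(n_samples=82, n_features=10,
                                     weights=[0.7, 0.3], random_state=42)

demo_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
)
cv_demo = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One F1 score per fold: the spread shows how stable the estimate is
scores = cross_val_score(demo_pipe, X_demo, y_demo, cv=cv_demo, scoring='f1')
print(scores.round(3), scores.mean().round(3))
```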
# Cross-Validation (5-Folds) on train Data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
Because our defined threshold (0.501%) introduces a natural class imbalance (~70% Class 0 vs. 30% Class 1), relying solely on raw accuracy during cross-validation is misleading. An algorithm could achieve high baseline accuracy simply by predicting the majority class, thereby failing its primary objective of identifying high-risk countries.
To robustly evaluate the model, cross_val_predict is called on the Pipeline (pipe) with unscaled X_train. Passing the full pipeline ensures the scaler is re-fitted inside each fold — not once on the entire training set. The method='predict_proba' argument generates continuous out-of-fold probability scores for each country. We isolate the probability of Class 1 and apply a standard threshold of 0.5 as the baseline, focusing on the following metrics for the minority class:
- Recall: To ensure we are not missing critical deforesting nations (minimizing False Negatives).
- Precision: To evaluate the reliability of our positive predictions (minimizing False Positives).
- F1-Score: To confirm that
class_weight='balanced'successfully maintained equilibrium between Precision and Recall.
# Generate 'out-of-fold' predictions using cross-validation
y_train_pred = cross_val_predict(pipe, X_train, y_train, cv=cv, n_jobs=-1, method='predict_proba')
# The 'predict_proba' method returns probabilities for both classes.
# We are interested in the probability of the positive class (class 1) which is the second column.
y_train_pred_class1_proba = y_train_pred[:, 1]
# Convert probabilities to binary predictions using a threshold (the 0.5 probability cutoff, not the ecological one)
y_train_pred_binary = (y_train_pred_class1_proba > 0.5).astype(int)
# Display the Classification Report
print("\n--- Classification Report (Cross-Validation) ---")
print(classification_report(y_train, y_train_pred_binary))
--- Classification Report (Cross-Validation) ---
precision recall f1-score support
0 0.86 0.77 0.81 57
1 0.58 0.72 0.64 25
accuracy 0.76 82
macro avg 0.72 0.75 0.73 82
weighted avg 0.78 0.76 0.76 82
# Visualize the Cross-Validation Confusion Matrix as a heatmap
cm_cv = confusion_matrix(y_train, y_train_pred_binary)
cm_cv_df = pd.DataFrame(cm_cv, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
plt.figure(figsize=(6, 5))
sns.heatmap(cm_cv_df, annot=True, fmt='d', cmap='Oranges', cbar=False, linewidths=.5)
plt.title('Confusion Matrix — Cross-Validation (Threshold = 0.5)')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
Based on the confusion matrix, it is observed that the model is highly effective at identifying countries that are not in a critical state (44 True Negatives), while also successfully catching 18 of the true high-risk countries (True Positives). The model only missed 7 high-risk countries (False Negatives), though it achieved this low miss rate by raising the alarm for 13 false positives.
However, achieving a recall of 72% for Class 1 on an imbalanced dataset, especially using an ecologically defined threshold rather than a purely statistical median, is a success! Catching more than 70% of the critical cases proves that the class_weight='balanced' parameter and the stratification are working well.
Overall, these cross-validation results on our training set ($n=82$) demonstrate that the Logistic Regression model successfully manages the inherent class imbalance. When the model predicts a country is at High Risk, it is correct 58% of the time (Precision). While there is a trade-off here (generating 13 false alarms), this is ecologically acceptable; over-monitoring a safe country is vastly preferable to missing a severe deforestation event. Finally, the F1-score confirms a solid, stable equilibrium between precision and recall, proving that the model has genuinely learned the data patterns and is not arbitrarily guessing the majority class.
Different Threshold¶
By default, Logistic Regression uses a 0.5 (50%) probability threshold to separate the classes. However, because the dataset is imbalanced and the cost of missing a deforesting nation (False Negative) is much higher than investigating a safe one (False Positive), the boundary will be shifted to see new behaviours and test on the matrix once more.
Using the out-of-fold probabilities generated during Cross-Validation (predict_proba), the model's performance can be simulated across multiple strictness levels. Lowering the threshold (e.g., to 0.3) makes the model more sensitive, prioritizing Recall. Raising it (e.g., to 0.7) makes the model more conservative, prioritizing Precision. Analyzing this trade-off helps to optimize the threshold for specific objectives before final testing. In this case, a loop through three different thresholds shows the behaviour and metrics at 0.3, 0.4 and 0.6.
threshs = [0.3, 0.4, 0.6]
for t in threshs:
    # 1. Apply the new threshold to our existing CV probabilities
y_pred_custom_threshold = (y_train_pred_class1_proba >= t).astype(int)
# 2. Extract specific metrics to see the trade-off clearly
report = classification_report(y_train, y_pred_custom_threshold, output_dict=True, zero_division=0)
recall_class1 = report['1']['recall']
precision_class1 = report['1']['precision']
f1_class1 = report['1']['f1-score']
# 3. Print a clean summary for each threshold
print(f"For T: {t} Recall (Class 1): {recall_class1:.2f} | Precision: {precision_class1:.2f} | F1-Score: {f1_class1:.2f}")
For T: 0.3 Recall (Class 1): 0.88 | Precision: 0.52 | F1-Score: 0.66
For T: 0.4 Recall (Class 1): 0.76 | Precision: 0.56 | F1-Score: 0.64
For T: 0.6 Recall (Class 1): 0.64 | Precision: 0.64 | F1-Score: 0.64
A comparative analysis of the evaluation metrics reveals that the default probability threshold (0.5) is too conservative for our project's specific ecological objectives. While lowering the boundary to 0.4 showed improvement, shifting it further to 0.3 provided the most powerful results for an environmental early-warning system. By moving to a 0.3 threshold, we observed a massive leap in the model's sensitivity. The Recall for the minority class (High Risk) surged from 0.72 (at the default) to a highly effective 0.88. This means the algorithm is now capable of catching 88% of all critical deforesting nations.
Naturally, casting a wider net means generating more false alarms, which caused our Precision to decrease to 0.52. However, in the context of environmental protection, this is a highly favorable and necessary trade-off. The administrative cost of accidentally investigating a safe country (a False Positive) is negligible compared to the devastating, irreversible ecological cost of missing a true deforestation crisis (a False Negative).
Furthermore, the overall harmonic mean (F1-Score) peaked at 0.66 (up from 0.64 at the default). This proves that selecting the 0.3 threshold is not only the most ecologically responsible choice, but also mathematically robust. Therefore, we will adopt the 0.3 decision boundary for our final test set evaluation.
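The manual three-threshold sweep above can be generalized: scikit-learn's precision_recall_curve evaluates every candidate cutoff at once, and the F1-maximizing threshold can be read off directly. The probabilities below are illustrative stand-ins for the out-of-fold y_train_pred_class1_proba, not the real values:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Illustrative labels and out-of-fold probabilities (stand-ins, not the real data)
y_true = np.array([0, 0, 0, 0, 1, 0, 1, 1, 0, 1])
y_prob = np.array([0.1, 0.2, 0.25, 0.35, 0.4, 0.45, 0.6, 0.7, 0.75, 0.9])

# One (precision, recall) pair per candidate threshold
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)

# F1 at each threshold; guard the denominator against division by zero
f1 = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)
best = np.argmax(f1[:-1])  # the final precision/recall pair has no threshold
print(f"best threshold = {thresholds[best]:.2f} (F1 = {f1[best]:.2f})")
```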
Final Testing¶
Having validated our model's stability through Cross-Validation, the final step is to train the Logistic Regression algorithm on the complete training dataset and evaluate it against the strictly isolated 20% Test Set. This serves as the ultimate measure of the model's generalization capabilities on completely unseen data.
To do this, we will begin by fitting the pipeline on the entire training set with the .fit() function. Then we will make predictions on the test set using predict_proba() rather than plain predict(), because our custom 0.3 threshold must be applied to the raw probabilities. The predict_proba() method tells us exactly how confident the model is in each prediction (a value between 0 and 1).
# Train the pipeline (scaler + model) on the full training set, evaluate on test
pipe.fit(X_train, y_train)
y_test_prob = pipe.predict_proba(X_test)[:, 1]
# If the probability is 30% or higher, predict 1 (High Risk). Otherwise, 0.
y_test_pred_03 = (y_test_prob >= 0.3).astype(int)
# Print the core metrics (For Class 1: High Risk)
print("--- Final Test Set Results ---")
print(f"Recall: {recall_score(y_test, y_test_pred_03):.3f} (Did we catch the deforesting nations?)")
print(f"Precision: {precision_score(y_test, y_test_pred_03):.3f} (Are our alarms accurate?)")
print(f"F1-Score: {f1_score(y_test, y_test_pred_03):.3f} (The overall balance)")
--- Final Test Set Results ---
Recall: 0.667 (Did we catch the deforesting nations?)
Precision: 0.667 (Are our alarms accurate?)
F1-Score: 0.667 (The overall balance)
Confusion Matrix
The following heatmap provides a much clearer visual representation of the confusion matrix. This was coded with the help of matplotlib and seaborn using the confusion matrix metrics from sklearn. The numbers inside the cells show the exact counts. The following is the label data classification:
- Actual Label (Y-axis): Represents the true classes of the data (0 for Low Risk, 1 for High Risk).
- Predicted Label (X-axis): Represents the classes predicted by the model (0 for Low Risk, 1 for High Risk).
# Calculate the confusion matrix for the test set (using the 0.3 threshold)
cm = confusion_matrix(y_test, y_test_pred_03)
# Create a DataFrame for better labeling
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
# Plot the confusion matrix as a heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', cbar=False, linewidths=.5)
plt.title('Confusion Matrix for Test Set (Threshold = 0.3)')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
Confusion Matrix Breakdown
The resulting confusion matrix of [[13, 2], [2, 4]] demonstrates strong predictive capability on unseen data.
- The model correctly identified 13 safe, Low-Risk countries.
- The model successfully caught 4 actual High-Risk deforesting nations.
- False Positives (2): The model raised a preventive false alarm on 2 safe countries.
- False Negatives (2): The model missed 2 High-Risk countries.
Because the model generated the exact same number of False Positives (2) as it did False Negatives (2) in this specific test sample, the denominator for both the Precision and Recall formulas becomes identical. As a result, Precision, Recall, and the harmonic F1-Score all perfectly converge at 0.667. Catching roughly 67% of critical nations on completely unobserved data—while keeping false alarms remarkably low—indicates a highly effective early-warning system.
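The convergence described above can be verified with a line of arithmetic rebuilt from the reported confusion matrix [[13, 2], [2, 4]]:

```python
# Cell counts taken from the test-set confusion matrix [[13, 2], [2, 4]]
tn, fp, fn, tp = 13, 2, 2, 4

precision = tp / (tp + fp)  # 4 / 6: same denominator because fp == fn
recall = tp / (tp + fn)     # 4 / 6
f1 = 2 * precision * recall / (precision + recall)

print(round(precision, 3), round(recall, 3), round(f1, 3))  # 0.667 0.667 0.667
```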
Visualization of Roc Curve
As part of the analysis, the ROC Curve is built in the following cell using the model's estimated probabilities on the test set. This curve shows the relationship between the true positive rate and the false positive rate. The orange line (the ROC Curve) tracks the performance of the model across every possible threshold.
This was done using the roc_curve function from sklearn, calculating the AUC from the false positive rate and true positive rate returned by that function.
After plotting, it can be seen that the curve follows a step-like pattern. This is a common characteristic when working with a relatively small test set ($n=21$), where each "step" represents the reclassification of a specific country as the threshold changes. Also, the orange line stays significantly far from the dashed diagonal line (the "random guess" baseline). This distance is the visual proof that the model has successfully captured the underlying patterns driving deforestation.
# We use y_test_prob because the ROC curve needs probabilities, not the 0/1 predictions
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
# Calculate the Area Under the Curve (AUC)
final_auc = auc(fpr, tpr)
# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {final_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier (Baseline)')
# Formatting for the report
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Recall / Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) - Test Set')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()
print(f"Final Area Under the Curve (AUC): {final_auc:.4f}")
Final Area Under the Curve (AUC): 0.8333
The Area Under the Curve (AUC) of 0.833 provides a threshold-independent measure of the model's discriminative quality. Concretely, this means that if we randomly selected one High-Risk country and one Low-Risk country, the model would correctly rank the high-risk one as more dangerous 83.3% of the time, regardless of where the probability threshold is set.
An AUC of 0.833 is considered good (above 0.8) in the machine learning literature, and particularly strong given the small test set size (~21 countries). The step-like shape of the curve is expected with small samples, as each individual prediction moves the curve in discrete jumps rather than a smooth arc. The fact that the curve stays consistently above the diagonal baseline across all threshold values confirms that the model has genuine discriminative power, not just threshold luck.
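This ranking interpretation of the AUC can be demonstrated by computing it pairwise. The sketch below uses toy scores (not the actual model output) and checks that the fraction of correctly ranked (positive, negative) pairs matches sklearn's roc_auc_score:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def pairwise_auc(y_true, y_prob):
    """AUC computed as the fraction of (positive, negative) pairs
    in which the positive example receives the higher score."""
    pos = y_prob[y_true == 1]
    neg = y_prob[y_true == 0]
    wins = (pos[:, None] > neg[None, :]).sum()
    ties = (pos[:, None] == neg[None, :]).sum()  # ties count as half a win
    return (wins + 0.5 * ties) / (len(pos) * len(neg))

# Toy example: 3 positives, 3 negatives
y_true = np.array([0, 0, 1, 1, 0, 1])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])

print(pairwise_auc(y_true, y_prob))          # 8 of 9 pairs ranked correctly ≈ 0.889
print(roc_auc_score(y_true, y_prob))         # identical value from sklearn
```

With a 6-vs-15 class split in our test set, the AUC of 0.833 summarizes 90 such pairwise comparisons, which also explains the coarse, step-like shape of the curve.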
Coefficient Interpretation¶
Having validated and tested the Logistic Regression model, the final analytical step is to interpret the model's learned coefficients. In a Logistic Regression, each coefficient $\beta_i$ represents the change in the log-odds of belonging to Class 1 (High-Risk deforestation) per one-unit increase in the corresponding standardized feature, holding all other features constant.
- A positive coefficient → that feature increases the probability of a country being High-Risk.
- A negative coefficient → that feature decreases the probability of High-Risk classification.
- Magnitude reflects relative importance: features with larger absolute coefficients have a stronger influence on the prediction.
Since StandardScaler() was applied inside the Pipeline, all coefficients are on a comparable scale and can be directly compared against each other.
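To make the log-odds scale more tangible, a coefficient $\beta$ can be converted to an odds ratio via $e^{\beta}$. The sketch below uses $\beta = 0.95$, roughly the fitted weight for Gasoline Price, purely as an illustrative value:

```python
import numpy as np

# A standardized coefficient of beta means a one-standard-deviation increase
# in that feature multiplies the odds of High-Risk classification by exp(beta),
# holding all other features constant.
beta = 0.95  # illustrative value, roughly the fitted weight for Gasoline Price
odds_ratio = np.exp(beta)
print(f"Odds ratio per +1 SD: {odds_ratio:.2f}")  # ≈ 2.59
```

So a feature with a standardized coefficient near 0.95 roughly 2.6-times the odds of a High-Risk label per standard deviation, a far stronger effect than one near 0.03.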
# Extract the fitted logistic regression step from the pipeline
fitted_lr = pipe.named_steps['logisticregression']
feature_names = X.columns.tolist()
# Build a sorted DataFrame of coefficients
coef_df = pd.DataFrame({
'Feature': feature_names,
'Coefficient': fitted_lr.coef_[0]
}).sort_values('Coefficient', ascending=False).reset_index(drop=True)
print(coef_df.to_string(index=False))
# Visualization
colors = ['#d73027' if c > 0 else '#4575b4' for c in coef_df['Coefficient']]
plt.figure(figsize=(10, 6))
plt.barh(coef_df['Feature'], coef_df['Coefficient'], color=colors)
plt.axvline(0, color='black', linewidth=0.8, linestyle='--')
plt.xlabel('Coefficient Value (Standardized Log-Odds)')
plt.title('Logistic Regression Coefficients')
plt.tight_layout()
plt.show()
Feature Coefficient
Gasoline Price 0.947719
Infant mortality 0.910034
Population: Labor force participation (%) 0.888075
Urban_population 0.851602
CPI 0.802715
Density\n(P/Km2) 0.669438
Agricultural Land( %) 0.616020
Total tax rate 0.533570
Minimum wage 0.492882
Out of pocket health expenditure 0.457337
Physicians per thousand 0.027279
Latitude -0.015242
GDP -0.043343
Gross primary education enrollment (%) -0.104871
Gross tertiary education enrollment (%) -0.232142
Armed Forces size -0.254203
Unemployment rate -0.272496
Unnamed: 0 -0.274781
Co2-Emissions -0.293834
CPI Change (%) -0.467499
Tax revenue (%) -0.732323
Longitude -0.912253
Interpretation of Coefficients
The coefficient bar chart above reveals the direction and relative weight the model assigned to each variable. The following analysis is based on the actual fitted coefficients: positive values increase the log-odds of High-Risk classification, negative values decrease them, and larger absolute values indicate stronger influence.
Variables with the strongest positive coefficients (increase High-Risk probability):
Gasoline Price, Infant mortality, and Population: Labor force participation (%) received the highest positive weights, followed by Urban_population, CPI, and Agricultural Land (%). This pattern is coherent with the framing of the problem: high infant mortality and inflationary pressure are markers of less developed economies where forest clearing remains an accessible source of land and income, while a large agricultural land share reflects a structural incentive to convert forest into crop and pasture area.
Variables with the strongest negative coefficients (decrease High-Risk probability):
Longitude carries the most negative coefficient, a geographic effect that reflects where High-Risk countries happen to cluster rather than a causal driver. Among the socioeconomic indicators, Tax revenue (%), a proxy for state capacity, received the most negative weight, followed by Gross tertiary education enrollment (%) and, more weakly, GDP. The model thus learned that countries with stronger fiscal institutions and higher education levels correlate with lower deforestation risk, which aligns with the well-documented environmental Kuznets curve: as countries develop, they shift toward conservation-oriented policies and less land-intensive economies.
Coherence with the original problem: The pattern of coefficients is highly consistent with the ecological and socioeconomic framing established at the outset. The PNAS study [1] used as the threshold reference explicitly situates critical deforestation in countries with weak governance, high agricultural pressure, and large existing forest estates (all characteristics that map onto the high positive-coefficient features above). The model did not merely memorize statistical patterns; its learned weights align with domain knowledge, lending credibility to the 0.833 AUC and reinforcing that the selected features carry genuine predictive signal for this classification problem.
Cross-Validation vs. Test Set Results¶
Having obtained performance estimates from both the Cross-Validation stage (on training data) and the final evaluation on the independent test set, a direct comparison between these two sets of results is essential to assess the reliability and generalization of the model.
During Cross-Validation (5-fold, threshold = 0.5), the model achieved a Class 1 Recall of 0.72, a Precision of approximately 0.58, and an F1-Score reflecting the challenge of the imbalanced setting. When the threshold was lowered to 0.3 on the CV probabilities, Recall for Class 1 surged to 0.88, confirming that the model has strong sensitivity at lower cutoffs.
On the independent test set (threshold = 0.3), the model correctly identified 4 out of 6 High-Risk countries (Recall ≈ 0.67) while keeping False Positives low (2 out of 15 Low-Risk countries). The final test AUC of 0.833 further corroborates the model's discriminative strength.
Comparison and reflection: The slight drop in Recall between the CV estimate and the test result is expected and healthy: it reflects the natural optimism that any training-set-based estimate carries, even with proper cross-validation. Importantly, the gap is not dramatic, which suggests the model generalizes reasonably well and is not overfitting. The consistency between the two evaluations reinforces that the model's ability to rank countries by risk level translated faithfully from training to unseen data. In a domain like deforestation monitoring (where missing a high-risk country carries real ecological consequences) the 0.3 threshold remains the correct operational choice, accepting the precision trade-off in favor of higher sensitivity.
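The threshold-lowering mechanic behind this comparison can be sketched end to end. The snippet below uses synthetic imbalanced data (not the country dataset) and a pipeline mirroring the one in this project, showing that moving the cutoff from 0.5 to 0.3 can only maintain or raise Class 1 recall, since every sample predicted positive at 0.5 is also positive at 0.3:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import recall_score

# Synthetic imbalanced data standing in for the country dataset
X, y = make_classification(n_samples=100, weights=[0.7, 0.3], random_state=0)

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Out-of-fold probabilities, as in the CV stage of this project
prob = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba')[:, 1]

for t in (0.5, 0.3):
    r = recall_score(y, (prob >= t).astype(int))
    print(f"threshold={t}: Class 1 recall = {r:.2f}")
```

The same monotonicity holds on the real data: lowering the cutoff trades Precision for Recall, which is exactly the trade-off accepted here for the early-warning use case.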
References¶
- Teo, H. C., Sarira, T. V., Tan, A. R. P., Cheng, Y., & Koh, L. P. (2024). Charting the future of high forest low deforestation jurisdictions. Proceedings of the National Academy of Sciences of the United States of America, 121(37), e2306496121. https://doi.org/10.1073/pnas.2306496121