Introduction: Deforestation Risk Classification (LDA vs. Decision Trees)¶

This notebook builds upon our ongoing analysis of the global deforestation dataset. The data has been previously cleaned, preprocessed, and engineered to include a distinct classification variable. This target variable was created based on the probability thresholds established in my previous Logistic Classifier Project.

While the logistic approach provided a strong baseline, our goal here is to put two very different algorithms to the test: Linear Discriminant Analysis (LDA) and Decision Trees. We will compare them head-to-head to evaluate not only their raw predictive power but also their inference power (how well they explain the behavioral "why" behind the risk).

Target Variable and Data Splitting Strategy

  • The Output Variable: Our target is a binary classification representing Deforestation Risk: Class 0 (Low Risk) and Class 1 (High Risk).

  • Class Balance: The dataset is moderately imbalanced, with roughly 70% Low Risk and 30% High Risk countries.

  • Train/Test Split & Stratification: To rigorously evaluate these models, the data is divided into an 80/20 training and testing split. Because of the class distribution, it is critical that this split is performed using stratification (stratify=y).

    • Why is this balance important? Preserving the exact proportion of Low/High Risk classes in both the training and testing subsets ensures a fair evaluation. If we split the data completely randomly, we risk training the model on an unrepresentative sample (e.g., too many "Low Risk" countries). Stratification prevents the model from developing a majority-class bias, ensuring that our final evaluation metrics (like the F1-Score) accurately reflect the model's real-world ability to detect critical deforestation threats.
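As a quick illustration of the point above, here is a sketch on synthetic labels that match the notebook's 72/31 target counts (this is not the real dataset, just a demonstration of why stratify matters):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic stand-in matching the notebook's target counts
# (72 Low Risk, 31 High Risk); dummy feature column for the split API.
y_demo = np.array([0] * 72 + [1] * 31)
X_demo = np.arange(len(y_demo)).reshape(-1, 1)

# Stratified split: both subsets keep the ~70/30 class ratio
_, _, _, y_te_strat = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42, stratify=y_demo
)

print(f"High Risk share in stratified test set: {y_te_strat.mean():.2f}")
```

Without stratify=y_demo, an unlucky random split on a dataset this small could leave the test set with very few, or even zero, High Risk countries, making the evaluation meaningless.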

Now, in this notebook, we will leverage data visualizations, mathematical modeling, performance metrics, and class separation techniques to rigorously evaluate and compare our classification algorithms.

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_score, recall_score,
    f1_score, roc_curve, auc, accuracy_score
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
In [4]:
# csv path
path = './classified_deforestation_df.csv'

# convert to a dataframe
df = pd.read_csv(path)

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

df.head()
Out[4]:
Density\n(P/Km2) Agricultural Land( %) Armed Forces size Co2-Emissions CPI Gasoline Price GDP Gross primary education enrollment (%) Gross tertiary education enrollment (%) Infant mortality Minimum wage Out of pocket health expenditure Physicians per thousand Population: Labor force participation (%) Tax revenue (%) Total tax rate Unemployment rate Urban_population Latitude Longitude Deforestation_Critical
0 2.944439 17.4 12.666660 11.918437 5.026246 0.28 25.858995 109.9 51.4 3.049273 0.667829 28.1 1.72 41.2 37.2 66.100 11.70 17.265819 28.033886 1.659626 0
1 3.295837 47.5 11.669938 10.454322 5.571127 0.97 25.273298 113.5 9.3 3.962716 0.536493 33.4 0.21 77.5 9.2 49.100 6.89 16.862935 -11.202692 17.873887 1
2 2.890372 54.3 11.561725 12.212795 5.454252 1.10 26.831765 109.7 90.0 2.282382 1.470176 17.6 3.96 61.3 10.1 104.955 9.79 17.537331 -38.416097 -63.616672 1
3 4.700480 32.4 9.952325 11.025963 4.779628 1.20 26.824290 103.1 85.1 1.360977 2.311148 17.9 5.17 60.7 25.4 51.400 4.67 15.463095 47.516231 14.550072 0
4 7.714231 11.1 9.852247 10.363914 4.775672 0.43 24.375846 99.4 50.5 1.960095 1.213725 25.1 0.93 73.4 4.2 13.800 0.71 14.198805 26.066700 50.557700 0

To start off, it is always appropriate to define and review the target variable using value_counts(). This lets us understand the class distribution before the train-test split, the stratification strategy, and whether we are dealing with a balanced or imbalanced dataset.

In [5]:
# define target
y = df['Deforestation_Critical']

# check for data balance
y.value_counts()
Out[5]:
Deforestation_Critical
0    72
1    31
Name: count, dtype: int64

The value_counts() output shows a class imbalance we are already familiar with from the previous project — 72 countries are non-critical (0) and 31 are critical (1), roughly a 70/30 split.

This imbalance has two direct consequences: first, it requires the stratify parameter during the train-test split to ensure both sets preserve the same ratio. Second, an 80/20 split is preferred over a more aggressive holdout — keeping more data in training gives the model a better chance of seeing enough positive examples. Both are handled with sklearn's train_test_split().

In [6]:
# Define features (X) as all columns except the target 'Deforestation_Critical'
X = df.drop('Deforestation_Critical', axis=1)

# Perform 80/20 train-test split with stratification and random state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shapes of the resulting datasets to confirm the split
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")
Shape of X_train: (82, 20)
Shape of X_test: (21, 20)
Shape of y_train: (82,)
Shape of y_test: (21,)

A normalized distribution check is done on both y_train and y_test to verify that stratification worked correctly — both sets should reflect the original ~70/30 class ratio.

In [7]:
# Check the distribution of the target in train and test sets
print("\nTarget distribution in y_train:")
print(y_train.value_counts(normalize=True))
print("\nTarget distribution in y_test:")
print(y_test.value_counts(normalize=True))
Target distribution in y_train:
Deforestation_Critical
0    0.695122
1    0.304878
Name: proportion, dtype: float64

Target distribution in y_test:
Deforestation_Critical
0    0.714286
1    0.285714
Name: proportion, dtype: float64

Both y_train and y_test mirror the original ~70/30 distribution, confirming that stratify=y worked as expected.

However, stratification only ensures a representative split — the underlying class imbalance still exists. This means the model may lean towards the majority class (non-critical), which is something to keep in mind when interpreting metrics like accuracy alone.

To visualize the feature distributions, a KDE (Kernel Density Estimate) plot is used for each variable. KDE curves are smoothed density histograms — the higher the peak, the more data points are concentrated in that area.

This is particularly relevant for LDA, which assumes a Gaussian distribution per class. The KDE plots help us check two key LDA assumptions:

  • Separability: if the two class curves are far apart with little overlap, LDA will discriminate well.
  • Equal variance: LDA works best when both classes have similar spread. A large difference in curve width between classes can weaken the linear boundary.
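The equal-variance assumption can also be checked numerically, not just visually: comparing class-wise standard deviations per feature flags variables whose spread differs sharply between classes. A minimal sketch on a synthetic frame (the column names and data here are illustrative, standing in for the real plot_df):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Synthetic stand-in: one feature with similar spread in both classes,
# one feature whose minority class is four times wider
demo = pd.DataFrame({
    'feat_equal_var': np.concatenate([rng.normal(0, 1, 70), rng.normal(2, 1, 30)]),
    'feat_unequal_var': np.concatenate([rng.normal(0, 1, 70), rng.normal(2, 4, 30)]),
    'Risk_Level': [0] * 70 + [1] * 30,
})

# Ratio of class-wise standard deviations per feature; values far from 1
# flag features that strain LDA's shared-covariance assumption
stds = demo.groupby('Risk_Level').std()
ratio = stds.loc[1] / stds.loc[0]
print(ratio)
```

A ratio near 1 is what LDA wants; the second feature's ratio near 4 would show up in a KDE plot as one class curve much wider than the other.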
In [8]:
import math

# Combine X_train and y_train for plotting
plot_df = X_train.copy()
plot_df['Risk_Level'] = y_train

# Get list of features
features = X_train.columns
n_features = len(features)
rows = math.ceil(n_features / 4)

# Create the Grid
fig, axes = plt.subplots(rows, 4, figsize=(15, rows * 4))
axes = axes.flatten()

for i, col in enumerate(features):
    sns.kdeplot(
        data=plot_df,
        x=col,
        hue='Risk_Level',
        fill=True,
        palette='RdYlGn_r',
        common_norm=False,
        ax=axes[i]
    )
    axes[i].set_title(f'Distribution of {col}')
    axes[i].set_xlabel('')

# Remove empty subplots if any
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()
[Figure: grid of KDE plots, one per feature, colored by Risk_Level]

The initial analysis of the 20 feature distributions reveals that most individual variables exhibit significant overlap between the two risk classes. This suggests that no single variable can solve the classification problem alone.

Key Observations

  • Variables such as Gross Tertiary Education Enrollment, Infant Mortality, and Physicians per Thousand show the clearest displacement of peaks. These features likely carry the most discriminative information for the models.
  • Longitude, Physicians per Thousand, and Minimum Wage exhibit multiple peaks. This confirms that our classes most likely have sub-groups of countries (e.g., geographic clusters or distinct economic structures) within the same risk category.
  • Features like Unemployment Rate, Density, and Agricultural Land (%) show nearly identical distributions for both classes. In a univariate sense, these are "weak" predictors.

Before fitting LDA, it is good practice to check for multicollinearity. One of LDA's core assumptions is that features are not highly correlated — multicollinearity affects the stability of the within-class scatter matrices and can cause numerical issues when LDA computes its inverse.

A correlation audit is performed using a threshold of |r| > 0.80. Any pair above that threshold is flagged for removal, with self-correlations filtered out to avoid false positives.

In [9]:
# Calculate the correlation matrix for X_train
correlation_matrix = X_train.corr()

# Create a list of highly correlated pairs (|r| > 0.80)
high_corr = correlation_matrix.unstack()
high_corr = high_corr[abs(high_corr) > 0.80]
high_corr = high_corr[high_corr < 1] # Remove self-correlation

print("Highly Correlated Pairs to Investigate:")
print(high_corr.sort_values(ascending=True).drop_duplicates())
Highly Correlated Pairs to Investigate:
Gross tertiary education enrollment (%)  Infant mortality   -0.856807
Co2-Emissions                            Urban_population    0.822963
                                         GDP                 0.918253
dtype: float64

To decide which variable to drop from each correlated pair, the KDE distributions are used as a guide — the variable with better class separation and a more Gaussian shape is retained.

In our correlation audit, CO2-Emissions exhibited high collinearity with both GDP ($r=0.91$) and Urban_population ($r=0.82$). Since CO2 emissions are an industrial byproduct heavily dictated by economic output and population size, the variable was considered redundant. We chose to retain GDP and Urban_population as they represent more direct socio-economic pressures on land use, while removing CO2-Emissions to ensure the numerical stability of the LDA covariance matrices.

Secondly, a high inverse correlation ($r = -0.85$) was identified between Infant Mortality and Gross Tertiary Education Enrollment. We retained Infant Mortality because its class-conditional distributions show a more Gaussian structure with distinct, well-separated means — Tertiary Education showed two peaks per class, making it less stable and a worse fit for LDA's probabilistic assumptions.

In [10]:
# Train : drop co2.emissions and Gross Tertiary Education Enrollment
X_train.drop(columns=['Co2-Emissions', 'Gross tertiary education enrollment (%)'], inplace=True)

# Test : drop co2.emissions and Gross Tertiary Education Enrollment
X_test.drop(columns=['Co2-Emissions', 'Gross tertiary education enrollment (%)'], inplace=True)

After dropping the highly correlated features (CO2-Emissions and Gross Tertiary Education), we ran a second correlation audit to verify the "cleanliness" of the data. The second audit returned no results, which is exactly what we wanted. It proves that our "surgery" worked and there are no longer any feature pairs with a correlation above $|r| > 0.80$. This outcome confirms that every variable left in the model now provides unique, independent information. We aren't "double-counting" the same socio-economic trends.

In [11]:
# create a second revision on the correlation matrix
correlation_matrix_2 = X_train.corr()

# Create a list of highly correlated pairs (|r| > 0.80)
high_corr = correlation_matrix_2.unstack()
high_corr = high_corr[abs(high_corr) > 0.80]
high_corr = high_corr[high_corr < 1] # Remove self-correlation

print("Highly Correlated Pairs to Investigate:")
print(high_corr.sort_values(ascending=True).drop_duplicates())
Highly Correlated Pairs to Investigate:
Series([], dtype: float64)

With the correlated features removed, the next step is to apply StandardScaler() to the data. LDA is sensitive to feature scale — variables measured in large units (e.g., population size) would otherwise dominate the scatter matrices over variables measured in small units (e.g., physicians per thousand). Scaling ensures every feature contributes on equal footing.

Note: the scaler is fit only on X_train and then applied to both sets, preventing any data leakage from the test set.

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Before fitting the final model, a baseline LDA is run on the full scaled training data. This serves two purposes:

  1. Understand the discriminant weights ($w$): the scalings reveal which variables the model leans on most. If Infant Mortality has a weight of 2.5 and GDP has a weight of 0.1, the model is effectively ignoring GDP — a strong signal for feature pruning.
  2. Visual separability check: plotting the LD1 scores shows whether the two classes form distinct clusters or still overlap significantly, guiding further cleaning if needed.
In [13]:
# initialize the LDA
lda = LinearDiscriminantAnalysis()

# fit LDA
lda.fit(X_train_scaled, y_train)

# visualize the discriminants
X_lda = lda.transform(X_train_scaled)
plt.scatter(X_lda[:, 0], np.zeros_like(X_lda[:, 0]), c=y_train)
plt.title('LDA Projection')
plt.xlabel('LD1')
plt.show()
[Figure: LDA Projection, 1-D scatter of LD1 scores colored by class]

To investigate the slight overlap seen in the LD1 projection, the discriminant weights (scalings) are extracted from the baseline model. lda.scalings_ contains the coefficient each feature received in the new LD1 equation — the higher the absolute weight, the more that variable contributes to separating the two classes.

The features are sorted by absolute weight in both directions to identify the top discriminators (candidates to keep) and the least discriminating variables (candidates for pruning).

In [14]:
# Extract scalings and put them in a DataFrame
scalings = pd.DataFrame(lda.scalings_, index=X_train.columns, columns=['LD1'])

# Sort by absolute value
scalings['Abs_Weight'] = scalings['LD1'].abs()
scalings_sorted = scalings.sort_values(by='Abs_Weight', ascending=False)
scalings_sorted_down = scalings.sort_values(by='Abs_Weight', ascending=True)

print("Top Discriminatory Variables:")
print(scalings_sorted[['LD1']].head(7))

print("\nLeast Discriminatory Variables:")
print(scalings_sorted_down[['LD1']].head(7))
Top Discriminatory Variables:
                                                LD1
Infant mortality                           1.456415
Minimum wage                               0.624716
Urban_population                           0.551531
Population: Labor force participation (%)  0.519285
Longitude                                 -0.489309
Physicians per thousand                    0.405828
Density\n(P/Km2)                           0.394014

Least Discriminatory Variables:
                                             LD1
Gross primary education enrollment (%) -0.044174
Unemployment rate                      -0.070120
Latitude                                0.141279
GDP                                    -0.163004
Out of pocket health expenditure        0.172323
Total tax rate                          0.177278
Agricultural Land( %)                   0.209706

The baseline LDA fit revealed a clear hierarchy of importance among the variables:

  • Primary driver — Infant Mortality ($w = 1.46$): its high positive weight confirms that lower levels of basic social development are the strongest single indicator of deforestation risk.
  • Secondary drivers — Minimum Wage ($0.62$), Urban Population ($0.55$), and Labor Force Participation ($0.52$) represent the demographic and economic pressures on land use.
  • Longitude ($-0.49$) adds a meaningful geographic component, suggesting risk is regionally clustered rather than randomly distributed.

To improve simplicity and reduce overfitting, features with near-zero weights are removed: Gross Primary Education Enrollment ($w = -0.04$) and Unemployment Rate ($w = -0.07$) provide no lift to the discriminant function. Additionally, GDP ($w = -0.16$) and Latitude ($w = 0.14$) sit at the bottom of the weight ranking and are also dropped. Reducing the number of features relative to our sample size makes the LDA covariance matrix more robust and focuses the model on its strongest signals.

In [15]:
# Drop
X_train_reduced = X_train.drop(columns=['Gross primary education enrollment (%)', 'Unemployment rate', 'GDP', 'Latitude'])
X_test_reduced = X_test.drop(columns=['Gross primary education enrollment (%)', 'Unemployment rate', 'Latitude', 'GDP'])

# Re-scaling the reduced feature set
scaler = StandardScaler()

X_train_cleaned = scaler.fit_transform(X_train_reduced)
X_test_cleaned = scaler.transform(X_test_reduced)

print(f"Features reduced from {X_train.shape[1]} to {X_train_reduced.shape[1]}")
Features reduced from 18 to 14
In [16]:
# Initialize and Fit
lda_opt = LinearDiscriminantAnalysis()
lda_opt.fit(X_train_cleaned, y_train)

# Predictions
y_pred = lda_opt.predict(X_test_cleaned)

# Metrics
print("--- Optimized LDA Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
--- Optimized LDA Performance ---
Accuracy: 0.8571
Precision: 0.8000
Recall: 0.6667
F1-Score: 0.7273

Confusion Matrix:
[[14  1]
 [ 2  4]]

Metric Breakdown

Accuracy (86%) and Precision (80%) are strong, but given the class imbalance they alone don't tell the full story. The more important metric in this domain is Recall (67%) as it tells us how many truly critical deforestation countries were actually caught. A False Negative here means a high-risk area goes undetected and untreated, which carries a much higher ecological cost than a False Positive. The F1-Score (0.73) reflects a reasonable but imperfect balance between the two. The ROC curve below provides a more complete picture of the model's discriminating power across all thresholds.

Confusion Matrix Breakdown:

The matrix [[14, 1], [2, 4]] tells us that out of 21 test countries:

  • 14 True Negatives: Non-critical countries correctly identified as safe — the model is very reliable here.
  • 4 True Positives: High-risk countries correctly flagged.
  • 1 False Positive: One safe country was unnecessarily flagged — a low-cost error (the forest simply gets an extra check).
  • 2 False Negatives: Two critical deforestation countries were missed entirely. In an environmental context these are the most dangerous errors — a high-risk region goes untreated.
In [17]:
# ROC Curve (The proof of multivariate power)
y_probs = lda_opt.predict_proba(X_test_cleaned)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) - LDA')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()
[Figure: ROC curve for the optimized LDA, AUC = 0.80]

ROC Curve (AUC = 0.80): The ROC curve confirms the model has good discriminating power well above the random baseline (the dashed diagonal). The step-like shape is expected with small test sets and does not indicate a problem with the model. An AUC of 0.80 means that in 80% of random pairings between a critical and a non-critical country, the model correctly ranks the critical one higher — a solid result for a linear model on 21 test samples.
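That pairwise-ranking reading of AUC can be checked directly: counting, over all (positive, negative) pairs, how often the positive example receives the higher score reproduces roc_auc_score exactly. A small sketch on toy scores (illustrative, not the LDA's actual probabilities):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Toy labels and predicted probabilities (not the notebook's output)
y_true = np.array([0, 0, 0, 1, 1, 0, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.3, 0.5])

# AUC as a ranking probability: fraction of (positive, negative) pairs
# where the positive example scores higher (ties count half)
pos = y_score[y_true == 1]
neg = y_score[y_true == 0]
pairs = [(p > n) + 0.5 * (p == n) for p in pos for n in neg]
manual_auc = np.mean(pairs)

print(f"Manual pairwise AUC: {manual_auc:.4f}")
print(f"sklearn roc_auc_score: {roc_auc_score(y_true, y_score):.4f}")
```

Both lines print 0.8000 for this toy example, since 12 of the 15 (positive, negative) pairs are ranked correctly.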

Overall the LDA model generalizes well, remains interpretable, and the multivariate combination of 14 features clearly outperforms any single variable. The main limitation is sensitivity to the minority class, a known challenge when working with small, imbalanced datasets.

Decision Tree¶

To complement LDA and compare two different classification approaches, a Decision Tree is applied to the same dataset. Two key differences from the LDA pipeline are intentional:

  • No StandardScaler: Decision Trees split on thresholds, not distances, so feature scale has no effect on the result.
  • No multicollinearity removal: Trees are greedy — if two variables are 90% correlated, the algorithm will simply pick the one that produces the best split and ignore the other. Manual pruning is unnecessary.

The first step is to train an unconstrained "wild" tree (max_depth=None) to establish a baseline and expose overfitting. Accuracy and F1-Score on both train and test sets are reported to quantify the variance problem before moving to hyperparameter tuning.

In [20]:
#  Initialize the "Wild" Tree
dt_wild = DecisionTreeClassifier(random_state=42)

# Fit on the SAME data used for LDA
dt_wild.fit(X_train, y_train)

# Check for Overfitting
train_acc = dt_wild.score(X_train, y_train)
test_acc = dt_wild.score(X_test, y_test)
train_f1_score = f1_score(y_train, dt_wild.predict(X_train))
test_f1_score = f1_score(y_test, dt_wild.predict(X_test))

print(f"Wild Tree Training Accuracy: {train_acc:.4f}")
print(f"Wild Tree Test Accuracy: {test_acc:.4f}")
print(f"Wild Tree Training F1-Score: {train_f1_score:.4f}")
print(f"Wild Tree Test F1-Score: {test_f1_score:.4f}")
Wild Tree Training Accuracy: 1.0000
Wild Tree Test Accuracy: 0.7143
Wild Tree Training F1-Score: 1.0000
Wild Tree Test F1-Score: 0.4000

The baseline Decision Tree exhibits classic High Variance — a perfect training accuracy of $1.0$ collapses to $0.71$ on the test set, and the F1-Score drops from $1.0$ to $0.40$. The model memorized the training data, creating overly specific leaf nodes that capture noise rather than signal. The low F1 on the test set tells us it struggles particularly with the minority class (High Risk), defaulting too often to the safe majority prediction.

To address this, the next steps are Hyperparameter Tuning and pruning to constrain the tree's complexity and improve generalization.

Before tuning, a feature importance analysis is run on the wild tree using feature_importances_ (Gini-based). This gives us a first look at which variables the tree relied on most, and lets us compare against the LDA scalings.

In [24]:
# Extract importances
importances = pd.Series(dt_wild.feature_importances_, index=X_train.columns)
importances = importances.sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=importances.index, hue=importances.index, dodge=False, palette='viridis', legend=False)
plt.title('Decision Tree Feature Importance (Gini)')
plt.show()
[Figure: Decision Tree Feature Importance (Gini) bar chart]

Just as the LDA scalings showed, Infant Mortality dominates the Gini importance chart by a large margin — confirming it is a universal predictor for deforestation risk regardless of algorithm. Features at the bottom (e.g., Primary Education Enrollment) contribute zero importance, meaning the tree never used them for any split. Both models independently agree on the same primary driver, which strongly validates the signal.

To improve the model, hyperparameter tuning is performed to constrain tree complexity and prevent overfitting. The parameter grid to search over is:

Parameter Values
max_depth 3, 5, 10
min_samples_split 2, 5, 10
min_samples_leaf 1, 2, 5

These ranges are informed by Bhuva (2025) for datasets of this size. GridSearchCV() automates the search by evaluating all combinations with 5-fold cross-validation — the model is trained and scored 5 times on different data folds, giving a more reliable estimate of generalization performance than a single train-test split.

In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'criterion': ['gini'], 
}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
# print the accuracy 
print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.4f}")
Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best Cross-Validated Accuracy: 0.7316

The grid search returned max_depth=3, min_samples_split=10, and min_samples_leaf=2. Each parameter has a clear interpretation:

  • max_depth=3: Simpler, shallower trees generalize better. Limiting depth to 3 forces the model to rely on broad, robust patterns rather than memorizing specific cases.
  • min_samples_split=10: The tree cannot create a new branch unless at least 10 countries support it, preventing custom rules built around outliers.
  • min_samples_leaf=2: No final prediction can rest on a single data point, ensuring every leaf node represents a small but stable group.

One important distinction from the LDA pipeline: the Decision Tree requires no manual feature elimination. Unlike LDA, which needs pruning to maintain matrix stability, the tree performs implicit feature selection — the Gini importance analysis showed that variables like Unemployment Rate and Latitude were naturally assigned zero weight. The model's greedy splitting criteria neutralized the noise without any manual intervention.

In [28]:
# Initialize with the best parameters
dt_final = DecisionTreeClassifier(
    criterion='gini', 
    max_depth=3, 
    min_samples_leaf=2, 
    min_samples_split=10, 
    random_state=42
)

# Fit and Predict
dt_final.fit(X_train, y_train)
y_pred_final = dt_final.predict(X_test)

# Evaluate train vs. test to quantify the remaining generalization gap
print(f"Final Train F1: {f1_score(y_train, dt_final.predict(X_train)):.4f}")
print(f"Final Test F1: {f1_score(y_test, y_pred_final):.4f}")
print(f"Final Train Accuracy: {dt_final.score(X_train, y_train):.4f}")
print(f"Final Test Accuracy: {dt_final.score(X_test, y_test):.4f}")

# check the classification report
print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred_final))
Final Train F1: 0.7727
Final Test F1: 0.2500
Final Train Accuracy: 0.8780
Final Test Accuracy: 0.7143

Final Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.93      0.82        15
           1       0.50      0.17      0.25         6

    accuracy                           0.71        21
   macro avg       0.62      0.55      0.54        21
weighted avg       0.67      0.71      0.66        21

Decision Tree: Classification Report & Tree Analysis¶

Classification Report¶

After hyperparameter tuning (max_depth=3, min_samples_split=10, min_samples_leaf=2), the most critical number in the pruned tree's report is the High Risk Recall of 0.17: out of 6 truly critical countries in the test set, the model caught only 1. That means 5 high-risk deforestation zones went undetected, a result that would be unacceptable in a real-world environmental application. The Train F1 of 0.7727 versus Test F1 of 0.2500 also reveals that, despite pruning, the model still fails to generalize to the minority class.

Why did pruning hurt recall so much? Forcing max_depth=3 simplifies the tree to just 3 layers of splits. For the majority class (Low Risk), 3 layers are more than enough — it can rely on broad patterns. But the minority class (High Risk) often lives in smaller, more specific corners of the feature space that get cut off when depth is restricted. The result is a conservative model that defaults to "Low Risk".
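One mitigation worth noting, though not tried in this notebook, is class weighting: DecisionTreeClassifier(class_weight='balanced') makes minority-class errors cost more during impurity calculations, which can recover recall without deepening the tree. A sketch on synthetic imbalanced data (make_classification stands in for the deforestation features; the numbers are illustrative):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score

# Synthetic ~70/30 imbalanced problem, loosely mimicking the notebook's setup
X_syn, y_syn = make_classification(
    n_samples=300, n_features=10, weights=[0.7, 0.3], random_state=42
)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_syn, y_syn, test_size=0.2, random_state=42, stratify=y_syn
)

recalls = {}
for weight in (None, 'balanced'):
    # Same shallow depth as the tuned tree; only the class weighting changes
    tree = DecisionTreeClassifier(max_depth=3, class_weight=weight, random_state=42)
    tree.fit(X_tr, y_tr)
    recalls[weight] = recall_score(y_te, tree.predict(X_te))
    print(f"class_weight={weight}: minority recall = {recalls[weight]:.2f}")
```

Whether the weighting actually helps depends on the data, so it would need to be validated with the same cross-validation setup used for the grid search.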

In [29]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
plot_tree(dt_final, 
          feature_names=X_train.columns, 
          class_names=['Low Risk', 'High Risk'], 
          filled=True, rounded=True, precision=2)
plt.title("Optimized Decision Tree: Deforestation Risk Logic")
plt.show()
[Figure: Optimized Decision Tree, depth-3 deforestation risk logic]

Visual Decision Tree Analysis

Root Split: Infant Mortality is the single most powerful split, cleanly dividing the 82 training samples into two very different populations:

  • True: low-mortality countries (predominantly developed nations) lean heavily Low Risk.
  • False: high-mortality countries lean strongly High Risk.

Depth 2

  • On the low-mortality branch, Longitude appears and seems to separate Western/European countries (mostly safe, low Gini) from more eastern ones, where a secondary split is needed.
  • On the high-mortality branch, Longitude seems to be isolating Sub-Saharan African and South/Southeast Asian regions, which concentrate the highest deforestation risk.

Depth 3

  • Density: Very sparse, low-mortality countries still carry elevated risk. This may reflect vast, sparsely governed forested areas that are harder to monitor and protect.
  • Total tax rate: This near-pure node [32, 0] captures a very clean Low Risk cluster, telling us that countries with low mortality, eastern longitude, and moderate-to-high tax rates are essentially never critical for deforestation.
  • Armed Forces size: Used on the high-risk branch; larger state capacity slightly modulates risk classification within an already high-risk population.

Final Analysis¶

Comparison LDA vs Decision Tree... Which is better?¶

As always, this question is contextual. For this specific project, it comes down to predictive power versus interpretability, and the two models sit at opposite ends of that trade-off. The LDA has the stronger metrics: good accuracy and precision, and we could even tune it further to allow more false positives (FP) than false negatives (FN) (already the case, but we could make it more dramatic). The Decision Tree, by contrast, has much weaker predictive power, yet it delivers a great deal of behavioral understanding, such as making the impact of infant mortality immediately visible.

So, what is the final verdict? It depends. Right now, if we are looking for predictive power, the LDA works as a stronger tool, as it successfully created a clear "Risk Axis" (LD1), making it highly reliable for flagging countries as long as the data is clean. However, if we were NGO officials or stakeholders trying to understand the behavior to create specific tools for further action and alarm systems (other real-world tools, not models), it is extremely useful to look at a depth-3 tree and instantly understand the "Tipping Points" (e.g., “If Infant Mortality drops below X, and Density is below Y, the risk goes away...”).

Key Takeaways (Beyond Infant Mortality)¶

We already know Infant Mortality is the #1 predictor for risk (representing basic social development). But other key takeaways were uncovered, such as:

  1. The LDA proved that variables overlapping in univariate KDE plots became highly separable when combined. Risk isn't just about poverty; it’s the dangerous combination of low development, high population density, and specific geographic locations.
  2. Variables like Latitude and Longitude consistently survived feature selection. This proves that global deforestation risk isn't evenly distributed—it is highly localized to specific tropical and subtropical coordinates (affected by climates, the governments in the areas, etc.).
  3. While the LDA saw variables like GDP or Minimum Wage as linear drivers, the Decision Tree treated them as "Thresholds." This suggests that economic growth doesn't linearly decrease risk; rather, a country must cross a specific economic "hurdle" before environmental preservation becomes a priority. It acts as an aider, but not as a linear predictor feature.

Further improvements¶

In the case of this analysis, we can do several things to improve the LDA. For instance, we can be less decisive and use 3 classes instead of 2; this can capture more of the differences and put the outliers into a kind of class of their own. By introducing a "Medium Risk" or "Transitional" class, the LDA could better isolate the extreme outliers and provide a smoother risk gradient.

Currently, the LDA uses a default 50% probability threshold to assign risk. In the future, we could shift this decision boundary (e.g., to 30%) to intentionally increase false positives (false alarms) in exchange for minimizing false negatives (missed deforestation). Finally, since a single Decision Tree struggles with predictive power compared to the LDA, the natural next step is to upgrade to a Random Forest or Gradient Boosting (XGBoost) model. This combines hundreds of shallow trees, keeping the non-linear logic but achieving massive boosts in accuracy and F1-score.
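The threshold shift described above can be sketched as follows, on synthetic data (the dataset and the 0.30 cutoff are illustrative). Since lowering the cutoff can only add positive predictions, recall at 0.30 is never lower than at 0.50:

```python
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

# Synthetic imbalanced stand-in for the deforestation data
X_syn, y_syn = make_classification(
    n_samples=300, n_features=8, weights=[0.7, 0.3], random_state=0
)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_syn, y_syn, test_size=0.2, random_state=0, stratify=y_syn
)

lda_demo = LinearDiscriminantAnalysis().fit(X_tr, y_tr)
probs = lda_demo.predict_proba(X_te)[:, 1]

# Default 0.50 cutoff vs. a recall-oriented 0.30 cutoff
recalls = {}
for cutoff in (0.50, 0.30):
    preds = (probs >= cutoff).astype(int)
    recalls[cutoff] = recall_score(y_te, preds)
    print(f"cutoff={cutoff:.2f}: recall = {recalls[cutoff]:.2f}")
```

The exact cutoff would be chosen by inspecting the ROC curve's thresholds array and deciding how many extra false alarms the use case can tolerate.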

Lastly, a possible extension that requires more pre-work: we could obtain time-series data (over the years), or recover the countries dropped at the beginning for having nulls or too many imbalances, even if that means filling their variables through research, so that we have more data to train and work on.

References:

  1. Bhuva, L. (2025). Understanding Decision Trees and Hyperparameter Tuning in Machine Learning. Medium. https://medium.com/@lomashbhuva/understanding-decision-trees-and-hyperparameter-tuning-in-machine-learning-c0a4467a1e69