Deforestation and Its Hidden Drivers¶
Problem Statement¶
Every year, millions of hectares of forest disappear from the planet, and most of the time, we don't even notice. Deforestation is one of those slow-moving crises that rarely makes headlines the way a hurricane or an earthquake does, yet its consequences are massive:
- loss of biodiversity
- acceleration of climate change
- soil degradation
- the displacement of communities that depend on forests for their survival
What makes it particularly hard to address is that we don't have a clear, agreed-upon picture of what actually drives it. Is it population growth? Poverty? Urbanization? Economic development? The intuitive answers are not always right. Some countries are highly urbanized and still preserve their forests. Others have strong economies and still cut them down. There are patterns hidden in the data that we wouldn't guess just from common sense.
Objective¶
This project uses regression analysis to dig into those patterns. Specifically, we aim to:
- Understand which country-level characteristics are actually associated with deforestation, including some non-obvious ones (healthcare access, gasoline prices, geographic location).
- Build a model that can estimate annual deforestation for a country given its socio-economic profile, even for countries or scenarios we haven't seen before.
- Compare a linear and a non-linear model to see which better captures the complexity of the phenomenon.
- Draw inferences: identify which variables have statistically significant associations with deforestation, and the direction those associations go.
Data Sources¶
Two independent public datasets are merged for this analysis:
| Dataset | Source | Description |
|---|---|---|
| World Data 2023 | World Bank indicators (compiled and hosted on Kaggle) | ~195 countries, 35 socio-economic indicators (population, GDP, health, education, etc.) |
| Annual Deforestation | Our World in Data | Annual forest loss in hectares by country and year |
After merging on country name and filtering to the most recent available year, we work with 106 countries and a rich set of predictors.
The target variable is annual deforestation in hectares — the number of hectares of forest lost in a given year. Because deforestation is a continuous quantitative variable, regression is the natural modeling choice: we are not asking whether a country deforests (classification) but how much and why (regression). This lets us quantify the strength of each relationship and generate predictions in interpretable units.
While the analysis is global, it is directly relevant to the Mexican context. Mexico holds some of the most biodiverse forests in the world — Oaxaca, Chiapas, and the Yucatán Peninsula — yet consistently ranks among countries with significant annual forest loss. Ideally, this analysis would be conducted at the Mexican state level, but that granularity of socio-economic and deforestation data is not yet publicly available in a consolidated form. Global country-level data serves as the necessary foundation: it allows us to identify which variables matter, build validated models, and establish a methodology that can be directly applied to Mexican state-level data as it becomes available. The global patterns found here are not abstract — they are the same structural forces at play in Mexico.
Approach¶
The analysis follows a structured pipeline:
- Exploratory Data Analysis (EDA) — understand distributions, spot outliers, check skewness
- Data Preparation — handle skewed variables, drop leakage features, address collinearity
- Feature Selection — use LASSO regularization to identify a meaningful subset of predictors
- Model Building — fit a linear model (LASSO) and a non-linear model (Random Forest)
- Justification: pairing LASSO with a Random Forest compares a high-interpretability linear baseline against a flexible non-linear approach capable of capturing complex socio-economic interactions. This dual-modeling strategy lets us test whether deforestation drivers follow a simple proportional trend or something more complex. Because both models are evaluated on the same metrics, we can also contrast their predictive and inferential power (the Random Forest sometimes predicts better than LASSO), which illuminates the trade-off between interpretable and less interpretable models. Both are regression models: they predict continuous values rather than classes.
- Evaluation — compare models using R², RMSE, and MAE on a held-out validation set
- Inference — use OLS to extract statistically significant associations and confidence intervals
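As a preview of the evaluation step, here is a minimal sketch of how the three metrics are computed with scikit-learn. The `y_true`/`y_pred` arrays below are synthetic placeholders standing in for the real held-out validation split:

```python
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# placeholder values -- the real pipeline uses the held-out validation set
y_true = np.array([8670, 510440, 195300, 5130, 0])     # actual hectares lost
y_pred = np.array([10000, 480000, 210000, 4000, 1500])  # model predictions

r2 = r2_score(y_true, y_pred)                  # share of variance explained
rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # penalizes large errors
mae = mean_absolute_error(y_true, y_pred)      # average error in hectares

print(f"R2={r2:.3f}, RMSE={rmse:,.0f} ha, MAE={mae:,.0f} ha")
```

Reporting RMSE and MAE side by side is useful here: a large gap between them signals that a few countries with huge errors dominate the loss, which matters for a target as skewed as deforestation.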
We start by loading both raw datasets into memory. The World Data 2023 file provides the socio-economic predictors for each country, while the Annual Deforestation file provides our target variable: hectares of forest lost per year. Since these come from two independent sources with different country naming conventions, they will need to be merged carefully before any analysis can begin.
import pandas as pd
# load the datasets
country_path = "data/world-data/world-data-2023.csv"
deforestation_path = "data/deforestation/annual-deforestation.csv"
country_data = pd.read_csv(country_path)
deforestation_data = pd.read_csv(deforestation_path)
# inspect how the countries are presented to see if we can merge
print("First 5 rows of country dataset")
country_data.head()
First 5 rows of country dataset
| Country | Density\n(P/Km2) | Abbreviation | Agricultural Land( %) | Land Area(Km2) | Armed Forces size | Birth Rate | Calling Code | Capital/Major City | Co2-Emissions | CPI | CPI Change (%) | Currency-Code | Fertility Rate | Forested Area (%) | Gasoline Price | GDP | Gross primary education enrollment (%) | Gross tertiary education enrollment (%) | Infant mortality | Largest city | Life expectancy | Maternal mortality ratio | Minimum wage | Official language | Out of pocket health expenditure | Physicians per thousand | Population | Population: Labor force participation (%) | Tax revenue (%) | Total tax rate | Unemployment rate | Urban_population | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 60 | AF | 58.10% | 652,230 | 323,000 | 32.49 | 93.0 | Kabul | 8,672 | 149.9 | 2.30% | AFN | 4.47 | 2.10% | $0.70 | $19,101,353,833 | 104.00% | 9.70% | 47.9 | Kabul | 64.5 | 638.0 | $0.43 | Pashto | 78.40% | 0.28 | 38,041,754 | 48.90% | 9.30% | 71.40% | 11.12% | 9,797,273 | 33.939110 | 67.709953 |
| 1 | Albania | 105 | AL | 43.10% | 28,748 | 9,000 | 11.78 | 355.0 | Tirana | 4,536 | 119.05 | 1.40% | ALL | 1.62 | 28.10% | $1.36 | $15,278,077,447 | 107.00% | 55.00% | 7.8 | Tirana | 78.5 | 15.0 | $1.12 | Albanian | 56.90% | 1.20 | 2,854,191 | 55.70% | 18.60% | 36.60% | 12.33% | 1,747,593 | 41.153332 | 20.168331 |
| 2 | Algeria | 18 | DZ | 17.40% | 2,381,741 | 317,000 | 24.28 | 213.0 | Algiers | 150,006 | 151.36 | 2.00% | DZD | 3.02 | 0.80% | $0.28 | $169,988,236,398 | 109.90% | 51.40% | 20.1 | Algiers | 76.7 | 112.0 | $0.95 | Arabic | 28.10% | 1.72 | 43,053,054 | 41.20% | 37.20% | 66.10% | 11.70% | 31,510,100 | 28.033886 | 1.659626 |
| 3 | Andorra | 164 | AD | 40.00% | 468 | NaN | 7.20 | 376.0 | Andorra la Vella | 469 | NaN | NaN | EUR | 1.27 | 34.00% | $1.51 | $3,154,057,987 | 106.40% | NaN | 2.7 | Andorra la Vella | NaN | NaN | $6.63 | Catalan | 36.40% | 3.33 | 77,142 | NaN | NaN | NaN | NaN | 67,873 | 42.506285 | 1.521801 |
| 4 | Angola | 26 | AO | 47.50% | 1,246,700 | 117,000 | 40.73 | 244.0 | Luanda | 34,693 | 261.73 | 17.10% | AOA | 5.52 | 46.30% | $0.97 | $94,635,415,870 | 113.50% | 9.30% | 51.6 | Luanda | 60.8 | 241.0 | $0.71 | Portuguese | 33.40% | 0.21 | 31,825,295 | 77.50% | 9.20% | 49.10% | 6.89% | 21,061,025 | -11.202692 | 17.873887 |
print("First 5 rows of deforestation data set")
deforestation_data.head()
First 5 rows of deforestation data set
| Entity | Code | Year | Deforestation | |
|---|---|---|---|---|
| 0 | Algeria | DZA | 2020 | 8670 |
| 1 | Angola | AGO | 2020 | 510440 |
| 2 | Argentina | ARG | 2020 | 195300 |
| 3 | Austria | AUT | 2020 | 5130 |
| 4 | Bahrain | BHR | 2020 | 0 |
Data Dictionaries¶
Dataset 1 — World Data 2023 (Kaggle)¶
One row per country (~195 countries). Contains socio-economic, demographic, and geographic indicators.
| Variable | Type | Description |
|---|---|---|
| Country | Text | Country name (used as merge key) |
| Density (P/Km²) | Numeric | Population density in people per km² |
| Agricultural Land (%) | Numeric | Share of land used for agriculture |
| Land Area (Km²) | Numeric | Total land area — dropped |
| Armed Forces size | Numeric | Number of active military personnel |
| Birth Rate | Numeric | Births per 1,000 people per year |
| Co2-Emissions | Numeric | Total CO₂ emissions (kt) |
| CPI | Numeric | Consumer Price Index — measures price level |
| CPI Change (%) | Numeric | Annual inflation rate |
| Fertility Rate | Numeric | Average children per woman |
| Forested Area (%) | Numeric | Current share of land that is forested — dropped |
| Gasoline Price | Numeric | Average retail gasoline price (USD/liter) |
| GDP | Numeric | Gross Domestic Product (USD) |
| Gross primary education enrollment (%) | Numeric | % of primary-school-age children enrolled |
| Gross tertiary education enrollment (%) | Numeric | % of tertiary-age population enrolled in university |
| Infant mortality | Numeric | Deaths per 1,000 live births before age 1 |
| Life expectancy | Numeric | Average years a newborn is expected to live |
| Maternal mortality ratio | Numeric | Deaths per 100,000 live births due to pregnancy/childbirth |
| Minimum wage | Numeric | National minimum wage (USD/hour) |
| Out of pocket health expenditure | Numeric | Share of health costs paid directly by citizens (%) |
| Physicians per thousand | Numeric | Number of doctors per 1,000 people |
| Population | Numeric | Total population |
| Population: Labor force participation (%) | Numeric | Share of working-age population employed or seeking work |
| Tax revenue (%) | Numeric | Government tax revenue as % of GDP |
| Total tax rate | Numeric | Total tax burden on businesses (% of commercial profits) |
| Unemployment rate | Numeric | % of labor force without a job |
| Urban_population | Numeric | Number of people living in urban areas |
| Latitude | Numeric | Geographic latitude of country centroid |
| Longitude | Numeric | Geographic longitude of country centroid |
| Administrative columns | Text | Abbreviation, Currency-Code, Calling Code, Capital/Major City, Largest city, Official language — all dropped |
Dataset 2 — Annual Deforestation (Our World in Data)¶
One row per country per year. Tracks annual forest loss globally.
| Variable | Type | Description |
|---|---|---|
| Entity | Text | Country name (used as merge key) |
| Code | Text | ISO country code — dropped |
| Year | Numeric | Year of observation — dropped |
| Deforestation | Numeric | Target variable — hectares of forest lost that year |
Before merging, we need to verify how many countries each dataset contains and how well the country names align. A mismatch in naming conventions (e.g. 'United States' vs 'United States of America') would silently drop valid countries during the join. Checking the overlap explicitly lets us know exactly how many countries will make it into the final merged dataset and flag any naming issues worth investigating.
# let's compare the shape of both datasets
# (rows, columns)
print(f"Country info, country rows: {country_data.shape[0]}")
print(f"Deforestation info, country rows: {deforestation_data.shape[0]}")
Country info, country rows: 195 Deforestation info, country rows: 113
# find the countries that don't match, and drop them... we don't have deforestation data for those
matches = country_data['Country'].isin(deforestation_data['Entity'])
print(country_data[~matches])
Country Density\n(P/Km2) Abbreviation Agricultural Land( %) \
0 Afghanistan 60 AF 58.10%
1 Albania 105 AL 43.10%
3 Andorra 164 AD 40.00%
5 Antigua and Barbuda 223 AG 20.50%
7 Armenia 104 AM 58.90%
.. ... ... ... ...
183 Ukraine 75 UA 71.70%
184 United Arab Emirates 118 AE 5.50%
186 United States 36 US 44.40%
189 Vanuatu 25 VU 15.30%
193 Zambia 25 ZM 32.10%
Land Area(Km2) Armed Forces size Birth Rate Calling Code \
0 652,230 323,000 32.49 93.0
1 28,748 9,000 11.78 355.0
3 468 NaN 7.20 376.0
5 443 0 15.33 1.0
7 29,743 49,000 13.99 374.0
.. ... ... ... ...
183 603,550 297,000 8.70 380.0
184 83,600 63,000 10.33 971.0
186 9,833,517 1,359,000 11.60 1.0
189 12,189 NaN 29.60 678.0
193 752,618 16,000 36.19 260.0
Capital/Major City Co2-Emissions CPI CPI Change (%) \
0 Kabul 8,672 149.9 2.30%
1 Tirana 4,536 119.05 1.40%
3 Andorra la Vella 469 NaN NaN
5 St. John's, Saint John 557 113.81 1.20%
7 Yerevan 5,156 129.18 1.40%
.. ... ... ... ...
183 Kyiv 202,250 281.66 7.90%
184 Abu Dhabi 206,324 114.52 -1.90%
186 Washington, D.C. 5,006,302 117.24 7.50%
189 Port Vila 147 117.13 2.80%
193 Lusaka 5,141 212.31 9.20%
Currency-Code Fertility Rate Forested Area (%) Gasoline Price \
0 AFN 4.47 2.10% $0.70
1 ALL 1.62 28.10% $1.36
3 EUR 1.27 34.00% $1.51
5 XCD 1.99 22.30% $0.99
7 AMD 1.76 11.70% $0.77
.. ... ... ... ...
183 UAH 1.30 16.70% $0.83
184 AED 1.41 4.60% $0.49
186 USD 1.73 33.90% $0.71
189 VUV 3.78 36.10% $1.31
193 ZMW 4.63 65.20% $1.40
GDP Gross primary education enrollment (%) \
0 $19,101,353,833 104.00%
1 $15,278,077,447 107.00%
3 $3,154,057,987 106.40%
5 $1,727,759,259 105.00%
7 $13,672,802,158 92.70%
.. ... ...
183 $153,781,069,118 99.00%
184 $421,142,267,938 108.40%
186 $21,427,700,000,000 101.80%
189 $917,058,851 109.30%
193 $23,064,722,446 98.70%
Gross tertiary education enrollment (%) Infant mortality \
0 9.70% 47.9
1 55.00% 7.8
3 NaN 2.7
5 24.80% 5.0
7 54.60% 11.0
.. ... ...
183 82.70% 7.5
184 36.80% 6.5
186 88.20% 5.6
189 4.70% 22.3
193 4.10% 40.4
Largest city Life expectancy Maternal mortality ratio \
0 Kabul 64.5 638.0
1 Tirana 78.5 15.0
3 Andorra la Vella NaN NaN
5 St. John's, Saint John 76.9 42.0
7 Yerevan 74.9 26.0
.. ... ... ...
183 Kyiv 71.6 19.0
184 Dubai 77.8 3.0
186 New York City 78.5 19.0
189 Port Vila 70.3 72.0
193 Lusaka 63.5 213.0
Minimum wage Official language Out of pocket health expenditure \
0 $0.43 Pashto 78.40%
1 $1.12 Albanian 56.90%
3 $6.63 Catalan 36.40%
5 $3.04 English 24.30%
7 $0.66 Armenian 81.60%
.. ... ... ...
183 $0.84 Ukrainian 47.80%
184 NaN Arabic 17.80%
186 $7.25 NaN 11.10%
189 $1.56 French 8.90%
193 $0.24 English 27.50%
Physicians per thousand Population \
0 0.28 38,041,754
1 1.20 2,854,191
3 3.33 77,142
5 2.76 97,118
7 4.40 2,957,731
.. ... ...
183 2.99 44,385,155
184 2.53 9,770,529
186 2.61 328,239,523
189 0.17 299,882
193 1.19 17,861,030
Population: Labor force participation (%) Tax revenue (%) Total tax rate \
0 48.90% 9.30% 71.40%
1 55.70% 18.60% 36.60%
3 NaN NaN NaN
5 NaN 16.50% 43.00%
7 55.60% 20.90% 22.60%
.. ... ... ...
183 54.20% 20.10% 45.20%
184 82.10% 0.10% 15.90%
186 62.00% 9.60% 36.60%
189 69.90% 17.80% 8.50%
193 74.60% 16.20% 15.60%
Unemployment rate Urban_population Latitude Longitude
0 11.12% 9,797,273 33.939110 67.709953
1 12.33% 1,747,593 41.153332 20.168331
3 NaN 67,873 42.506285 1.521801
5 NaN 23,800 17.060816 -61.796428
7 16.99% 1,869,848 40.069099 45.038189
.. ... ... ... ...
183 8.88% 30,835,699 48.379433 31.165580
184 2.35% 8,479,744 23.424076 53.847818
186 14.70% 270,663,028 37.090240 -95.712891
189 4.39% 76,152 -15.376706 166.959158
193 11.43% 7,871,713 -13.133897 27.849332
[89 rows x 35 columns]
countries_in_deforestation = set(deforestation_data['Entity'].unique())
countries_in_stats = set(country_data['Country'].unique())
overlap = countries_in_deforestation & countries_in_stats
print(f"Countries in common: {overlap}")
Countries in common: {'Portugal', 'Denmark', 'Uzbekistan', 'Colombia', 'Morocco', 'Namibia', 'Djibouti', 'Algeria', 'Indonesia', 'Suriname', 'Central African Republic', 'Dominican Republic', 'New Zealand', 'Nicaragua', 'Senegal', 'Austria', 'Costa Rica', 'Tunisia', 'Togo', 'Eswatini', 'Somalia', 'Saint Vincent and the Grenadines', 'Bhutan', 'Comoros', 'Gabon', 'Chad', 'Norway', 'Germany', 'Burkina Faso', 'Monaco', 'Brazil', 'Qatar', 'Georgia', 'Estonia', 'Jordan', 'Netherlands', 'Serbia', 'Uganda', 'Honduras', 'Malta', 'Kuwait', 'San Marino', 'Chile', 'Iceland', 'Vietnam', 'Hungary', 'South Korea', 'Panama', 'United Kingdom', 'Cameroon', 'Niger', 'Singapore', 'Syria', 'Latvia', 'Turkey', 'Sweden', 'Benin', 'Iraq', 'Argentina', 'Ethiopia', 'Jamaica', 'Libya', 'Malawi', 'Oman', 'Mozambique', 'Lithuania', 'Italy', 'Angola', 'Poland', 'Saudi Arabia', 'Tanzania', 'India', 'El Salvador', 'Equatorial Guinea', 'Spain', 'Yemen', 'Canada', 'China', 'Guinea', 'Lebanon', 'Cuba', 'Nauru', 'Bahrain', 'Bulgaria', 'Uruguay', 'Dominica', 'Guyana', 'Romania', 'Papua New Guinea', 'Venezuela', 'Mexico', 'Croatia', 'Slovenia', 'Iran', 'Sudan', 'Guatemala', 'Kazakhstan', 'Liechtenstein', 'Madagascar', 'Russia', 'Zimbabwe', 'Mauritius', 'Paraguay', 'Switzerland', 'Finland', 'Mali'}
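When a mismatch is caused purely by naming conventions rather than genuinely missing data, a small manual rename map applied before the join can rescue those rows. A minimal sketch of the idea (the map entries below are hypothetical illustrations, not verified mismatches from these datasets):

```python
import pandas as pd

# hypothetical rename map: one dataset's spelling -> the other's
name_fixes = {
    "United States": "United States of America",
    "Czech Republic": "Czechia",
}

# toy frame standing in for country_data['Country']
countries = pd.DataFrame({"Country": ["United States", "Mexico"]})
countries["Country"] = countries["Country"].replace(name_fixes)
print(countries["Country"].tolist())
```

Applying such a map before the inner join converts would-be dropped rows into matches; any country still unmatched afterwards is genuinely absent from one source.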
With confirmed overlap of country names between both datasets, we can now merge them. An inner join on country name keeps only the countries that appear in both datasets, meaning we keep only countries for which we have both socio-economic indicators and a deforestation record. Countries that exist in one dataset but not the other are dropped. This is the right trade-off: a smaller but complete dataset is far better than a large one full of gaps in the target variable.
merged_df = country_data.merge(deforestation_data,
left_on='Country',
right_on='Entity',
how='inner')
merged_df.shape
merged_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 106 entries, 0 to 105 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 106 non-null object 1 Density (P/Km2) 106 non-null object 2 Abbreviation 104 non-null object 3 Agricultural Land( %) 103 non-null object 4 Land Area(Km2) 106 non-null object 5 Armed Forces size 98 non-null object 6 Birth Rate 104 non-null float64 7 Calling Code 106 non-null float64 8 Capital/Major City 104 non-null object 9 Co2-Emissions 102 non-null object 10 CPI 99 non-null object 11 CPI Change (%) 99 non-null object 12 Currency-Code 98 non-null object 13 Fertility Rate 103 non-null float64 14 Forested Area (%) 103 non-null object 15 Gasoline Price 99 non-null object 16 GDP 106 non-null object 17 Gross primary education enrollment (%) 103 non-null object 18 Gross tertiary education enrollment (%) 103 non-null object 19 Infant mortality 103 non-null float64 20 Largest city 103 non-null object 21 Life expectancy 103 non-null float64 22 Maternal mortality ratio 100 non-null float64 23 Minimum wage 83 non-null object 24 Official language 105 non-null object 25 Out of pocket health expenditure 102 non-null object 26 Physicians per thousand 103 non-null float64 27 Population 106 non-null object 28 Population: Labor force participation (%) 100 non-null object 29 Tax revenue (%) 94 non-null object 30 Total tax rate 101 non-null object 31 Unemployment rate 100 non-null object 32 Urban_population 104 non-null object 33 Latitude 106 non-null float64 34 Longitude 106 non-null float64 35 Entity 106 non-null object 36 Code 106 non-null object 37 Year 106 non-null int64 38 Deforestation 106 non-null int64 dtypes: float64(9), int64(2), object(28) memory usage: 32.4+ KB
# how many nulls? do we need to fill them in?
merged_df.isnull().sum()
Country 0 Density\n(P/Km2) 0 Abbreviation 2 Agricultural Land( %) 3 Land Area(Km2) 0 Armed Forces size 8 Birth Rate 2 Calling Code 0 Capital/Major City 2 Co2-Emissions 4 CPI 7 CPI Change (%) 7 Currency-Code 8 Fertility Rate 3 Forested Area (%) 3 Gasoline Price 7 GDP 0 Gross primary education enrollment (%) 3 Gross tertiary education enrollment (%) 3 Infant mortality 3 Largest city 3 Life expectancy 3 Maternal mortality ratio 6 Minimum wage 23 Official language 1 Out of pocket health expenditure 4 Physicians per thousand 3 Population 0 Population: Labor force participation (%) 6 Tax revenue (%) 12 Total tax rate 5 Unemployment rate 6 Urban_population 2 Latitude 0 Longitude 0 Entity 0 Code 0 Year 0 Deforestation 0 dtype: int64
The null check reveals missing values scattered across several columns. This is expected with real-world country data — not every country reports every indicator every year. Before deciding how to handle them, we first need to drop the administrative columns that add no analytical value (language, currency, calling code, etc.), then reassess the null landscape on the cleaned dataset. That way we're not wasting effort imputing columns we were going to drop anyway.
unnecessary_cols = [
'Abbreviation',
'Official language',
'Currency-Code',
'Entity',
'Code',
'Capital/Major City',
'Calling Code',
'Largest city',
'Year',
'Country'
]
simplified_df = merged_df.drop(columns=unnecessary_cols)
# how many nulls? do we need to fill them in?
simplified_df.isnull().sum()
Density\n(P/Km2) 0 Agricultural Land( %) 3 Land Area(Km2) 0 Armed Forces size 8 Birth Rate 2 Co2-Emissions 4 CPI 7 CPI Change (%) 7 Fertility Rate 3 Forested Area (%) 3 Gasoline Price 7 GDP 0 Gross primary education enrollment (%) 3 Gross tertiary education enrollment (%) 3 Infant mortality 3 Life expectancy 3 Maternal mortality ratio 6 Minimum wage 23 Out of pocket health expenditure 4 Physicians per thousand 3 Population 0 Population: Labor force participation (%) 6 Tax revenue (%) 12 Total tax rate 5 Unemployment rate 6 Urban_population 2 Latitude 0 Longitude 0 Deforestation 0 dtype: int64
Before diving into imputation, it is useful to check the full structure of the dataset — column names, non-null counts, and inferred data types — using .info(). This confirms which columns pandas read as numeric vs. object (text), which directly determines how we handle each column in the next steps.
simplified_df.info()
Categorical Values¶
In this stage, we need to examine the columns that Pandas classified as object type. This is a vital step because, in a clean dataset, "object" usually means text or categories (like names of countries). However, by using the .select_dtypes(include=['object']) function, we can isolate these specific columns to see if they truly belong there or if they require a transformation. By storing these names in the categorical_cols variable and printing them, we gain a clear roadmap of which variables aren't currently "mathematically active" in our dataframe. To get a better look at what is actually inside them, we visualize the .head() of these specific columns.
This revealed that many of these columns, such as GDP, Population, and Tax Revenue, are actually numerical. They were misclassified as objects because they contain extra characters like dollar signs ($), percent symbols (%), and commas used as thousands separators. This finding is a critical turning point in our EDA. It means we cannot currently calculate correlations, see distributions, or include these variables in a regression model.
Code Summary for the next cell:
- Identification: Isolate columns where dtype == object.
- Validation: Use .head() to spot the symbols blocking the math.
categorical_cols = simplified_df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_cols}")
simplified_df[categorical_cols].head()
Categorical columns: Index(['Density\n(P/Km2)', 'Agricultural Land( %)', 'Land Area(Km2)',
'Armed Forces size', 'Co2-Emissions', 'CPI', 'CPI Change (%)',
'Forested Area (%)', 'Gasoline Price', 'GDP',
'Gross primary education enrollment (%)',
'Gross tertiary education enrollment (%)', 'Minimum wage',
'Out of pocket health expenditure', 'Population',
'Population: Labor force participation (%)', 'Tax revenue (%)',
'Total tax rate', 'Unemployment rate', 'Urban_population'],
dtype='object')
| Density\n(P/Km2) | Agricultural Land( %) | Land Area(Km2) | Armed Forces size | Co2-Emissions | CPI | CPI Change (%) | Forested Area (%) | Gasoline Price | GDP | Gross primary education enrollment (%) | Gross tertiary education enrollment (%) | Minimum wage | Out of pocket health expenditure | Population | Population: Labor force participation (%) | Tax revenue (%) | Total tax rate | Unemployment rate | Urban_population | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18 | 17.40% | 2,381,741 | 317,000 | 150,006 | 151.36 | 2.00% | 0.80% | $0.28 | $169,988,236,398 | 109.90% | 51.40% | $0.95 | 28.10% | 43,053,054 | 41.20% | 37.20% | 66.10% | 11.70% | 31,510,100 |
| 1 | 26 | 47.50% | 1,246,700 | 117,000 | 34,693 | 261.73 | 17.10% | 46.30% | $0.97 | $94,635,415,870 | 113.50% | 9.30% | $0.71 | 33.40% | 31,825,295 | 77.50% | 9.20% | 49.10% | 6.89% | 21,061,025 |
| 2 | 17 | 54.30% | 2,780,400 | 105,000 | 201,348 | 232.75 | 53.50% | 9.80% | $1.10 | $449,663,446,954 | 109.70% | 90.00% | $3.35 | 17.60% | 44,938,712 | 61.30% | 10.10% | 106.30% | 9.79% | 41,339,571 |
| 3 | 109 | 32.40% | 83,871 | 21,000 | 61,448 | 118.06 | 1.50% | 46.90% | $1.20 | $446,314,739,528 | 103.10% | 85.10% | NaN | 17.90% | 8,877,067 | 60.70% | 25.40% | 51.40% | 4.67% | 5,194,416 |
| 4 | 2,239 | 11.10% | 765 | 19,000 | 31,694 | 117.59 | 2.10% | 0.80% | $0.43 | $38,574,069,149 | 99.40% | 50.50% | NaN | 25.10% | 1,501,635 | 73.40% | 4.20% | 13.80% | 0.71% | 1,467,109 |
Based on this analysis, we can confidently transform all of these object columns into numerical values, since they all hold important information that we need for modeling and inference. The steps taken are:
- Use `astype(str)` to ensure all the columns are treated as strings, mainly a safety check before manipulation.
- Loop over each column and use `str.replace()` with a regex pattern to remove the currency and formatting symbols (`$`, `%`, `,`), replacing them with an empty string. Then convert the cleaned column to numeric using `pd.to_numeric(errors='coerce')`, which safely turns any remaining non-numeric values into `NaN` instead of crashing.
- Finally, run `describe()` again to confirm the transformation worked: `describe()` only summarizes numeric columns, so if the previously object-typed columns now appear in the output, the conversion was successful.
import numpy as np
# treat everything as text
simplified_df[categorical_cols] = simplified_df[categorical_cols].astype(str)
# replace the symbols
for col in categorical_cols:
simplified_df[col] = simplified_df[col].str.replace(r'[$%,]', '', regex=True)
# convert to numeric, errors='coerce' will turn non-convertible values into NaN
simplified_df[col] = pd.to_numeric(simplified_df[col], errors='coerce')
# check the data types again
simplified_df[categorical_cols].describe()
| Density\n(P/Km2) | Agricultural Land( %) | Land Area(Km2) | Armed Forces size | Co2-Emissions | CPI | CPI Change (%) | Forested Area (%) | Gasoline Price | GDP | Gross primary education enrollment (%) | Gross tertiary education enrollment (%) | Minimum wage | Out of pocket health expenditure | Population | Population: Labor force participation (%) | Tax revenue (%) | Total tax rate | Unemployment rate | Urban_population | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 106.000000 | 103.000000 | 1.060000e+02 | 9.800000e+01 | 1.020000e+02 | 99.000000 | 99.000000 | 103.000000 | 99.000000 | 1.060000e+02 | 103.000000 | 103.000000 | 83.000000 | 102.000000 | 1.060000e+02 | 100.00000 | 94.000000 | 101.000000 | 100.000000 | 1.040000e+02 |
| mean | 479.150943 | 39.030097 | 8.851272e+05 | 1.727143e+05 | 2.292163e+05 | 183.278586 | 7.494949 | 29.903883 | 1.025354 | 5.149246e+11 | 101.866990 | 41.160194 | 2.101928 | 32.097059 | 5.103524e+07 | 62.05500 | 15.976596 | 42.265347 | 6.643900 | 2.881799e+07 |
| std | 2672.263495 | 21.525411 | 2.277325e+06 | 4.437516e+05 | 1.019249e+06 | 292.839470 | 26.992650 | 23.294428 | 0.376315 | 2.012593e+12 | 14.934658 | 28.509236 | 2.740145 | 17.371534 | 1.899496e+08 | 10.71333 | 7.174139 | 23.669113 | 4.540792 | 9.651982e+07 |
| min | 3.000000 | 0.600000 | 2.000000e+00 | 0.000000e+00 | 5.100000e+01 | 99.550000 | -4.300000 | 0.000000 | 0.000000 | 1.330000e+08 | 23.400000 | 0.800000 | 0.010000 | 5.800000 | 1.008400e+04 | 38.00000 | 0.000000 | 9.900000 | 0.090000 | 5.464000e+03 |
| 25% | 25.000000 | 22.250000 | 5.247350e+04 | 1.350000e+04 | 5.879000e+03 | 113.820000 | 0.950000 | 9.750000 | 0.785000 | 1.513707e+10 | 97.500000 | 12.700000 | 0.405000 | 18.000000 | 3.807162e+06 | 55.77500 | 11.250000 | 31.600000 | 3.457500 | 2.688883e+06 |
| 50% | 76.000000 | 39.300000 | 2.266800e+05 | 3.150000e+04 | 2.495200e+04 | 123.780000 | 2.300000 | 31.100000 | 1.030000 | 5.398074e+10 | 101.900000 | 40.100000 | 1.160000 | 28.700000 | 1.027744e+07 | 62.25000 | 15.200000 | 37.700000 | 5.375000 | 5.894282e+06 |
| 75% | 163.500000 | 54.850000 | 6.339888e+05 | 1.235000e+05 | 9.631325e+04 | 156.785000 | 3.700000 | 44.700000 | 1.275000 | 2.789289e+11 | 108.600000 | 67.100000 | 2.330000 | 41.550000 | 3.314181e+07 | 68.80000 | 20.875000 | 48.200000 | 8.747500 | 1.770477e+07 |
| max | 26337.000000 | 82.600000 | 1.709824e+07 | 3.031000e+06 | 9.893038e+06 | 2740.270000 | 254.900000 | 98.300000 | 2.000000 | 1.991000e+13 | 142.500000 | 94.300000 | 11.720000 | 81.000000 | 1.397715e+09 | 86.80000 | 37.200000 | 219.600000 | 20.270000 | 8.429340e+08 |
Observations from describe()
1. Enrollment above 100%: The Gross Primary Education Enrollment column shows values above 100%, which at first looks like a data error. It is not. The Gross Enrollment Ratio counts all students currently in primary school regardless of age, divided by the official primary-age population. In many developing countries students start late or repeat grades, so the actual number of enrolled students exceeds the official-age count. A value of 140% simply means the system is absorbing over-age students — the data is valid and we keep it as-is.
2. Land area extremes: The smallest land area in the dataset belongs to Vatican City (~0.44 km², which the dataset rounds down to 0) and the largest to Russia (>17,000,000 km²). Both values were verified against Britannica and are accurate. This extreme range confirms why Land Area was flagged as a poor predictor: it is a geographic constant that reflects size, not environmental pressure.
# before dropping the unnecessary cols, sanity-check the data: do the lowest and highest land areas match reality?
# Convert 'Land Area(Km2)' to numeric on the raw country_data, since that column is no longer in simplified_df
country_data['Land Area(Km2)'] = pd.to_numeric(country_data['Land Area(Km2)'].str.replace(',', ''), errors='coerce')
# Get the lowest and highest values along with the corresponding countries
lowest_land_area = country_data.loc[country_data['Land Area(Km2)'].idxmin()]
highest_land_area = country_data.loc[country_data['Land Area(Km2)'].idxmax()]
# Display the results
print("Country with the lowest land area:")
print(lowest_land_area[['Country', 'Land Area(Km2)']])
print("\nCountry with the highest land area:")
print(highest_land_area[['Country', 'Land Area(Km2)']])
Country with the lowest land area: Country Vatican City Land Area(Km2) 0.0 Name: 73, dtype: object Country with the highest land area: Country Russia Land Area(Km2) 17098240.0 Name: 143, dtype: object
Dealing with the Nulls¶
With all our categorical values in order, we now analyze the percentage of null values per column to decide how to handle them and whether each column is even worth keeping. If a column were more than 50% null we would drop it, since at that point it would mislead more than help. The percentage is computed by dividing the null count per column by the total number of rows and multiplying by 100; we wrap the result in round(x, 2) to keep the output clean. The results show the nulls are manageable (the worst column, Minimum wage, sits just above 21%), so the next step is deciding what to fill them with, based on each column's outliers and the nature of its data.
# how many nulls? do we need to fill them? get the % of nulls per column
print(round(((simplified_df.isnull().sum())/simplified_df.shape[0]) * 100,2))
Density\n(P/Km2)                              0.00
Agricultural Land( %)                         2.83
Land Area(Km2)                                0.00
Armed Forces size                             7.55
Birth Rate                                    1.89
Co2-Emissions                                 3.77
CPI                                           6.60
CPI Change (%)                                6.60
Fertility Rate                                2.83
Forested Area (%)                             2.83
Gasoline Price                                6.60
GDP                                           0.00
Gross primary education enrollment (%)        2.83
Gross tertiary education enrollment (%)       2.83
Infant mortality                              2.83
Life expectancy                               2.83
Maternal mortality ratio                      5.66
Minimum wage                                 21.70
Out of pocket health expenditure              3.77
Physicians per thousand                       2.83
Population                                    0.00
Population: Labor force participation (%)     5.66
Tax revenue (%)                              11.32
Total tax rate                                4.72
Unemployment rate                             5.66
Urban_population                              1.89
Latitude                                      0.00
Longitude                                     0.00
Deforestation                                 0.00
dtype: float64
We will now plot some boxplots with the help of matplotlib and seaborn to visually inspect the outliers, how many there are, and the shape of the data, and decide how to move forward with them.
import matplotlib.pyplot as plt
import seaborn as sns
cols_to_plot = [
'Armed Forces size', 'Birth Rate', 'Agricultural Land( %)',
'Co2-Emissions', 'CPI', 'CPI Change (%)', 'Fertility Rate', 'Forested Area (%)',
'Gasoline Price', 'Gross primary education enrollment (%)', 'Gross tertiary education enrollment (%)',
'Infant mortality', 'Life expectancy', 'Minimum wage', 'Out of pocket health expenditure', 'Maternal mortality ratio',
'Physicians per thousand', 'Population: Labor force participation (%)', 'Tax revenue (%)', 'Total tax rate',
'Unemployment rate', 'Urban_population'
]
# get the grid
fig, axes = plt.subplots(nrows=6, ncols=4, figsize=(20, 25))
axes = axes.flatten()
for i, col in enumerate(cols_to_plot):
if col in simplified_df.columns:
sns.boxplot(x=simplified_df[col], ax=axes[i], color='skyblue')
else:
axes[i].axis('off')
plt.tight_layout()
plt.show()
After analyzing the distribution of our variables through box plots, we can define a tailored imputation strategy. Filling null values is not just about "filling the gaps," but about preserving the statistical integrity of the dataset to ensure our future predictions are not biased.
Mean¶
For variables with a relatively symmetric distribution and minimal outliers, the Mean is the most efficient representative value. In these cases, the data points are evenly distributed around the center, so the average accurately reflects the "typical" country in the set. We calculate the average of the existing data and replace NaN entries with this single value. For this, we use the .fillna() function from Pandas, feeding it the pre-calculated .mean() of each column as we loop over the columns selected for mean imputation.
- Variables: Birth rate, agricultural land (%), fertility rate, forested area, gasoline price, gross tertiary education, out-of-pocket health expenditure, physicians per thousand, labor force participation, and tax revenue.
# set the columns to impute with the mean
cols_mean_imputation = ['Birth Rate', 'Agricultural Land( %)', 'Fertility Rate', 'Forested Area (%)', 'Gasoline Price',
'Gross tertiary education enrollment (%)', 'Population: Labor force participation (%)', 'Tax revenue (%)',
'Out of pocket health expenditure', 'Physicians per thousand']
for col in cols_mean_imputation:
if col in simplified_df.columns:
mean_value = simplified_df[col].mean()
simplified_df[col] = simplified_df[col].fillna(mean_value)
else:
print(f"Column '{col}' not found in the DataFrame.")
# check that it worked
simplified_df[cols_mean_imputation].isnull().sum()
Birth Rate                                   0
Agricultural Land( %)                        0
Fertility Rate                               0
Forested Area (%)                            0
Gasoline Price                               0
Gross tertiary education enrollment (%)      0
Population: Labor force participation (%)    0
Tax revenue (%)                              0
Out of pocket health expenditure             0
Physicians per thousand                      0
dtype: int64
Median¶
When a dataset has "extreme" outliers (values that are much larger or smaller than the rest), the mean becomes misleading because it is pulled toward those extremes. The Median (the middle value) is far more robust; it doesn't care if the richest country has a GDP of 1 billion or 100 trillion; it only cares about the central position. The numerous dots outside the main range in our box plots indicate that a few countries would disproportionately inflate the average. The median ensures our "typical" value remains grounded. Here we use the .fillna() function from Pandas, feeding it the pre-calculated .median() of each specific column.
- Variables: Armed forces size, CO2 emissions, CPI, CPI change (%), gross primary education enrollment, total tax rate, unemployment rate, and urban population.
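A toy example (values invented for illustration) of the pull a single outlier exerts on the mean but not on the median:

```python
import pandas as pd

# Hypothetical mini-series: three typical values plus one extreme outlier
s = pd.Series([2.0, 3.0, 4.0, 1000.0])

print(s.mean())    # 252.25 — dragged toward the outlier
print(s.median())  # 3.5    — stays with the typical values
```

This is exactly why the median is the safer fill value for columns whose box plots showed long tails.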
# get the cols to impute with the median
cols_median_imputation = ['Armed Forces size', 'Co2-Emissions', 'CPI', 'CPI Change (%)',
                          'Gross primary education enrollment (%)', 'Total tax rate',
                          'Unemployment rate', 'Urban_population']
for col in cols_median_imputation:
if col in simplified_df.columns:
median_value = simplified_df[col].median()
simplified_df[col] = simplified_df[col].fillna(median_value)
else:
print(f"Column '{col}' not found in the DataFrame.")
# check that it worked
simplified_df[cols_median_imputation].isnull().sum()
Armed Forces size                         0
Co2-Emissions                             0
CPI                                       0
CPI Change (%)                            0
Gross primary education enrollment (%)    0
Total tax rate                            0
Unemployment rate                         0
Urban_population                          0
dtype: int64
KNN¶
For complex socio-economic indicators, a simple average (mean or median) might be too "lazy." K-Nearest Neighbors (KNN) is a more sophisticated approach. Instead of looking at one variable in isolation, it looks at the "whole profile" of a country. These variables are deeply interconnected: if a country is missing its "Life Expectancy" value, it makes more sense to look at the average of 5 countries with similar wealth and health infrastructure than to simply use the global average. The algorithm finds the "K" most similar records based on the other available features and averages their values to fill the missing cell. We use K = 5 because, according to references like Borade (2025), a K between 4 and 7 works best for smaller datasets like ours.
- Variables: Minimum wage, infant mortality, maternal mortality ratio, and life expectancy.
- Standardization: to use KNN we need to put the variables on a similar scale so that no single variable dominates the distance calculation. We choose standardization over min-max normalization because normalization is very sensitive to outliers, and our data contains important outliers that we don't want to "squash". Standardization centers the data so the mean is 0 and the standard deviation is 1.
So our workflow will follow the following logic:
- We take our specific columns and "standardize" them. This puts them all on a level playing field (Z-scores) so that a "high" CO2 emission and a "high" Infant Mortality are mathematically comparable. We will use StandardScaler from sklearn.
- Imputation (K=5): The KNNImputer looks at each country with a missing value and finds its 5 nearest neighbors based on their Z-scores. It then calculates the average of those 5 neighbors to fill the gap.
- De-scaling (Inverse Transform): Because we can't report "Z-scores" in our final results, we must convert the data back to its original units (dollars, tons, or years).
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
# first we scale the data
cols_knn_imputation = ['Minimum wage',
                       'Infant mortality',
                       'Maternal mortality ratio',
                       'Life expectancy',
                       ]
print("Nulls before KNN Imputation:\n", simplified_df[cols_knn_imputation].isnull().sum())
# standardization (z-scores)
scaler = StandardScaler()
# learn the distribution of the data, then use it to transform the data
scaled_knn_col_data = scaler.fit_transform(simplified_df[cols_knn_imputation])
# define the imputer
imputer = KNNImputer(n_neighbors=5)
# fill the gaps using the z-score matrix
imputed_scaled_data = imputer.fit_transform(scaled_knn_col_data)
# return to the original scale
imputed_data_original_scale = scaler.inverse_transform(imputed_scaled_data)
# replace the cols with the null-free versions
simplified_df[cols_knn_imputation] = imputed_data_original_scale
# check that it worked
print("\nNulls after KNN Imputation:\n", simplified_df[cols_knn_imputation].isnull().sum())
Nulls before KNN Imputation:
Minimum wage                23
Infant mortality             3
Maternal mortality ratio     6
Life expectancy              3
dtype: int64

Nulls after KNN Imputation:
Minimum wage                0
Infant mortality            0
Maternal mortality ratio    0
Life expectancy             0
dtype: int64
We continued by using the describe() function, which provides a comprehensive overview of the quantitative data. From it we can tell that a wide variety of countries is included, as the ranges are extensive; the maternal mortality ratio, for instance, has a standard deviation of over 200. There are outliers, but they seem to fall within the range of potentially valid values, as with CO2 emissions or GDP in big countries. Matching this against research, deforestation can plausibly be this high in, for instance, Brazil, where large fires mean the Amazon is undergoing deep deforestation. However, given what we eventually want to do, it is important to normalize the data for further handling.
# Statistical summary
simplified_df.describe()
| Density\n(P/Km2) | Agricultural Land( %) | Land Area(Km2) | Armed Forces size | Birth Rate | Co2-Emissions | CPI | CPI Change (%) | Fertility Rate | Forested Area (%) | Gasoline Price | GDP | Gross primary education enrollment (%) | Gross tertiary education enrollment (%) | Infant mortality | Life expectancy | Maternal mortality ratio | Minimum wage | Out of pocket health expenditure | Physicians per thousand | Population | Population: Labor force participation (%) | Tax revenue (%) | Total tax rate | Unemployment rate | Urban_population | Latitude | Longitude | Deforestation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 106.000000 | 106.000000 | 1.060000e+02 | 1.060000e+02 | 106.000000 | 1.060000e+02 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 1.060000e+02 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 106.000000 | 1.060000e+02 | 106.000000 | 106.000000 | 106.00000 | 106.000000 | 1.060000e+02 | 106.000000 | 106.000000 | 1.060000e+02 |
| mean | 479.150943 | 39.030097 | 8.851272e+05 | 1.620566e+05 | 19.531538 | 2.215082e+05 | 179.349434 | 7.151887 | 2.634369 | 29.903883 | 1.025354 | 5.149246e+11 | 101.867925 | 41.160194 | 19.844578 | 73.290414 | 137.819245 | 2.661470 | 32.097059 | 2.067961 | 5.103524e+07 | 62.055000 | 15.976596 | 42.05000 | 6.572075 | 2.838547e+07 | 20.736771 | 12.843249 | 7.645453e+04 |
| std | 2672.263495 | 21.215676 | 2.277325e+06 | 4.281555e+05 | 10.350644 | 1.000411e+06 | 283.299097 | 26.109576 | 1.320141 | 22.959239 | 0.363555 | 2.012593e+12 | 14.719760 | 28.099010 | 19.632022 | 7.637799 | 202.584966 | 3.160977 | 17.037435 | 1.805920 | 1.899496e+08 | 10.402732 | 6.751754 | 23.11915 | 4.418979 | 9.564752e+07 | 25.478154 | 57.034112 | 3.279191e+05 |
| min | 3.000000 | 0.600000 | 2.000000e+00 | 0.000000e+00 | 5.900000 | 5.100000e+01 | 99.550000 | -4.300000 | 0.980000 | 0.000000 | 0.000000 | 1.330000e+08 | 23.400000 | 0.800000 | 1.400000 | 52.800000 | 2.000000 | 0.010000 | 5.800000 | 0.010000 | 1.008400e+04 | 38.000000 | 0.000000 | 9.90000 | 0.090000 | 5.464000e+03 | -40.900557 | -106.346771 | 0.000000e+00 |
| 25% | 25.000000 | 23.450000 | 5.247350e+04 | 1.525000e+04 | 10.250000 | 6.548500e+03 | 114.282500 | 1.000000 | 1.635000 | 9.850000 | 0.802500 | 1.513707e+10 | 97.875000 | 12.800000 | 4.400000 | 68.125000 | 10.000000 | 0.425000 | 18.300000 | 0.405000 | 3.807162e+06 | 56.275000 | 11.525000 | 31.67500 | 3.522500 | 2.829324e+06 | 5.470049 | -6.318506 | 2.000000e+01 |
| 50% | 76.000000 | 39.165049 | 2.266800e+05 | 3.150000e+04 | 17.580000 | 2.495200e+04 | 123.780000 | 2.300000 | 2.170000 | 30.900000 | 1.025354 | 5.398074e+10 | 101.900000 | 40.400000 | 12.600000 | 74.950000 | 44.500000 | 1.395000 | 31.298529 | 1.990000 | 1.027744e+07 | 62.055000 | 15.976596 | 37.70000 | 5.375000 | 5.894282e+06 | 19.664689 | 17.551109 | 4.110000e+03 |
| 75% | 163.500000 | 54.525000 | 6.339888e+05 | 1.167500e+05 | 26.442500 | 9.031375e+04 | 154.337500 | 3.675000 | 3.305000 | 43.025000 | 1.257500 | 2.789289e+11 | 108.500000 | 66.225000 | 31.325000 | 78.600000 | 175.000000 | 3.297500 | 41.200000 | 3.067500 | 3.314181e+07 | 68.675000 | 20.175000 | 47.65000 | 8.277500 | 1.685433e+07 | 42.204540 | 42.065124 | 4.104750e+04 |
| max | 26337.000000 | 82.600000 | 1.709824e+07 | 3.031000e+06 | 46.080000 | 9.893038e+06 | 2740.270000 | 254.900000 | 6.910000 | 98.300000 | 2.000000 | 1.991000e+13 | 142.500000 | 94.300000 | 84.500000 | 85.400000 | 1140.000000 | 11.720000 | 81.000000 | 8.420000 | 1.397715e+09 | 86.800000 | 37.200000 | 219.60000 | 20.270000 | 8.429340e+08 | 64.963051 | 174.885971 | 3.256050e+06 |
Outliers and Skewness¶
Columns like Deforestation and Maternal mortality are likely "right-skewed". We will therefore explore the nature of the columns and their skewness, and then apply a Log Transformation to the skewed ones to make their distributions more "normal" for statistical tests. This section of the code addresses the skewness of certain numerical columns: we first get each column's skewness with the skew() function. Then, to avoid plotting all 29 columns, we only plot the ones with (skewness > 1) | (skewness < -1), which ended up being 17 of 29.
Why? Right-skewed distributions can affect the results of statistical analyses. By transforming these columns, we aim to achieve a more normal distribution, which is a common assumption for many statistical tests.
To create the histograms we used the .hist() function on each column's data with 30 bins, a purple coloring, and a bit of transparency (alpha=0.6) for visualization purposes.
# Calculate skewness for each numeric column
skewness = simplified_df.select_dtypes(include=['float64', 'int64']).skew()
# Filter columns with skewness greater than 1 or less than -1
skewed_cols = skewness[(skewness > 1) | (skewness < -1)].index
print(f"Skewed columns are {len(skewed_cols)} out of {len(simplified_df.columns)}")
# Create histograms for the skewed columns
num_cols = len(skewed_cols)
# function to do the figure size based on the number of columns we have to plot
fig, axes = plt.subplots(nrows=(num_cols // 4) + 1, ncols=4, figsize=(20, 5 * ((num_cols // 4) + 1)))
axes = axes.flatten()
for i, col in enumerate(skewed_cols):
axes[i].hist(simplified_df[col].dropna(), bins=30, color='#A865B5', alpha=0.6)
axes[i].set_title(f'Histogram of {col}')
axes[i].set_xlabel('Value')
axes[i].set_ylabel('Frequency')
# Hide any unused subplots
for j in range(i + 1, len(axes)):
axes[j].axis('off')
# save as image png
plt.savefig('skewed_columns_histograms.png')
plt.tight_layout()
plt.show()
Skewed columns are 17 out of 29
After visualizing the distributions of all numeric columns, it became clear that the data falls into three groups: columns that need a structural fix, columns that just have a couple of extreme values pulling the tail, and columns that are already in good shape even if a bit spread out.
Log Transformation
For 11 columns (Density, Land Area, Armed Forces size, CO2-Emissions, GDP, Population, Urban population, Maternal mortality ratio, Infant mortality, Deforestation, and Minimum wage), the histograms showed the same pattern: a massive spike near zero and an extremely long right tail stretching into the millions or billions. This is not caused by bad data — it reflects a real structural difference in scale between countries. Russia is 40× larger than the median nation. The US and China emit 100× more CO₂ than small nations. A linear scale simply cannot represent this fairly. The solution to this is np.log1p(), which applies the transformation log(1 + x). The +1 is key as it makes the function safe for zero values (since log(0) is undefined). After this transformation, proportional differences between countries are preserved and the distribution becomes far more symmetric and model-friendly.
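As a minimal, self-contained illustration (synthetic lognormal data, not the project dataset) of how np.log1p tames a long right tail:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
# Synthetic right-skewed data (lognormal), standing in for GDP-like columns:
# a spike of small values plus a very long right tail
raw = pd.Series(rng.lognormal(mean=10, sigma=2, size=1000))

print("skew before:", round(raw.skew(), 2))          # strongly positive
print("skew after :", round(np.log1p(raw).skew(), 2)) # near zero
```

After log1p, equal ratios between countries become equal distances, which is exactly the "proportional differences preserved" property described above.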
Outlier Capping
For 3 columns (CPI, CPI Change %, and Total tax rate), the distributions were actually healthy for the vast majority of countries; the problem was one or two extremes (e.g. a country with 250% inflation, or a tax rate above 200%). Applying a log transform here would distort the well-behaved majority just to accommodate a single outlier. Instead, we use .clip(upper=quantile(0.99)), which caps any value above the 99th percentile at that threshold.
Left Untouched
Fertility Rate, Unemployment rate, and Gross primary education enrollment were left as they were. Fertility Rate shows a natural two-peak (bimodal) shape, reflecting developed vs. developing countries; transforming it would erase a meaningful signal. Unemployment rate is the most normally distributed column in the entire dataset. And enrollment percentage is a variable that by its nature clusters near 100%. None of these are skewness problems; they are real-world patterns.
import numpy as np
# Columns that span orders of magnitude (counts, sizes, emissions) — log1p compresses the extreme tail
log_cols = [
'Density\n(P/Km2)', 'Land Area(Km2)', 'Armed Forces size',
'Co2-Emissions', 'GDP', 'Population', 'Urban_population',
'Maternal mortality ratio', 'Infant mortality', 'Deforestation', 'Minimum wage'
]
for col in log_cols:
if col in simplified_df.columns:
simplified_df[col] = np.log1p(simplified_df[col]) # log1p = log(1+x), safe for zeros
# Columns with 1-2 extreme outliers but otherwise fine — clip at 99th percentile
cap_cols = ['CPI', 'CPI Change (%)', 'Total tax rate']
for col in cap_cols:
if col in simplified_df.columns:
simplified_df[col] = simplified_df[col].clip(upper=simplified_df[col].quantile(0.99)) # remove extreme top values
# Quick check: skewness of the treated columns should now be much lower (closer to 0 for the log-transformed ones)
all_treated = log_cols + cap_cols
print(simplified_df[[c for c in all_treated if c in simplified_df.columns]].skew().round(2))
Density\n(P/Km2)            0.71
Land Area(Km2)             -1.42
Armed Forces size          -1.99
GDP                        -0.02
Population                 -0.75
Urban_population           -0.69
Maternal mortality ratio    0.08
Infant mortality            0.06
Deforestation              -0.50
Minimum wage                0.67
CPI                         6.01
CPI Change (%)              3.41
Total tax rate              1.30
dtype: float64
Collinearity & Correlation¶
In datasets like ours with higher dimensionality, a standard correlation matrix often becomes a "visual noise" problem, where the density of features makes it impossible to interpret specific coefficients (which is what happened when I first tried to plot one). Thus, to address this, we implemented a structured selection process to identify multicollinearity: a condition where two or more predictor variables are highly linearly related, which can undermine the statistical significance of independent variables in a regression model.
The steps to follow were to first analyze the absolute values ($|r|$) because both strong positive and strong negative relationships indicate redundancy. Then, to do triangular masking ($k=1$) to ensure each pair is only evaluated once and to exclude the diagonal, we applied a Triangle Upper (np.triu) mask with a diagonal offset of 1. Finally, we calculated the "connectivity" of each feature by summing its absolute correlations with all other variables. We then filtered the analysis to the Top 20 most connected features, allowing us to visualize the most critical clusters of redundancy without overwhelming the heatmap.
# correlation matrix
corr_matrix = simplified_df.corr().abs()
# Select the upper triangle of the matrix (to avoid duplicate pairs)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# find feature columns that have correlation greater than 0.80 with at least one other column
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
# print the specific pairs that are highly correlated (using a stricter 0.85 cutoff for the listing)
print("Highly Correlated Pairs:")
for col in to_drop:
connected = upper.index[upper[col] > 0.85].tolist()
print(f"{col} is highly correlated with: {connected} with corr values: {upper.loc[connected, col].values}")
Highly Correlated Pairs:
Fertility Rate is highly correlated with: ['Birth Rate'] with corr values: [0.97490843]
Infant mortality is highly correlated with: ['Birth Rate'] with corr values: [0.89625677]
Life expectancy is highly correlated with: ['Birth Rate', 'Fertility Rate', 'Infant mortality'] with corr values: [0.91483404 0.89357794 0.911579  ]
Maternal mortality ratio is highly correlated with: ['Birth Rate', 'Infant mortality', 'Life expectancy'] with corr values: [0.880848  0.9575965 0.9027807]
Minimum wage is highly correlated with: [] with corr values: []
Physicians per thousand is highly correlated with: [] with corr values: []
Population is highly correlated with: [] with corr values: []
Urban_population is highly correlated with: ['Population'] with corr values: [0.91331582]
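The Top-20 "connectivity" heatmap described above can be sketched as follows (the function name, figure size, colormap, and output filename are our own illustrative choices; in the notebook it would be called on simplified_df):

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt
import seaborn as sns

def top_connected_heatmap(df, top_n=20, fname="top_connected_corr.png"):
    """Rank features by connectivity (sum of |r| with every other feature),
    then draw a heatmap restricted to the top_n most connected ones."""
    corr_abs = df.corr().abs()
    connectivity = corr_abs.sum() - 1.0  # drop each feature's self-correlation of 1
    top = connectivity.sort_values(ascending=False).head(top_n).index.tolist()
    plt.figure(figsize=(10, 8))
    sns.heatmap(df[top].corr(), cmap="coolwarm", center=0)
    plt.tight_layout()
    plt.savefig(fname)
    plt.close()
    return top
```

For example, `top_connected_heatmap(simplified_df)` would save the filtered heatmap and return the ranked feature names, so the densest clusters of redundancy stay readable.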
The correlation matrix provides valuable insights:
Demographic Cluster:
Birth rate, fertility rate, infant mortality, and life expectancy are highly correlated (coefficients > 0.88). For model stability, it's best to keep only one. Infant mortality stands out as the most sensitive indicator, reflecting impacts from poverty, water quality, and air pollution, factors often tied to deforestation and poor land management. Life expectancy is largely a result of the other variables, while birth and fertility rates are nearly identical in meaning and correlation ($\approx 0.90$) to infant mortality, so they add little new information.

Population Cluster:
Urban and total population are strongly correlated (over 0.91). Keeping only urban population makes sense, as it provides more specific insight into urbanization, which is often associated with land-use changes and potential deforestation.
Dropping Feature-Leaked & Directly Redundant Variables
Before defining our final feature set, there is one more variable we need to remove, not because of collinearity but because of something more fundamental: data leakage. This was spotted while reviewing the variables at this stage.
Forested Area (%) must be dropped. This column measures the percentage of a country's land that is currently forested, which is essentially a record of how much deforestation has already happened. Our target variable, annual deforestation (in hectares), is a direct consequence of changes in this very metric over time. Keeping it would be like teaching a model to predict tomorrow's rain by giving it today's puddle size, the answer is already in it. The model would learn to predict deforestation by reading deforestation, not by understanding the socio-economic drivers behind it.
Land Area (Km2) is also worth dropping here. It is a fixed geographic constant: it does not change and does not reflect any environmental pressure. A country being physically large does not cause or prevent deforestation in any meaningful, modellable way; it just scales the raw hectare numbers. Since we already have Urban_population capturing the human footprint, land area adds no independent explanatory value and risks becoming a surrogate for country size rather than a true driver.
# drop leakage and non-informative geographic constant
cols_leakage_drop = ['Forested Area (%)', 'Land Area(Km2)']
simplified_df.drop(columns=cols_leakage_drop, inplace=True)
# confirm
simplified_df.shape
(106, 27)
Dropping Collinear Variables¶
From the correlation matrix, Birth Rate, Fertility Rate, Life expectancy, and Maternal mortality ratio form a tight demographic cluster with Infant mortality, where the pairs shown above have |r| > 0.88. Keeping all five would give the model redundant information — it would be like measuring the same thing five times. This inflates the variance of coefficient estimates and makes it impossible to interpret which variable is actually driving the prediction. We keep Infant mortality as the single representative of this cluster (it captures poverty, healthcare access, and development level in one number) and drop the other four. Additionally, Population is dropped because Urban_population already captures population pressure in a more policy-relevant form.
# Now you can safely drop the redundant ones
cols_to_drop = ['Fertility Rate', 'Birth Rate', 'Life expectancy',
'Maternal mortality ratio', 'Population']
simplified_df.drop(columns=cols_to_drop, inplace=True)
# confirm they were dropped
simplified_df.shape
(106, 22)
Relevant Variables¶
Having processed and cleaned the raw data, we now define our Target Variable and select the Features that will drive the analysis. The central aim of this study is to predict Annual Deforestation (Hectares). We have selected this as our dependent variable ($y$) because it represents a clear, absolute measure of environmental impact. By predicting hectares, we can quantify the physical loss of forest cover associated with specific socio-economic conditions.
Beyond simple prediction, this model is designed for Inference. We seek to understand not just how much deforestation will occur, but why it occurs. By analyzing the regression coefficients, we can determine:
- Which factors (e.g., population pressure, economic status, or health indicators) have the strongest statistical relationship with forest loss.
- Whether improvements in certain socio-economic markers (like minimum wage or education) act as "protectors" that mitigate deforestation or as "drivers" that make it worse.
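As a toy illustration of reading direction from coefficients (synthetic "driver" and "protector" features, invented for this sketch, not the project's variables), the sign of each fitted coefficient gives the direction of the association:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(42)
n = 500
pressure = rng.normal(size=n)    # hypothetical "driver" feature
protector = rng.normal(size=n)   # hypothetical "protector" feature
# Outcome rises with the driver and falls with the protector, plus noise
y_toy = 3.0 * pressure - 2.0 * protector + rng.normal(scale=0.5, size=n)

X_toy = np.column_stack([pressure, protector])
coefs = LinearRegression().fit(X_toy, y_toy).coef_
print(coefs)  # first coefficient positive (driver), second negative (protector)
```

A positive coefficient marks a driver and a negative one a protector; magnitudes are only comparable across features once those features share a scale.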
The features selected for this study were chosen based on the framework above. We use population and urbanization metrics to measure the physical footprint of humans on the land, economic indicators like GDP and Minimum Wage to analyze how financial health influences land-use decisions, and social and health markers, with Infant Mortality as a proxy for overall societal development and infrastructure quality.
# define the y (target) variables
y = simplified_df['Deforestation']
# define the X (features) variables now
X = simplified_df.drop(columns=['Deforestation'])
Data Partitioning¶
In machine learning, we do not use the entire dataset to train our model. Instead, we split the data into three distinct subsets to ensure the model generalizes well to new, unseen data and to prevent overfitting (where the model simply memorizes the training examples). We use a two-step train_test_split process to create the following sets.
- Training Set: This is the "Textbook." The model uses this data to learn the mathematical relationships between our socio-economic features and deforestation rates.
- Validation Set: This is the "Practice Quiz." We use this set to tune our model's hyperparameters (like the $K$ in KNN or the $\lambda$ in Regularization). It helps us detect if the model is becoming too complex.
- Test Set: This is the "Final Exam." This data is kept in a "vault" and is only used once the model is completely finished. It provides an unbiased evaluation of how the model will perform in the real world.
Why not just use two sets? If we only used a Training and Test set, we would eventually "leak" information. Every time we adjust our model because it performed poorly on the Test set, we are indirectly teaching the model the "answers" to the final exam. By introducing a Validation set, we ensure the Test set remains a true surprise for the model, preserving the integrity of our final accuracy metrics.
from sklearn.model_selection import train_test_split
X_train_full, X_test, y_train_full, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
X_train_full, y_train_full, test_size=0.2, random_state=42
)
Feature Selection¶
Baseline Model
Before doing the actual feature selection in our linear model, we should first build a baseline model. This is a great tool as it allows us to measure the "Value Add". If you don't know the $R^2$ (adjusted, especially) of a model with all features, you won't know if your feature selection actually helped. Sometimes, after dropping 10 variables, your $R^2$ stays the same, which is a huge win because it means you made a simpler, more efficient model.
However, this can also help us spot the overfitting "Red Flag". If your baseline $R^2$ on training is 0.67 but on validation it is 0.29, your model is essentially "memorizing" the countries. This tells you that your priority isn't adding more features, but rather simplifying or regularizing the model.
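sklearn's .score() returns the plain $R^2$; adjusted $R^2$ is not built in, but under the standard definition it can be computed with a small helper (our own sketch):

```python
def adjusted_r2(r2, n_samples, n_features):
    """Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1);
    it penalizes R^2 for each extra predictor p, so adding a useless
    feature can lower it even though plain R^2 never decreases."""
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

print(adjusted_r2(0.5, 101, 10))  # lower than the raw 0.5
```

With many predictors and few countries, the gap between raw and adjusted $R^2$ grows, which is precisely the small-sample concern here.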
This will be done in the simplest and most common way of running a linear regression: we import LinearRegression from sklearn, define the model, and fit it with the respective X and y training values. We then use the .score() function to get the $R^2$ on both sets and compare them.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
print("Train R2:", model.score(X_train, y_train))
print("Validation R2:", model.score(X_val, y_val))
Train R2: 0.6673497139211095
Validation R2: 0.2872152078297733
The baseline results confirm what we feared: Train $R^2$ = 0.667 vs. Validation $R^2$ = 0.287 — a gap of nearly 0.38 points. The model explains 67% of deforestation variance in the data it was trained on, but only 29% on data it has never seen. That is the hallmark of overfitting: the model memorized the training countries instead of learning the underlying pattern. This is most likely driven by having too many features relative to only ~106 countries, including some that are noisy or redundant. The next step is LASSO, which will identify and remove those weak features and force the model to generalize.
Lasso¶
The method we are using is LASSO. What makes LASSO special compared to Forward or Backward selection is that it doesn't evaluate features one by one in a greedy sequence; it evaluates all of them at once and applies a mathematical penalty (called lambda, $\lambda$) that shrinks weak coefficients toward zero. When a feature really does not contribute to explaining the target, LASSO pushes its coefficient all the way to exactly zero, effectively removing it from the model in one shot. We use LassoCV, where the CV stands for Cross-Validation: instead of us manually picking $\lambda$, the algorithm tries many values on its own and picks the one that generalizes best to unseen data, which is exactly the problem we are trying to fix.
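Concretely, in sklearn's parametrization (where our $\lambda$ appears as alpha), LASSO fits:

```latex
\hat{\beta} \;=\; \arg\min_{\beta}\; \frac{1}{2n}\,\lVert y - X\beta \rVert_2^2 \;+\; \lambda \sum_{j} \lvert \beta_j \rvert
```

It is the absolute-value penalty in the second term that can drive weak coefficients to exactly zero; ridge regression's squared penalty, by contrast, only shrinks them.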
Before running LASSO we need to standardize with StandardScaler. This transforms every feature to have mean = 0 and standard deviation = 1. This step is non-negotiable because LASSO penalizes all coefficients with the same $\lambda$: if GDP is in the billions and Birth Rate is between 1 and 8, the penalty hits them on completely different scales and the selection becomes unfair. Scaling levels the playing field.
We wrap both steps in a Pipeline, which chains them together so that scaling and LASSO always run in the correct order as a single unit. The optimizer also has an iteration cap that tells the process: "if you haven't converged after this many attempts, stop anyway." We set max_iter to 10,000 to ensure the optimization algorithm reaches convergence; given the complex correlations between socio-economic indicators, a higher iteration limit prevents the model from stopping prematurely.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
# build the pipeline: scale first, then run lasso
pipeline = Pipeline([
('scaler', StandardScaler()),
('lasso', LassoCV(cv=5, random_state=42, max_iter=10000))
])
pipeline.fit(X_train, y_train)
# how did it do?
print("Best lambda (alpha):", round(pipeline.named_steps['lasso'].alpha_, 4))
print("Train R2: ", round(pipeline.score(X_train, y_train), 4))
print("Validation R2:", round(pipeline.score(X_val, y_val), 4))
Best lambda (alpha): 0.2476
Train R2:  0.6052
Validation R2: 0.4746
What does this tell us? The $\lambda$ chosen by cross-validation is 0.2476. A moderate $\lambda$, above zero but far from the heavy-penalty regime, tells us the model did have redundant features that needed removing, but not so many that everything had to be discarded. LASSO zeroed out 10 features and kept 11, a meaningful simplification from the original set.
The LASSO Train $R^2$ = 0.605 and Validation $R^2$ = 0.475. Compared to the baseline (Train 0.667 / Validation 0.287), the gap has closed substantially — from 0.38 down to 0.13. That is a real improvement in generalization: the model is no longer memorizing. However, the validation score itself is still moderate, and that is worth being honest about.
The validation $R^2$ of 0.475 means the model explains around 47% of the variation in deforestation for countries it has never seen. There are a few honest reasons for this. First, we only have ~106 countries, which is a small dataset for regression — the model has limited data to learn from. Second, deforestation is driven heavily by political, historical, and geographic factors (e.g. Amazon basin, governance, land rights) that are simply not captured in any of the socio-economic columns we have. According to Sundararajan (2023), a model with $R^2$ above 0.4–0.5 on socio-economic data is considered reasonable, precisely because human behaviour and environmental outcomes are not fully quantifiable. The remaining unexplained variance is not failure — it is the honest limit of what this data can tell us.
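To see the shrinkage mechanism concretely, here is a small synthetic sketch (toy data, not the project dataset) of how raising $\lambda$ (called `alpha` in scikit-learn) progressively zeroes out coefficients:

```python
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Synthetic data: 100 samples, 8 features, only 3 of which drive the target.
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 8))
y = 3 * X[:, 0] - 2 * X[:, 1] + 1.5 * X[:, 2] + rng.normal(scale=0.5, size=100)

X_scaled = StandardScaler().fit_transform(X)
counts = []
for alpha in [0.01, 0.25, 1.0, 5.0]:
    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_scaled, y)
    kept = int(np.sum(lasso.coef_ != 0))  # non-zero coefficients survive
    counts.append(kept)
    print(f"alpha={alpha:>5}: {kept} non-zero coefficients")
```

As the penalty grows, the weak (noise) coefficients hit exactly zero first, then even the real ones get discarded; cross-validation is what keeps $\lambda$ in the useful middle range.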
import pandas as pd
# Extract which features LASSO kept (non-zero coefficient) and which it eliminated (zero)
lasso_step = pipeline.named_steps['lasso']
coef_df = pd.DataFrame({'feature': X_train.columns, 'coefficient': lasso_step.coef_})
selected = coef_df[coef_df['coefficient'] != 0].sort_values('coefficient', key=abs, ascending=False)
dropped = coef_df[coef_df['coefficient'] == 0]
print(f'Features kept: {len(selected)}')
print(f'Features zeroed out: {len(dropped)}')
Features kept: 11
Features zeroed out: 10
Results
The gap between Train $R^2$ (0.605) and Validation $R^2$ (0.475) is now only 0.13, compared to 0.38 before LASSO. That means the model is no longer memorizing as much. But $R^2$ alone is not enough to fully evaluate a regression model. Thus we need:
- RMSE (Root Mean Squared Error): the average prediction error in the same units as the target (hectares). It penalizes large errors more heavily because it squares them first. A lower RMSE means predictions are closer to reality.
- MAE (Mean Absolute Error): the average absolute error in hectares. Easier to interpret directly as "on average, our model is off by X hectares per country".
We will also need to interpret the selected coefficients: what does it actually mean that Urban Population has the strongest positive effect, or that Physicians per thousand has a strong negative one? That is where the scientific value of this model lives. The next cells cover all of this.
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
# get predictions on validation set (still in log1p scale)
y_pred_val_log = pipeline.predict(X_val)
# back-transform to real hectares using expm1 (inverse of log1p)
y_pred_val = np.expm1(y_pred_val_log)
y_val_real = np.expm1(y_val)
# RMSE in real hectares — penalizes large errors more
rmse = np.sqrt(mean_squared_error(y_val_real, y_pred_val))
# MAE in real hectares — average absolute error per country
mae = mean_absolute_error(y_val_real, y_pred_val)
print(f"Validation RMSE: {rmse:,.0f} hectares")
print(f"Validation MAE: {mae:,.0f} hectares")
print(f"Validation R²: {pipeline.score(X_val, y_val):.4f}")
Validation RMSE: 351,456 hectares
Validation MAE: 114,176 hectares
Validation R²: 0.4746
# see the real range of deforestation
defor_real = np.expm1(y)
print(f"Min: {defor_real.min():>15,.0f} hectares")
print(f"Max: {defor_real.max():>15,.0f} hectares")
print(f"Median: {defor_real.median():>15,.0f} hectares")
print(f"Mean: {defor_real.mean():>15,.0f} hectares")
print(f"Std: {defor_real.std():>15,.0f} hectares")
# express the errors as a percentage of the max and mean deforestation
print(f"MAE as % of max:  {(mae/defor_real.max() * 100):.3f} %")
print(f"MAE as % of mean: {(mae/defor_real.mean() * 100):.3f} %")
print(f"RMSE as % of max:  {(rmse/defor_real.max() * 100):.3f} %")
print(f"RMSE as % of mean: {(rmse/defor_real.mean() * 100):.3f} %")
Min:               0 hectares
Max:       3,256,050 hectares
Median:        4,110 hectares
Mean:          76,455 hectares
Std:         327,919 hectares
MAE as % of max:  3.507 %
MAE as % of mean: 149.339 %
RMSE as % of max:  10.794 %
RMSE as % of mean: 459.692 %
Model Performance
These metrics are computed in real hectares after reversing the log1p transformation with np.expm1(), so the numbers are directly interpretable.
To read these correctly, we first need to understand the distribution of the target variable:
- The median deforestation across countries is only 4,110 hectares; most countries barely deforest at all.
- The mean is 76,455 hectares, pulled upward by a handful of extreme cases.
- The max is 3,256,050 hectares: a single country (likely Brazil) that dominates the entire scale.
This extreme right-skew is the key context for interpreting every metric below.
MAE = 114,176 hectares: On average, the model's prediction is off by 114,176 hectares per country. Against the max (3.2M hectares), that looks tiny — only 3.5%. But against the mean, it is enormous. This tells us something honest: the model performs very well for the large deforesters (where 114K off on millions is acceptable), but it overestimates badly for small countries that barely deforest. This is an inherent consequence of training on a dataset dominated by a few extreme values.
RMSE = 351,456 hectares: The RMSE being 3x larger than the MAE is the most telling number here. RMSE squares errors before averaging, so it is disproportionately inflated by the one or two mega-deforesters where the model's prediction is far off in absolute terms. The gap between RMSE and MAE directly quantifies the influence of those outlier countries on overall model error.
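The RMSE/MAE gap is easy to reproduce with toy numbers (not the project data): five countries with modest errors plus one mega-deforester missed by a large margin.

```python
import numpy as np

# Hypothetical per-country prediction errors in hectares; the last entry
# represents one badly-missed mega-deforester.
errors = np.array([5_000, 8_000, 10_000, 12_000, 15_000, 900_000])

mae = np.mean(np.abs(errors))            # averages errors linearly
rmse = np.sqrt(np.mean(errors ** 2))     # squares first, so the outlier dominates
print(f"MAE:  {mae:,.0f} ha")
print(f"RMSE: {rmse:,.0f} ha")
print(f"RMSE / MAE ratio: {rmse / mae:.1f}x")
```

A single outlier more than doubles RMSE relative to MAE, which is exactly the pattern we see in the validation metrics above.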
R² = 0.475: Despite the large absolute errors, the model still explains 47.5% of the variance in deforestation across unseen countries — which is the most meaningful summary statistic given how heterogeneous this dataset is. The remaining 52.5% reflects drivers genuinely absent from our data: governance quality, proximity to tropical biomes, land tenure law enforcement, and commodity market pressures. According to Sundararajan (2023), an $R^2$ above 0.4 on cross-country socio-economic prediction is considered meaningful precisely because human and environmental behaviour cannot be fully captured in tabular national statistics.
print("Coefficient interpretation (standardized):")
print(selected.to_string(index=False))
Coefficient interpretation (standardized):
feature coefficient
Urban_population 1.604320
Physicians per thousand -1.226848
Density\n(P/Km2) -1.009355
Total tax rate 0.686123
Population: Labor force participation (%) 0.654856
Gasoline Price 0.371588
Latitude -0.349172
Longitude -0.335643
CPI Change (%) 0.312444
Co2-Emissions 0.059804
Tax revenue (%) -0.000582
Coefficient Interpretation¶
All features were standardized before fitting, so the coefficients are directly comparable: the larger the absolute value, the stronger the association with deforestation.
- Urban Population (+1.60): The single most powerful predictor. As urban population grows, deforestation increases. This reflects the direct pressure of urban expansion on surrounding land. Cities need agricultural supply chains, infrastructure corridors, and raw materials, all of which drive forest clearing at the frontier.
- Physicians per thousand (−1.23): More doctors means less deforestation. This is not about medicine directly: physicians per thousand is a well-established proxy for overall institutional quality and development. Countries with more doctors tend to have stronger governance, better enforcement of environmental law, and economies less dependent on extractive industries.
- Density (−1.01): Denser countries deforest less. This is counterintuitive at first, but it makes sense: the most densely populated countries (Netherlands, Singapore, South Korea) are already highly urbanized and have transitioned away from land-clearing agriculture. They also have less frontier land left to clear.
- Total tax rate (+0.69): Higher business tax burdens are associated with more deforestation. This likely reflects countries with extractive-economy structures.
- Labor force participation (+0.65): More working population means more economic activity and more land-use pressure. Countries where a high proportion of the population is working tend to clear more forest.
- Latitude (−0.35) and Longitude (−0.34): Geography matters. The negative latitude coefficient confirms that countries closer to the equator (lower latitude = tropical zone) have more deforestation: the Amazon, Congo Basin, and Southeast Asian forests are all near 0°. The longitude effect reflects the Americas (negative longitudes) being home to the world's largest deforestation events.
- Gasoline Price (+0.37): Countries with higher gasoline prices show more deforestation. This may be because economically stressed countries facing high energy costs often turn to charcoal and firewood as substitutes, accelerating forest clearing.
- CPI Change % (+0.31): Higher inflation is associated with more deforestation. Economic instability can push people toward short-term extractive activities such as mining, charcoal production, or frontier agriculture, all of which accelerate forest clearing.
- CO₂ Emissions (+0.06) and Tax revenue % (≈0): Both are near-zero and essentially negligible. CO₂ is a co-symptom of industrialization rather than a driver of deforestation specifically. Tax revenue was also largely zeroed out by LASSO: it does not independently explain deforestation once the other variables are accounted for.
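Because the target is log1p(hectares), a standardized coefficient translates into a multiplicative effect. A quick sketch of this reading, using the Urban_population coefficient from the table above and a hypothetical baseline country:

```python
import numpy as np

# beta is the standardized LASSO coefficient for Urban_population (from above).
beta = 1.60
multiplier = np.exp(beta)  # effect of a one-SD increase on (1 + hectares lost)
print(f"A one-SD rise in urban population multiplies (1 + hectares) by ~{multiplier:.1f}x")

# Hypothetical country losing 10,000 ha/year (illustrative figure, not real data):
baseline = 10_000
predicted = np.expm1(np.log1p(baseline) + beta)  # shift on the log1p scale, back-transform
print(f"{baseline:,} ha -> {predicted:,.0f} ha")
```

This is why even modest-looking coefficients on a log-scale target correspond to large real-world differences in hectares.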
Before building the non-linear model, we drop two features that LASSO kept but with near-zero coefficients: Co2-Emissions (0.06) and Tax revenue (%) (~0.00). These contribute essentially nothing to the prediction and only add noise. Removing them leaves us with 9 clean features for both models going forward. The linear model is re-fitted on these 9 features using the same $\lambda$ already found by cross-validation; there is no need to run CV again.
from sklearn.linear_model import Lasso
# drop the two near-zero LASSO features
final_cols = [col for col in selected['feature'].tolist()
              if col not in ['Co2-Emissions', 'Tax revenue (%)']]
X_train_f = X_train[final_cols]
X_val_f = X_val[final_cols]
X_test_f = X_test[final_cols]
# reuse the lambda already found by cross-validation (no re-running CV)
best_alpha = pipeline.named_steps['lasso'].alpha_
linear_f = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=best_alpha, max_iter=10000))
])
linear_f.fit(X_train_f, y_train)
Non-Linear Model¶
For the non-linear model we use a Random Forest Regressor. Polynomial features were a serious contender, but to try and test something new a Random Forest was chosen. The main reason is that deforestation is not a linear phenomenon. The relationship between urbanization and forest loss follows threshold dynamics: pressure builds slowly, then accelerates. Geographic position (latitude) interacts with other variables in ways a linear equation cannot represent. And the Environmental Kuznets Curve, well-documented in environmental economics, suggests that as GDP rises, deforestation first increases and then decreases, an inverted-U shape that is fundamentally non-linear. Random Forest captures all of this without any assumptions about the functional form, and it is also naturally robust to the extreme outliers (Brazil, Indonesia) that distort our linear model's errors.
Advantages:
- Reduces overfitting compared to individual decision trees.
- Handles large datasets with higher dimensionality.
- Provides feature importance scores, which can be useful for understanding the data.
- No scaling required (note: the log1p() applied earlier is a transformation of the target, so X_train still has its original values. The StandardScaler only lives inside the LASSO pipeline and never touches X_train directly; trees are insensitive to feature scale anyway.)
Parameters and Evaluation
- n_estimators=100: We generate 100 individual decision trees to ensure the model's predictions are stable and to reduce the variance of the final output.
- random_state=42: This ensures that our results are reproducible every time the notebook is run.
By comparing the $R^2$ scores of the LASSO (Linear) model against the Random Forest (Non-Linear) model, we can determine whether the underlying drivers of deforestation follow a simple straight-line trend or require a more complex, non-linear structure.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_f, y_train)
print('Linear (Lasso)')
print(f'Train R2: {linear_f.score(X_train_f, y_train):.4f}')
print(f'Val R2: {linear_f.score(X_val_f, y_val):.4f}')
print()
print('Random Forest')
print(f'Train R2: {rf.score(X_train_f, y_train):.4f}')
print(f'Val R2: {rf.score(X_val_f, y_val):.4f}')
Linear (Lasso)
Train R2: 0.6040
Val R2: 0.4772

Random Forest
Train R2: 0.9227
Val R2: 0.5377
#RF predictions back to real hectares
y_pred_rf = np.expm1(rf.predict(X_val_f))
y_val_real = np.expm1(y_val)
print(f"RF RMSE: {np.sqrt(mean_squared_error(y_val_real, y_pred_rf)):,.0f} hectares")
print(f"RF MAE: {mean_absolute_error(y_val_real, y_pred_rf):,.0f} hectares")
RF RMSE: 71,279 hectares
RF MAE: 38,604 hectares
The Random Forest error metrics are dramatically lower than the linear model's: RMSE dropped from 351,456 to 71,279 hectares (nearly 5x better) and MAE dropped from 114,176 to 38,604 hectares, roughly 3x better. On average, the Random Forest predicts a country's annual deforestation within 38,604 hectares, which against a median of 4,110 hectares and a max of 3.2 million still reflects the difficulty of the task, but represents a meaningful improvement in predictive power over the linear model.
Model Comparison¶
Both models were trained on the same 9 features and evaluated on the same validation set.
The Random Forest shows a higher Train R² because trees can memorize patterns. The key number is therefore the Validation R², the honest measure on unseen countries rather than memorized ones. The Random Forest wins on validation with roughly 0.54, which confirms that the relationships are genuinely non-linear and the extra flexibility is justified. In other words, the model explains around 54% of the variation in deforestation across countries it has never seen.
For this project, since the goal is not just prediction but inference (understanding why deforestation happens), interpretability matters. The linear model's coefficients have a direct, explainable meaning. The Random Forest only produces feature importances, telling us which variables matter but not how or in which direction. For the inference section we therefore use the linear model; for pure prediction, as the reflection will show, the Random Forest is the better choice.
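To make that contrast concrete, here is a self-contained sketch on synthetic data (hypothetical feature names, not the project dataset) of what feature importances do and do not tell you:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Synthetic data where urban_pop has the strongest (positive) effect and
# physicians a weaker (negative) one; density is pure noise.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 3)),
                 columns=['urban_pop', 'physicians', 'density'])
y = 2 * X['urban_pop'] - 1.5 * X['physicians'] + rng.normal(scale=0.3, size=200)

model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(imp)
# Importances sum to 1 and rank the features, but they carry no sign: they
# cannot say whether physicians *raises* or *lowers* the target, only that it matters.
```

This signlessness is precisely why the coefficients of the linear model remain the tool of choice for inference.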
import statsmodels.api as sm
# scale features so coefficients are comparable, then add intercept
scaler_inf = StandardScaler()
X_inf_scaled = scaler_inf.fit_transform(X_train_f)
X_inf = sm.add_constant(X_inf_scaled)
# OLS gives p-values and confidence intervals
ols = sm.OLS(y_train, X_inf).fit()
inf_df = pd.DataFrame({
'feature' : ['intercept'] + final_cols,
'coef' : ols.params.round(4),
'p_value' : ols.pvalues.round(4),
'CI_low' : ols.conf_int()[0].round(4),
'CI_high' : ols.conf_int()[1].round(4)
})
inf_df['significant'] = inf_df['p_value'] < 0.05
print(inf_df.to_string(index=False))
feature coef p_value CI_low CI_high significant
intercept 7.2506 0.0000 6.5387 7.9626 True
Urban_population 2.0431 0.0000 1.2625 2.8238 True
Physicians per thousand -1.5410 0.0010 -2.4323 -0.6496 True
Density\n(P/Km2) -1.2374 0.0028 -2.0304 -0.4444 True
Total tax rate 0.8001 0.0433 0.0251 1.5752 True
Population: Labor force participation (%) 0.9724 0.0121 0.2208 1.7239 True
Gasoline Price 1.0962 0.0119 0.2509 1.9414 True
Latitude -0.3935 0.3749 -1.2747 0.4876 False
Longitude -0.6444 0.0989 -1.4137 0.1249 False
CPI Change (%) 0.5874 0.1271 -0.1723 1.3470 False
Inference Conclusions¶
The OLS inference model tells us which associations are statistically significant (p < 0.05) and gives a 95% confidence interval for each coefficient: the plausible range for the true effect size.
Significant findings (p < 0.05): Six features reach significance: Urban population, Physicians per thousand, Density, Labor force participation, Gasoline Price, and Total tax rate. The first three show the strongest and most reliable associations; they survive regularization and replicate across folds, so they are not coincidences. Urban population driving deforestation upward, and physicians per thousand driving it downward (a proxy for institutional quality), are the two most robust conclusions of this analysis.
Non-significant features (p ≥ 0.05): Latitude, Longitude, and CPI Change contributed to prediction, but their individual effects (with all other variables held constant) cannot be distinguished from noise at 95% confidence with only 106 observations. Wide confidence intervals reflect the small sample.
Coefficient values: as discussed above, the sign of each coefficient tells us the direction of the association, and because all variables are scaled, the magnitudes tell us the relative size of each effect.
Relevance for Mexico: The two most robust findings — urban population driving deforestation up, and institutional quality (physicians per thousand) driving it down — translate directly to Mexico's situation. Rapid urbanization in southern states paired with historically weaker institutional enforcement in those regions creates exactly the risk profile this model flags. The confidence intervals are wide, but the direction is clear.
Final Evaluation — Test Set¶
All decisions so far (feature pruning, model tuning) were based on the validation set. The test set was never touched — evaluating here gives the most honest measure of generalization.
Note: we use np.expm1() (inverse of log1p) to back-transform predictions to real hectares before computing metrics.
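A minimal round-trip check of the transformation pair, using the summary statistics reported earlier. The key detail is that log1p handles the dataset's minimum of 0 hectares, where plain log would be undefined:

```python
import numpy as np

# min / median / mean / max deforestation from the summary above
hectares = np.array([0.0, 4_110.0, 76_455.0, 3_256_050.0])
transformed = np.log1p(hectares)   # log(1 + x): maps 0 -> 0, compresses the giants
recovered = np.expm1(transformed)  # exp(x) - 1: exact inverse, back to hectares
print(transformed.round(2))
print(recovered.round(0))
```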
Reading the Results¶
On R²: The test R² values — especially the RF's 0.0135 — look alarming at first. But R² is extremely sensitive to the variance of the test set. With only 21 test countries and deforestation being massively right-skewed, if the high-deforestation giants (Brazil, Indonesia, Congo) happen to land in the training set, the test set is left with mostly low-deforestation countries that all look similar. When the target barely varies, R² collapses even if the predictions are numerically close — R² measures explained variance, not prediction accuracy.
On RMSE and MAE: These tell the more honest story. The RF test MAE of 10,069 hectares means the model is off by ~10K ha on average for test countries. Given that the median deforestation globally is ~4,110 ha and the mean is ~76,455 ha, an error of 10K ha is very reasonable for countries in the low-to-mid range. The linear model's test MAE of 24,596 ha is also substantially better than its validation MAE (114,176 ha), again suggesting the test countries happened to be the 'easier' (lower-deforestation) ones. Both models are performing reasonably — the low R² is a statistical artifact of the test set composition, not model failure.
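The sensitivity of R² to target variance is easy to demonstrate with toy numbers: identical absolute errors yield a near-perfect R² on a wide-spread target and a negative R² on a narrow one.

```python
import numpy as np
from sklearn.metrics import r2_score

# The same per-country errors applied to two hypothetical test sets.
errors = np.array([3_000, -2_000, 1_500, -2_500, 2_000])

y_wide = np.array([1_000, 50_000, 300_000, 900_000, 3_000_000])  # giants included
y_narrow = np.array([2_000, 3_500, 4_000, 5_000, 6_500])          # all small deforesters

r2_wide = r2_score(y_wide, y_wide + errors)
r2_narrow = r2_score(y_narrow, y_narrow + errors)
print(f"Wide targets   R² = {r2_wide:.4f}")
print(f"Narrow targets R² = {r2_narrow:.4f}")
```

Same errors, wildly different R²: when the test set happens to contain only similar, low-deforestation countries, R² collapses even though the predictions are just as close.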
# unlock the test set
y_test_real = np.expm1(y_test)
# linear
y_pred_lin = np.expm1(linear_f.predict(X_test_f))
lin_r2 = linear_f.score(X_test_f, y_test)
lin_rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_lin))
lin_mae = mean_absolute_error(y_test_real, y_pred_lin)
# random forest
y_pred_rf_t = np.expm1(rf.predict(X_test_f))
rf_r2 = rf.score(X_test_f, y_test)
rf_rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_rf_t))
rf_mae = mean_absolute_error(y_test_real, y_pred_rf_t)
print(f' Linear (LASSO) Random Forest')
print(f'R² {lin_r2:>14.4f} {rf_r2:.4f}')
print(f'RMSE (ha) {lin_rmse:>14,.0f} {rf_rmse:,.0f}')
print(f'MAE (ha) {lin_mae:>14,.0f} {rf_mae:,.0f}')
          Linear (LASSO)   Random Forest
R²                0.1348          0.0135
RMSE (ha)         74,648          17,874
MAE (ha)          24,596          10,069
Conclusions and Critical Reflection¶
Predictive Models¶
Two models were trained on the same 9 features and evaluated on validation and test data:
| | Linear (LASSO) | Random Forest |
|---|---|---|
| Train R² | 0.605 | 0.923 |
| Validation R² | 0.475 | 0.538 |
| Val RMSE | 351,456 ha | 71,279 ha |
| Val MAE | 114,176 ha | 38,604 ha |
| Test R² | 0.1348 | 0.0135 |
| Test RMSE / MAE | 74,648 / 24,596 ha | 17,874 / 10,069 ha |
A note on the test set: The test set contains only 21 countries which is a very small sample. With deforestation this skewed (a handful of countries account for the vast majority of global forest loss), whether Brazil or Indonesia land in training or test can swing R² dramatically. A lower test R² does not invalidate the models; it reflects honest generalization difficulty on a small, extreme dataset. The validation metrics above are the more stable reference.
Which model to use depends on the goal:
- If the goal is prediction: use Random Forest. It cuts RMSE by roughly 5x and MAE by 3x on the validation set. It captures non-linear interactions between variables that the linear model simply cannot.
- If the goal is understanding: use the Linear (LASSO) model. Its coefficients are directly interpretable: you can say "a one standard-deviation increase in urban population is associated with +1.60 units of log-deforestation, holding all else equal." Random Forest cannot offer that.
In practice, and in our opinion, the two are complementary: the forest predicts, the linear model explains. That is exactly why we built both.
Inference¶
The OLS inference model was built on the same features with standardized inputs, so all coefficients are directly comparable. A larger absolute value means a stronger association with deforestation.
Key findings (standardized LASSO coefficients shown; the OLS estimates agree in sign):
- Urban population (+1.60): The strongest predictor. Countries with larger urban populations tend to have higher deforestation — likely driven by demand for land, infrastructure, and agriculture to feed growing cities.
- Physicians per thousand (−1.23): Countries with more doctors per capita deforest less. This is a proxy for institutional quality and investment in public services — countries that fund healthcare also tend to have stronger environmental governance.
- Density (−1.01): Denser countries deforest less, possibly because land pressure in dense areas is already resolved through vertical development, and less frontier land is available to clear.
- Total tax rate (+0.69): Higher business tax burden is associated with more deforestation — possibly reflecting economic stress that pushes activity toward resource extraction.
- Labor force participation (+0.65): More working-age people in the workforce correlates with more deforestation, potentially through agricultural and extractive industry activity.
- Gasoline Price (+0.37), Latitude (−0.35), Longitude (−0.34), CPI Change% (+0.31): Smaller but consistent signals. Geography captures proximity to tropical forest biomes. Inflation hints at economic instability driving short-term resource exploitation.
Important note: association is not causation. These are cross-sectional observations from a single year. A country that deforests a lot also tends to be urbanizing fast, but we cannot say urbanization causes deforestation from this data alone.
Real Applications and Margins of Error¶
Despite the limitations, this model has practical use:
- Mexico specifically: Mexico's forest cover in states like Chiapas, Oaxaca, and Campeche is under documented pressure. The model's inference findings — that urban growth and low institutional quality are the strongest signals — point to monitoring urbanization rates in southern Mexico and strengthening environmental enforcement capacity as the highest-leverage interventions. The RF model could be applied to Mexico's state-level data if such indicators were available at that resolution.
- Policy prioritization: The inference results suggest that investments in healthcare infrastructure, urban planning, and institutional quality are associated with lower deforestation — even after controlling for economic size. Policymakers in developing nations could use this as supporting evidence for integrated development strategies.
- Country-level estimation: The Random Forest model can estimate annual deforestation for a country given only its socio-economic profile. For a country with median deforestation (~4,110 ha), the RF MAE of 38,604 ha is large in relative terms — but for high-deforestation countries (hundreds of thousands of hectares), the model gives a useful ballpark.
- Margin of error in practice: Predictions should always be reported with uncertainty. The RMSE of 71,279 ha on the validation set represents the typical prediction band. For a country predicted to lose 200,000 ha/year, a realistic honest range would be roughly ±70,000 ha. For small-deforestation countries, the model is less reliable and should not be used in isolation.
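One practical way to attach such an uncertainty band to a Random Forest prediction is the spread of the individual trees' predictions. This is a sketch on synthetic data of how one *might* report it, not something done in the notebook above:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression problem standing in for the country-level data.
rng = np.random.default_rng(1)
X = rng.normal(size=(150, 4))
y = X @ np.array([2.0, -1.0, 0.5, 0.0]) + rng.normal(scale=0.5, size=150)

rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
x_new = rng.normal(size=(1, 4))  # one hypothetical new observation

# The forest's prediction is the mean over its trees; their spread gives a band.
per_tree = np.array([tree.predict(x_new)[0] for tree in rf.estimators_])
mean, spread = per_tree.mean(), per_tree.std()
print(f"Prediction: {mean:.2f} ± {2 * spread:.2f} (rough band from tree spread)")
```

For the deforestation model, reporting the validation RMSE alongside this per-country tree spread would give stakeholders two complementary uncertainty signals.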
Limitations¶
- Small sample: Only 106 countries after merging both datasets. The test set contains just 21 countries — at this scale, a single outlier (e.g. Brazil or Indonesia) can swing R² dramatically. Results should be interpreted with this in mind.
- Single year snapshot: Both datasets reflect one point in time. Deforestation is a dynamic process — the same country can deforest heavily one year and recover the next due to policy changes, fires, or economic shocks. Cross-sectional data cannot capture this.
- Target skewness: Even after log1p transformation, deforestation is extremely right-skewed. Most countries barely deforest; a handful account for the majority. This makes standard metrics like RMSE and R² harder to interpret fairly.
- Merge coverage: Countries not present in both datasets were dropped. This disproportionately excludes smaller or less-documented nations, precisely the ones where data-driven policy tools might be most useful.
- Confounding variables: Important drivers like deforestation policy, enforcement capacity, corruption index, or presence of indigenous land rights were not available in these datasets and could explain much of the residual variance. A more comprehensive set of features could help.
Future Work¶
- Add time-series data to model trends in deforestation over years, not just a cross-sectional snapshot
- Include land-use policy variables: protected area coverage, deforestation moratoriums, certification schemes (e.g. FSC)
- Build regional sub-models (Latin America, Southeast Asia, Sub-Saharan Africa), since forest dynamics differ fundamentally across biomes. It would be especially interesting to model parts of Mexico or compare Mexico to other regions.
- Explore causal inference methods (instrumental variables, difference-in-differences) to move beyond association toward causation
- Expand to all ~195 countries by using imputation or alternative data sources to recover missing observations
- Engineer **new features** (feature construction), such as an urban density ratio (urban population divided by total land area) or a 'Wealth-Growth Interaction' (the ratio of birth rate to GDP per capita, as a signal of national expansion pressure)
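A sketch of what those constructed features could look like, with hypothetical column names and values standing in for whatever the merged dataset actually uses:

```python
import pandas as pd

# Two made-up countries; column names are assumptions, not the real schema.
df = pd.DataFrame({
    'Urban_population': [9.2e6, 1.1e6],
    'Land Area (Km2)':  [1.96e6, 4.5e4],
    'Birth Rate':       [17.3, 9.8],
    'GDP per capita':   [9_900, 52_000],
})

# Proposed constructed features from the bullet above.
df['Urban density'] = df['Urban_population'] / df['Land Area (Km2)']
df['Wealth-Growth Interaction'] = df['Birth Rate'] / df['GDP per capita']
print(df[['Urban density', 'Wealth-Growth Interaction']])
```

Ratios like these can expose pressure signals that neither numerator nor denominator captures alone, at the cost of some interpretability.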
References¶
- Borade, N. (2025). Choosing the Right K for KNN: A Deep Dive into Model Performance. Medium. https://medium.com/@nikhilsborade0412/choosing-the-right-k-for-knn-a-deep-dive-into-model-performance-86d8371fef78
- World Bank. (n.d.). World Development Indicators: Gross enrollment ratio, primary. World Bank DataBank. https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SE.PRM.ENRR.FE
- Sundararajan, B. (2023). R2 in Machine Learning: Deciphering Model Effectiveness. Medium. https://medium.com/@bragadeeshs/r2-in-machine-learning-deciphering-model-effectiveness-7ed314fc6c0c
- Elgiriyewithana, N. (2023). Global Country Information Dataset 2023. Kaggle. https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023/data
- Food and Agriculture Organization of the United Nations. (2025). Annual Deforestation. Our World in Data. https://ourworldindata.org/deforestation