In [100]:
# Give Access to google drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Introduction¶

"Happiness is complex and multifaceted. While economic prosperity (GDP) is often cited as a key determinant, can it alone explain national well-being? This analysis explores in this notebook I will go beyond a Exploratory Data Analysis (EDA), as this will lead into a single variable linear and multivariable linear regression both to get some inferences (or some predictions)

In [101]:
# import the library
import pandas as pd

# read the csv into a variable
csv_path = "/content/drive/My Drive/Colab Notebooks/AI/A1.2 Felicidad y GDP.csv"
happiness_index_data = pd.read_csv(csv_path)

# print the first few lines to test the imports
happiness_index_data.head()
Out[101]:
Pais Felicidad GDP
0 Finland 7.8210 2.718370e+11
1 Denmark 7.6362 3.560850e+11
2 Iceland 7.5575 2.171808e+10
3 Switzerland 7.5116 7.522480e+11
4 Netherlands 7.4149 9.138650e+11

Data Exploration¶

To understand the data we are dealing with, how "clean" it is, and what our variables will be, we begin by checking the type of each column (numerical or categorical) and the number of nulls found in each one. Using the .info() function, we verified the data schema and memory allocation. The dataset consists of 141 observations (rows), with no missing values detected across our primary features. This high level of data completeness ensures that our regression coefficients will not be biased by incomplete records.

In our data we have the following column categories:

  • Categorical -> Pais (country). This serves as our primary index and geographic identifier.
  • Numerical -> Felicidad (happiness) and GDP. These are the quantitative metrics that will form our Target ($y$) and Predictor ($X$) variables, respectively.
In [102]:
# lets start with seeing the nature of the columns
happiness_index_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pais       141 non-null    object 
 1   Felicidad  141 non-null    float64
 2   GDP        141 non-null    float64
dtypes: float64(2), object(1)
memory usage: 3.4+ KB

Now, to see the scale and distribution of the numerical data, we apply the .describe() function, which summarizes the data and lets us check whether it makes sense for the analysis; very large values or a GDP of zero could imply that the data is incorrect or needs some cleaning.

It is from this that we can identify that:

  • There is a large gap in the GDP data: while the mean GDP is roughly $\$588$ billion, the maximum is over $\$20.8$ trillion ($2.08 \times 10^{13}$). This gap confirms that a few "super-economies" (like the USA or China) are massive outliers compared to the rest of the 141 countries. If left unaddressed, the regression line will be "pulled" toward that $20$ trillion dollar point, ignoring the trend of the other $140$ countries. This makes it imperative that we apply a transformation to the values.
In [103]:
# describe the statistics of the numerical columns
happiness_index_data.describe()
Out[103]:
Felicidad GDP
count 141.000000 1.410000e+02
mean 5.560004 5.889942e+11
std 1.098011 2.221612e+12
min 2.403800 1.223876e+09
25% 4.887900 1.805117e+10
50% 5.585300 6.215800e+10
75% 6.309100 3.452960e+11
max 7.821000 2.089370e+13

To ensure the integrity of our economic indicators, we performed a validation check on the extreme values within our feature matrix. By isolating the boundaries of our dataset, we can confirm that the observed variances represent real-world economic disparities rather than data entry errors.

  1. Using the nlargest and nsmallest functions and saving the results into the richest_country and poorest_country variables, we printed and identified the countries at opposite ends of the economic spectrum:
    • Upper Bound: The two highest observations are the United States ($20.8$T) and China ($14.7$T). These values align with global economic realities, confirming the accuracy of our data source.
    • Lower Bound: The lowest GDP in our sample belongs to Comoros. Cross-referencing this against secondary economic research confirms its ranking as an emerging economy, validating the lower bound of our dataset.

Why do they matter?

The presence of these extreme values poses a critical challenge for a standard linear regression: because observations like the USA and China are so far from the mean, they can disproportionately "pull" the regression slope ($\beta_1$) toward themselves, resulting in a model that reflects their behavior rather than the general population.
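To make that "pull" concrete, here is a quick back-of-the-envelope check; this is a sketch using only the summary statistics reported by .describe() above, not part of the original pipeline:

```python
# Sketch: quantify how extreme the top GDP value is, using the
# summary statistics copied from the .describe() output above.
mean_gdp = 5.889942e11   # mean GDP
std_gdp = 2.221612e12    # standard deviation of GDP
max_gdp = 2.089370e13    # maximum GDP (United States)

# Distance of the maximum from the mean, in standard deviations
z_score = (max_gdp - mean_gdp) / std_gdp
print(f"The max GDP sits ~{z_score:.1f} standard deviations above the mean")
```

An observation sitting roughly nine standard deviations out will dominate a least-squares fit, which is exactly the distortion discussed above.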

Visualization

To quantify this dispersion, we use a combination of Matplotlib and Seaborn to generate a box plot with the sns.boxplot function. This visualization lets us see how the data is spread and confirms that the top economies are indeed statistical outliers that require specific handling.

In [104]:
# Isolate the two highest GDP countries to confirm context
richest_country = happiness_index_data.nlargest(2, 'GDP')
print(f"The maximum GDP of {richest_country['GDP'].values[0]:.2e} belongs to: {richest_country['Pais'].values[0]}")
print(f"The second maximum GDP of {richest_country['GDP'].values[-1]:.2e} belongs to: {richest_country['Pais'].values[-1]}")

# Isolate the poorest country
poorest_country = happiness_index_data.nsmallest(1, 'GDP')
print(f"\nThe lowest GDP of {poorest_country['GDP'].values[0]:.2e} belongs to: {poorest_country['Pais'].values[0]}")
The maximum GDP of 2.09e+13 belongs to: United States
The second maximum GDP of 1.47e+13 belongs to: China

The lowest GDP of 1.22e+09 belongs to: Comoros
In [105]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 2))
sns.boxplot(x=happiness_index_data['GDP'], color='lightpink')
plt.title('Visual Identification of GDP')
plt.show()
[Figure: box plot of the raw GDP distribution]

Transformations to the data¶

The summary statistics reveal a massive scale difference between the happiness values and the GDP, so viewing them on a single chart would obscure the linear relationship we are trying to establish. There are two possible choices, and while 'scaling' the GDP (e.g., dividing by $10^9$ to represent values in billions) improves human readability, it does not address the underlying mathematical skewness we see in the box plot and the GDP data in general.

So the choice was to explore a logarithmic transformation to improve model linearity.

Hence, in the next few cells we will import numpy to manage the transformations.

Skewness (in case the definition is blurry to you): a statistical measure that describes the asymmetry of a dataset's distribution around its mean. While a "normal" distribution looks like a perfectly symmetrical bell, skewed data is "leaned" or stretched to one side. It is especially important for linear regression, which relies on the data being roughly symmetrical.
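The effect of the log on skewness can be sketched with pandas' built-in .skew() method. The snippet below uses a synthetic log-normal sample (illustrative only, not the actual dataset) whose shape mimics the right-skewed GDP column:

```python
import numpy as np
import pandas as pd

# Synthetic, heavily right-skewed sample resembling the GDP column
# (log-normal, with parameters loosely based on the describe() output).
rng = np.random.default_rng(42)
gdp_like = pd.Series(np.exp(rng.normal(loc=25, scale=1.9, size=141)))

# A large positive skew collapses toward ~0 after the log transform
print(f"Skewness of raw values: {gdp_like.skew():.2f}")
print(f"Skewness of log values: {np.log(gdp_like).skew():.2f}")
```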

In [106]:
# import numpy to be able to manage the transformations
import numpy as np

# Create log-transformed GDP
happiness_index_data['log_GDP'] = np.log(happiness_index_data['GDP'])

# describe the data to see the differences
happiness_index_data.describe()
Out[106]:
Felicidad GDP log_GDP
count 141.000000 1.410000e+02 141.000000
mean 5.560004 5.889942e+11 25.155928
std 1.098011 2.221612e+12 1.895052
min 2.403800 1.223876e+09 20.925289
25% 4.887900 1.805117e+10 23.616476
50% 5.585300 6.215800e+10 24.852945
75% 6.309100 3.452960e+11 26.567668
max 7.821000 2.089370e+13 30.670469

Following the logarithmic adjustment of our economic data, we must now verify that this transformation has successfully addressed the asymmetry identified in the raw GDP values. This phase serves as the final step of our Data Understanding process.

In [107]:
# do the 1st scatter plot with the original data
plt.scatter(happiness_index_data['GDP'], happiness_index_data['Felicidad'], alpha=0.5, c='c')
plt.xlabel('Raw GDP (USD)')
plt.ylabel('Happiness Score')
plt.title('Happiness vs. Raw GDP')
# This forces the X-axis to show scientific notation (e.g., 1e13) for readability
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.show()
[Figure: scatter plot of Happiness vs. raw GDP]
In [108]:
plt.scatter(happiness_index_data['log_GDP'], happiness_index_data['Felicidad'], alpha=0.5, c='m')
plt.xlabel('Logged GDP (USD)')
plt.ylabel('Happiness Score')
plt.title('Happiness vs. Transformed Log GDP')
plt.show()
[Figure: scatter plot of Happiness vs. log-transformed GDP]

Analysis¶

We examined the scatter plot of Felicidad vs. log_GDP to confirm whether the previously left-clustered relationship has been linearized. While the second scatter plot is not perfectly linear, the logarithmic scale has effectively "spread out" the data points. This expansion across the X-axis handles the exponential variance of the GDP, allowing for a better analysis of countries at all economic tiers. With this confirmed, we can now proceed to the predictive modeling phase.

Variable Definition¶

Thus, as mentioned before, it is convenient to start by defining that:

  • Target Variable ($y$): Happiness (Felicidad). This is our dependent variable, representing the subjective well-being we aim to predict and explain through our analysis.
  • Feature ($X$): The log-transformed GDP. This is our independent (predictor) variable. In our initial Simple Linear Regression, we treat GDP as the primary driver of national happiness.
In [109]:
X = happiness_index_data['log_GDP']
y = happiness_index_data['Felicidad']

Linear Regression¶

To see the relationship in the simplest way possible, we start by defining a single feature to predict and explain the behavior of our target variable. In this section, we move from visualization to Parameter Estimation. We will manually calculate the optimal coefficients for our linear model:$$Y = \beta_0 + \beta_1X + \epsilon$$

  • $\beta_1$ (Slope): Represents the "Happiness Return" for every unit increase in Log_GDP.
  • $\beta_0$ (Intercept): Represents the theoretical happiness level if Log_GDP were zero.

To ensure code maintainability and eliminate human error, we decompose the slope equation ($\beta_1$) into its two fundamental mathematical components: the Covariance (numerator) and the Variance (denominator). We use the numpy library's vectorized np.sum to perform the summations. The formulas implemented are as follows:

Slope ($\beta_1$): $\beta_1 = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2}$

Intercept ($\beta_0$): $\beta_0 = \bar{y} - \beta_1 \bar{x}$

In [110]:
# get the average of X
xBar = np.mean(X)

# get the average of y
yBar = np.mean(y)

# get the numerator of the function with the sum function
B1_num = np.sum((X - xBar) * (y - yBar))

# now get the denominator
B1_den = np.sum((X - xBar)**2)

# Calculate B1 as the ratio of the numerator to the denominator
B1 = B1_num/B1_den

# Calculate B0
B0 = yBar - (B1*xBar)

# Print the value of B0
print(f"B0: {B0}")

# Print the value of B1
print(f"B1: {B1}")
B0: -1.3023500570747268
B1: 0.27279272665849097

The Analysis¶

In this case our manual calculations for the minimized coefficients yield the following results:

  • $\beta_0$ = -1.3023500570747268. The line has a "bias" or starting point of about -1.3. On its own this intercept has no real-world meaning (a Log_GDP of zero is impossible); it simply anchors the trajectory of the line.
  • $\beta_1$ = 0.27279272665849097. This is more informative: the small slope makes sense because both x and y live on small numeric scales. The positive sign tells us that the relationship between the two variables exists and is positive. In the context of the original data, the slope reflects the significant effort required to raise a nation's average well-being (happiness).
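Because X is the natural log of GDP, the slope has a handy multiplicative reading: multiplying GDP by a factor $k$ adds $\beta_1 \ln(k)$ happiness points. A quick sketch, reusing the $\beta_1$ value printed above:

```python
import numpy as np

B1 = 0.27279272665849097  # slope obtained in the cell above

# Multiplying GDP by k shifts the predicted happiness by B1 * ln(k)
gain_doubling = B1 * np.log(2)    # GDP x2
gain_tenfold = B1 * np.log(10)    # GDP x10

print(f"Doubling GDP adds ~{gain_doubling:.2f} happiness points")
print(f"A tenfold GDP increase adds ~{gain_tenfold:.2f} happiness points")
```

So even a tenfold jump in GDP buys well under one point on the happiness scale, matching the "significant effort" reading of the slope.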

Now, with the optimal parameters calculated ($\beta_0$ and $\beta_1$), we can now construct our Line of Best Fit. This line represents the path that minimizes the total error between our estimation and the actual reported happiness scores.

To visualize this, we generate a new variable, $\hat{y}$ (y-hat). For every country in our dataset, $\hat{y}$ represents the Happiness Score our model expects them to have based on their GDP. We overlay this linear trend onto our existing scatter plot. This allows us to visually inspect the Goodness of Fit and observe how closely the national data points cluster around our calculated trajectory.

In [111]:
# Code to do the scatter plot again
plt.scatter(X, y, alpha=0.7, c='m')
plt.xlabel("Log GDP")
plt.ylabel("Happiness Score")

# Using X, B0, and B1, store the estimated Y values in a variable named yhat
yhat = B0 + B1*X

# Add the line representing the model with the plot function
plt.plot(X, yhat, c='r', linewidth=3, alpha = 0.5)

# Show the chart using the show function
plt.show()
[Figure: scatter plot with the fitted regression line overlaid]

As mentioned before, we can draw three main inferences:

  1. The line goes up, confirming that wealth is a driver of happiness. However, because the slope ($B_1$) is small, raising happiness requires an exponentially larger jump in GDP.
  2. The negative $B_0$ intercept is just the mathematical "anchor." In the real world, it suggests the relationship only becomes meaningful once a country reaches a certain economic size; below that, GDP is not the main driver (if a driver at all).
  3. Notice that many countries are still far from the line; these over- and under-performers indicate that GDP isn't the only factor (and maybe not even the main one).

Metrics & Revision¶

Now we will look into the metrics, computing the residual sum of squares (RSS) and saving it in the variable rss. This is an important metric: it measures the total amount of error between our fitted line and the actual points in the scatter plot. We will use numpy's vectorized np.sum as we did previously.

In mathematical terms, the Residual Sum of Squares is defined as:$$RSS = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

The result¶

In [112]:
# get each of the residuals between the "predicted" y and the actual dot
residuals = y - yhat

# calculate the sum of these residuals squared with the help of numpy
rss = np.sum(residuals**2)

# Lets print the value
print(f"RSS: {rss: 4f}")
RSS:  131.373832

The Standard Error (SE) acts as a measure of "stability." It tells us how much our slope ($\beta_1$) would fluctuate if we were to repeat this study with a completely different set of 141 countries.

  • High SE: Suggests our results might be due to "luck" or a specific quirk of this dataset.
  • Low SE: Suggests high reproducibility, meaning the relationship between wealth and happiness is a consistent global phenomenon.

Following the modular approach we used for the previous formulas, we decompose the $SE(\beta_1)$ equation. We use numpy for the square root and summation operations. The mathematical objective is to compare the "unexplained noise" (RSS) against the "spread" of our GDP data (the variance of X). The equation for the Standard Error of the slope is: $$SE(\beta_1) = \sqrt{\frac{RSS / (n - 2)}{\sum (x_i - \bar{x})^2}}$$

The result¶

To interpret this result: if we were to collect data from a different set of nations and recalculate the model, we would expect our slope ($\beta_1$) to vary by only about $\pm 0.0434$. Because this error is very small compared to the slope itself (0.2727...), we can conclude that our model is highly stable and not merely a result of 'statistical luck'. However, we will continue evaluating to test the results.

In [113]:
# we need the number of observations n, i.e., the length of the target vector
n = len(y)

# get the values for the denominator
denominator_SEB1 = np.sum((X - xBar)**2)

# get the SE with the sqrt from numpy
SEB1 = np.sqrt((rss/(n-2))/(denominator_SEB1))

print(f"SEB1: {SEB1}")
SEB1: 0.043357261652203986

To finalize our reliability analysis, we move from a single "best guess" to a Confidence Interval (CI). This metric defines a range of values that we are 95% certain contains the true relationship between wealth and happiness. We use the scipy.stats library to find this value precisely. The calculation follows these steps:

  1. Find the $t$-multiplier: We use st.t.interval to find the "cut-off" point on the bell curve where 95% of the data lives.
  2. Determine the Margin of Error: We multiply this $t$-multiplier by our Standard Error ($SE_{\beta1}$).
  3. Establish the Lower Bound ($CI_{low}$): The slope minus the margin of error.
  4. Establish the Upper Bound ($CI_{high}$): The slope plus the margin of error.

Results¶

Since the entire range is positive and does not include zero, we have statistically significant evidence of a relationship. The resulting confidence interval runs from 0.187068... to 0.3585177.... This interval is relatively tight (a spread of about $0.17$), showing that the relationship between GDP and Happiness is consistent across the 141 countries studied.

In [114]:
# import the scipy library
import scipy.stats as st

# get the t-multiplier (the ±97.5th-percentile cut-offs)
per = st.t.interval(confidence=0.95, df=n-2)

# get the lower bound
CIlow = B1 - (per[1] * SEB1)

# get the upper bound
CIhigh = B1 + (per[1] * SEB1)

# Print the Confidence interval
print(f"We are 95% confident the slope is between {CIlow} and {CIhigh}")
We are 95% confident the slope is between 0.18706771472547046 and 0.3585177385915115

The next step in our analysis is to determine the statistical significance of the relationship we've modeled. To achieve this, we calculate the t-statistic and the p-value, which allow us to quantify the reliability of our slope.

The t-statistic¶

The t-statistic acts as a "Signal-to-Noise" ratio. It measures how many times larger our "signal" (the relationship identified by the model) is compared to the "noise" (the random error or uncertainty in the data). We calculate it using our previously defined coefficients: $$t = \frac{\text{Signal}}{\text{Noise}} = \frac{\beta_1}{SE_{\beta_1}}$$

Our result (6.2917..) indicates that our signal is around 6 times stronger than the statistical noise, which represents a very strong and robust result.

The p-value¶

The p-value quantifies the probability of our slope being a product of "magic" or an amazing coincidence. It answers the following question:

"If there was actually ZERO relationship between money and happiness in the real world, what is the chance that our data would look like this by accident?"

We are testing against the Null Hypothesis ($H_0$), which assumes no relationship exists. In data science, we compare our result to a standard "alpha" threshold of 0.05. Our model yielded a p-value of 0.0000000038. Since this is significantly lower than the standard 0.05 threshold, we officially reject the null hypothesis. This means the probability that the observed relationship between GDP and Happiness is a mere coincidence is effectively zero.

In [115]:
# get the t - statistic
t = B1 / SEB1

# get the associated p-value
p_value = st.t.sf(np.abs(t), df=n-2) * 2

# show in console the result
print(f"T-statistic: {t:.4f}")

# print the p-value
print(f"P-value: {p_value:.10f}")
T-statistic: 6.2917
P-value: 0.0000000038

To finalize our study and quantify the quality of our predictions, we calculate the Residual Standard Error (RSE) and the Coefficient of Determination ($R^2$). These metrics allow us to explain exactly how much of the "Happiness" behavior our model actually captures.

Residual Standard Error (RSE)¶

The RSE represents the average distance by which the actual data points fall from our regression line; in other words, the "average miss" of our model. To calculate it, we take the RSS, divide it by the degrees of freedom ($n - 2$: our observations minus the two estimated parameters, $B_0$ and $B_1$), and take the square root using np.sqrt.

$$RSE = \sqrt{\frac{RSS}{n - 2}}$$

This is a great metric for human comparison because it is measured in the same units as our target. If our RSE is 0.5, it means our happiness predictions are, on average, off by half a point on the 0-10 scale.

Total Sum of Squares (TSS)¶

TSS measures the total variation in happiness scores before we even consider GDP. It represents the "error" we would have if we didn't have a model at all and simply guessed the average happiness for every country. $$TSS = \sum (y_i - \bar{y})^2$$

R-Squared ($R^2$)¶

The $R^2$ is the "Final Grade" of our model. It tells us the percentage of the behavior in the Happiness Index that is successfully explained by our GDP variable.$$R^2 = 1 - \frac{RSS}{TSS}$$

In our results the $R^2$ of 0.22 means that 22% of the variation in global happiness is explained by economic output. The remaining 78% is "unexplained" by this specific model (likely due to other factors).

In [116]:
# get the rse through the sqrt of rss / (n - 2)
rse = np.sqrt(rss/(n-2))

# get the tss, the total sum of squares
tss = np.sum((y-yBar)**2)

# get the r squared by dividing the rss by the tss
r2 = 1 - (rss/tss)

# print the results
print(f"RSE: {rse}")
print(f"R2: {r2}")
RSE: 0.9721807858537376
R2: 0.22166361654970634

Final Conclusions on Single Variable Linear Regression¶

Our model yielded an RSE of 0.97 and an $R^2$ of 0.22. These results provide a nuanced conclusion to our research:

  • The Economic Influence: An $R^2$ of 0.22 confirms that Log-GDP is a statistically significant driver of well-being, accounting for roughly 22% of the variance in global happiness. However, the remaining 78% of the variance remains unexplained by this single-feature model. This suggests that while economic output provides a foundation for happiness, the majority of national well-being is determined by non-economic factors.
  • Model Precision: With an average 'miss' (RSE) of nearly 1.0 happiness point, our model serves as a strong baseline, but highlights the need for Multiple Linear Regression to incorporate other socio-political variables for a more complete picture.

Multivariate Linear Regression¶

Extensions of the Data¶

To make the model better, additional variables had to be introduced. To select the best ones, a review of academic results was carried out; after looking into many economic, political, and social factors, the following 3 variables were selected. They come from studies of the happiest countries, like Finland [1], where the ratings in these areas are very good:

  1. CO₂ Emissions per Capita

    • Justification: Environmental quality impacts psychological well-being. High pollution/emissions can indicate poor air quality, climate anxiety, and lower quality of life (this is a personal choice, backed by research indicating that nowadays it does affect people's lifestyles [2]).
    • Expected Relationship: Potentially negative or U-shaped (very poor countries emit little, very rich countries emit a lot but may have better environmental management)
  2. Corruption Perceptions Index (CPI)

    • Justification: Government transparency and trust in institutions are fundamental to societal well-being. High corruption erodes social trust and public services. Beyond perception, corrupt institutions force people to endure shortcomings and fight to overcome them, directly affecting lifestyles and culture [3].
    • Expected Relationship: Negative (higher corruption → lower happiness)
  3. Life Expectancy at Birth

    • Justification: This is a way to measure overall health infrastructure, access to healthcare, and quality of life. People in healthier societies tend to report higher well-being. Studies show that good health is a prerequisite for happiness, and access to care reduces insecurity, enhancing overall well-being [4].
    • Expected Relationship: Positive (longer life → higher happiness)

The first step of this multivariate analysis is to load the required datasets: we will integrate into our current dataframe the three additional variables grounded in socio-economic theory. We do that in the following cell.

Source of the data: Our World in Data [5][6][7]
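Once each dataset is cleaned, the integration can be sketched with pandas' merge on the country name. This is a minimal illustration with synthetic rows (the CO₂ values below are placeholders): the happiness frame keys on 'Pais', while the cleaned OWID frames will key on 'Country':

```python
import pandas as pd

# Tiny synthetic frames standing in for the real, cleaned datasets
happiness = pd.DataFrame({'Pais': ['Finland', 'Denmark', 'Iceland'],
                          'Felicidad': [7.8210, 7.6362, 7.5575]})
co2 = pd.DataFrame({'Country': ['Finland', 'Denmark'],
                    'Annual CO2 emissions': [1.0e7, 2.0e7]})  # placeholder values

# An inner merge keeps only countries present in both sources,
# which avoids NaNs in the final feature matrix.
merged = happiness.merge(co2, left_on='Pais', right_on='Country', how='inner')
print(merged)
```

With how='inner', Iceland is dropped because it has no matching CO₂ row; the actual merge strategy (inner join vs. left join plus imputation) is a design choice for the real analysis.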

In [117]:
# load the CO2 csv
co2_path = "/content/drive/My Drive/Colab Notebooks/AI/co2-emissions.csv"
co2_data = pd.read_csv(co2_path)

# Load Corruption Index data
corruption_path = "/content/drive/My Drive/Colab Notebooks/AI/corruption-perception-index.csv"
corruption_data = pd.read_csv(corruption_path)

# Load Life Expectancy data
life_exp_path = "/content/drive/My Drive/Colab Notebooks/AI/life-expectancy.csv"
life_exp_data = pd.read_csv(life_exp_path)

print("Successfully loaded 3 external datasets")
print(f"  CO₂ Emissions: {len(co2_data)} rows")
print(f"  Corruption Index: {len(corruption_data)} rows")
print(f"  Life Expectancy: {len(life_exp_data)} rows")
Successfully loaded 3 external datasets
  CO₂ Emissions: 197 rows
  Corruption Index: 181 rows
  Life Expectancy: 201 rows

CO2 Emissions exploration and understanding¶

Before merging this with our happiness data, we must perform a structural audit to ensure the datasets are compatible and clean.

Steps to take to understand and see data¶

  1. We begin by identifying the "labels" and "types" of our data using columns.tolist() and dtypes. This allows us to confirm that our key (the Country) and our target (CO2 levels) are in the correct format for mathematical operations.
  2. A common error in regression is a year mismatch (comparing 2010 with 2022 happiness). We use the unique() function combined with sorted() to see exactly which years are represented. In this dataset, the output shows only 2022. Because all the data belongs to a single year, the Year column becomes a "constant." This is great news—it means we can safely drop it later as it doesn't provide any extra information for our regression.
  3. Before merging, we must check for "holes" or nulls in our data. We use the .info() function, which gives us a high-level summary of the Non-Null Count. Fortunately, our summary shows 197 non-null entries, meaning our environmental data is complete for all countries listed.
In [118]:
# Check column names
print("\nColumn names:")
print(co2_data.columns.tolist())
Column names:
['Entity', 'Code', 'Year', 'Annual CO₂ emissions', 'time']
In [119]:
# Check data types
print("\nData types:")
print(co2_data.dtypes)
Data types:
Entity                   object
Code                     object
Year                      int64
Annual CO₂ emissions    float64
time                      int64
dtype: object
In [120]:
# check if the data covers only a specific year; if so, we can drop the Year column
print("\nAvailable years in dataset:")
print(sorted(co2_data['Year'].unique()))
Available years in dataset:
[np.int64(2022)]
In [121]:
# check for the is null count
co2_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entity                197 non-null    object 
 1   Code                  197 non-null    object 
 2   Year                  197 non-null    int64  
 3   Annual CO₂ emissions  197 non-null    float64
 4   time                  197 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 7.8+ KB

Through this exploration, we have determined that most columns are redundant for our specific goal. To prepare for the merge with our Happiness dataframe, we will simplify the dataset, keeping only:

  • Entity: This is our "Key" to match countries between both datasets.
  • Annual CO₂ emissions: This is our new independent variable ($X_2$).

We will drop the remaining Code, Year, and time columns: since all data is from 2022 and we already have the country names, they only add "noise" to our dataframe. Additionally, we rename the 'Entity' column to make the dataframe easier to understand overall.

In [122]:
# We define a list of columns to remove
cols_to_drop = ['Code', 'Year', 'time']

# We use the drop function.
# axis=1 tells pandas to remove COLUMNS.
co2_data.drop(columns=cols_to_drop, axis=1, inplace=True)

# We rename 'Entity' to 'Country' to make it easier to merge with our Happiness data
co2_data.rename(columns={'Entity': 'Country'}, inplace=True)

# Display the clean version
print("Cleaned CO2 Dataframe:")
print(co2_data.head())
Cleaned CO2 Dataframe:
       Country  Annual CO₂ emissions
0  Afghanistan            10169889.0
1      Albania             4498282.0
2      Algeria           192778560.0
3      Andorra              423408.0
4       Angola            21089004.0

Corruption Perception Index Exploration and Understanding¶

For this we will follow the exact same "Data Integrity" logic we used for the CO2 dataset. This index ranks countries by their perceived levels of public sector corruption (where a higher score usually means less corruption). Before merging, we must verify its structure.

Steps to follow and results we saw:¶

  1. We start by looking at the column headers and data types. This helps us identify our variable of interest and ensure the numbers are stored in a format we can use for regression.
  2. Just like with the CO2 data, we check the available years. If this dataset also focuses on a single recent year, we can treat it as a cross-sectional snapshot of global transparency.
In [123]:
# Check column names
print("\nColumn names:")
print(corruption_data.columns.tolist())
Column names:
['Entity', 'Code', 'Year', 'Corruption Perceptions Index', 'World region according to OWID', 'time', 'time.1']
In [124]:
# Check data types to ensure the index is a float or int
print("\nData Types:")
print(corruption_data.dtypes)
Data Types:
Entity                             object
Code                               object
Year                              float64
Corruption Perceptions Index      float64
World region according to OWID     object
time                              float64
time.1                              int64
dtype: object
In [125]:
# Check which years are available in this specific dataset
print("\nAvailable years in CPI dataset:")
print(sorted(corruption_data['Year'].unique()))
Available years in CPI dataset:
[np.float64(2022.0), np.float64(nan)]
In [126]:
# Perform a health check for null values
corruption_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          181 non-null    object 
 1   Code                            179 non-null    object 
 2   Year                            179 non-null    float64
 3   Corruption Perceptions Index    179 non-null    float64
 4   World region according to OWID  179 non-null    object 
 5   time                            179 non-null    float64
 6   time.1                          181 non-null    int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 10.0+ KB

Based on our audit, and specifically the info() function, we can see several redundant columns and very few nulls (only two of the 181 rows are missing values in some columns). Code, World region according to OWID, time, and time.1 do not add value to our specific regression.

Additionally, we will rename Entity to Country (with the rename() function) to create a universal "Key" across all three of our dataframes. We will also shorten the long index name for readability and space, leaving it as 'Corruption_Index'.

The following cell makes use of the drop() function with axis=1 to indicate that it is columns, not rows, being dropped.

In [127]:
# We remove 'time.1' and 'time' as they are likely duplicates or internal IDs
cols_to_drop = ['Code', 'Year', 'World region according to OWID', 'time', 'time.1']

# Drop columns (axis=1 targets the column axis)
corruption_data.drop(cols_to_drop, axis=1, inplace=True)

# Rename columns for merging and readability
# We change 'Entity' to 'Country' and shorten the long Index name
corruption_data.rename(columns={
    'Entity': 'Country',
    'Corruption Perceptions Index': 'Corruption_Index'
}, inplace=True)

print("Cleaned Corruption Dataframe:")
print(corruption_data.head())
Cleaned Corruption Dataframe:
       Country  Corruption_Index
0  Afghanistan              24.0
1      Albania              36.0
2      Algeria              33.0
3       Angola              33.0
4    Argentina              38.0

Life Expectancy Data Exploration and Understanding¶

Life Expectancy is a powerhouse variable because it represents the "quality of life" and health infrastructure of a nation. Just like before, we need to strip away the "clutter" (like the extremely long column name) and keep only what is essential for the regression. We will analyze the data first to avoid removing anything that could be useful.

Steps followed and results seen:¶

  1. We use the same diagnostic functions to ensure we are looking at the correct year and that our data types are ready for math. It is from this, with the help of the .unique() function, that we see the Year column holds a single value, so dropping it is a good choice.
  2. Null revision and general data-type checks. None of the data is null, so we can drop any columns we don't need for the multivariate regression.
  3. "prune" the redundant columns and rename the main feature to something short and professional: Life_Expectancy.
In [128]:
# Check the raw column names
print("\nLife Expectancy Columns:")
print(life_exp_data.columns.tolist())

# Check for temporal consistency
print("\nAvailable years in Life Expectancy dataset:")
print(sorted(life_exp_data['Year'].unique()))
Life Expectancy Columns:
['Entity', 'Code', 'Year', 'Life expectancy - Sex: all - Age: 0 - Variant: estimates', 'time']

Available years in Life Expectancy dataset:
[np.int64(2022)]
In [129]:
# check for possible nulls
life_exp_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 5 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Entity                                                    201 non-null    object 
 1   Code                                                      201 non-null    object 
 2   Year                                                      201 non-null    int64  
 3   Life expectancy - Sex: all - Age: 0 - Variant: estimates  201 non-null    float64
 4   time                                                      201 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 8.0+ KB
In [130]:
# redundant columns to remove
cols_to_drop = ['Code', 'Year', 'time']

# .drop() to clean the dataframe (axis=1 targets columns)
life_exp_data.drop(cols_to_drop, axis=1, inplace=True)

# We change 'Entity' to 'Country' and simplify the massive life expectancy label
life_exp_data.rename(columns={
    'Entity': 'Country',
    'Life expectancy - Sex: all - Age: 0 - Variant: estimates': 'Life_Expectancy'
}, inplace=True)

print("Cleaned Life Expectancy Dataframe:")
print(life_exp_data.head())
Cleaned Life Expectancy Dataframe:
       Country  Life_Expectancy
0  Afghanistan           65.617
1      Albania           78.769
2      Algeria           76.129
3      Andorra           84.016
4       Angola           64.246
In [131]:
# rename 'Pais' to 'Country' for consistency with the original dataset
happiness_index_data_extended = happiness_index_data.copy()
happiness_index_data_extended.rename(columns={'Pais': 'Country'}, inplace=True)

# merge CO2 data
happiness_index_data_extended = happiness_index_data_extended.merge(
    co2_data,
    on='Country',
    how='left'  # Keep all happiness countries
)
print(f"   After merge: {len(happiness_index_data_extended)} rows")
print(f"   Missing CO2 data: {happiness_index_data_extended['Annual CO₂ emissions'].isna().sum()} countries")
   After merge: 141 rows
   Missing CO2 data: 2 countries
In [132]:
# Merge Corruption data
happiness_index_data_extended = happiness_index_data_extended.merge(
    corruption_data,
    on='Country',
    how='left'
)
print(f"   After merge: {len(happiness_index_data_extended)} rows")
print(f"   Missing Corruption data: {happiness_index_data_extended['Corruption_Index'].isna().sum()} countries")
   After merge: 141 rows
   Missing Corruption data: 2 countries
In [133]:
# merge the life expectancy data
happiness_index_data_extended = happiness_index_data_extended.merge(
    life_exp_data,
    on='Country',
    how='left'
)
print(f"   After merge: {len(happiness_index_data_extended)} rows")
print(f"   Missing Life Expectancy data: {happiness_index_data_extended['Life_Expectancy'].isna().sum()} countries")
   After merge: 141 rows
   Missing Life Expectancy data: 2 countries

After performing our left merges, we noticed that 2 countries are missing values for CO2, Corruption, and Life Expectancy. To fix this, we first need to identify which countries are causing the gap. We do this by filtering our dataframe for rows where these specific columns contain Null (NaN) values.

By using the .isna() function, we can pinpoint these specific outliers. This allows us to decide whether we should manually rename them in the source files to ensure a 100% match or exclude them if the data simply does not exist for those nations.

In [134]:
# identify which countries are missing CO2 data
missing_co2 = happiness_index_data_extended[happiness_index_data_extended['Annual CO₂ emissions'].isna()]
print("Countries missing CO2 data:")
print(missing_co2['Country'].tolist())

# identify which countries are missing Corruption data
missing_corruption = happiness_index_data_extended[happiness_index_data_extended['Corruption_Index'].isna()]
print("\nCountries missing Corruption data:")
print(missing_corruption['Country'].tolist())

# identify which countries are missing Life Expectancy data
missing_life = happiness_index_data_extended[happiness_index_data_extended['Life_Expectancy'].isna()]
print("\nCountries missing Life Expectancy data:")
print(missing_life['Country'].tolist())
Countries missing CO2 data:
['Hong Kong', 'Ivory Coast']

Countries missing Corruption data:
['Hong Kong', 'Ivory Coast']

Countries missing Life Expectancy data:
['Hong Kong', 'Ivory Coast']
In [135]:
# create sets for each dataset's country list
h_set = set(happiness_index_data['Pais'].unique())
co2_set = set(co2_data['Country'].unique())
c_set = set(corruption_data['Country'].unique())
l_set = set(life_exp_data['Country'].unique())

# find countries in Happiness that are MISSING in the others
missing_in_co2 = sorted(list(h_set - co2_set))
missing_in_corruption = sorted(list(h_set - c_set))
missing_in_life = sorted(list(h_set - l_set))

# print
print(f"Missing in CO2 ({len(missing_in_co2)}): {missing_in_co2}")
print(f"Missing in Corruption ({len(missing_in_corruption)}): {missing_in_corruption}")
print(f"Missing in Life Exp ({len(missing_in_life)}): {missing_in_life}")
Missing in CO2 (2): ['Hong Kong', 'Ivory Coast']
Missing in Corruption (2): ['Hong Kong', 'Ivory Coast']
Missing in Life Exp (2): ['Hong Kong', 'Ivory Coast']

As identified in our set difference analysis, the "Missing" data for Ivory Coast was due to a naming mismatch (the source datasets use the French name "Cote d'Ivoire"). However, for Hong Kong, the data is truly missing from the environmental and corruption sources.

Following the course instructions to work only with countries that have complete information for all variables, we will now standardize the names and perform a final pruning of incomplete records. This ensures our multivariate model is built on a complete-case basis.

Resolving Naming Conflicts (Standardization)¶

One of the most common hurdles in global data science is nomenclature inconsistency. Different international organizations often use different names for the same country.

The Problem we found: During our audit, we discovered that "Ivory Coast" appeared as a missing value across all new datasets. A set analysis revealed that the environmental and corruption datasets used the French name, "Cote d'Ivoire", while our primary happiness dataset used "Ivory Coast". We use the .replace() method with a name_fixes dictionary to standardize this label across all source dataframes before merging. This prevents the unnecessary loss of a valid data point.

The Aggregation Process (The "Left" Merge)¶

We build our master dataset using a series of Left Joins. Why a Left Merge? By setting how='left', we use our original Happiness/Log_GDP dataframe as the "Anchor." The code keeps all countries from our primary study and searches the external datasets for matching information based on the 'Country' column. For each country in our base list, Python looks into the CO2, Corruption, and Life Expectancy files. If it finds a match, it pulls the value in; if no match is found (even after naming fixes), it assigns a NaN.
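As a side note, pandas can flag exactly which rows failed to match during a left merge via the `indicator` parameter. Below is a minimal sketch with two hypothetical mini-dataframes (the names mirror, but are not, our real data): unmatched keys surface as `left_only` rows with `NaN` in the pulled-in column.

```python
import pandas as pd

# Hypothetical mini-frames to illustrate left-merge behavior on unmatched keys
left = pd.DataFrame({"Country": ["Finland", "Hong Kong"], "Felicidad": [7.8, 5.4]})
right = pd.DataFrame({"Country": ["Finland"], "Corruption_Index": [87.0]})

# indicator=True adds a '_merge' column flagging where each row's data came from
merged = left.merge(right, on="Country", how="left", indicator=True)
print(merged)
```

Rows tagged `left_only` are precisely the ones that would later show up in an `.isna()` audit.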

In [136]:
# standardize naming for Ivory Coast across all source dataframes
name_fixes = {"Cote d'Ivoire": "Ivory Coast"}

co2_data = co2_data.replace({"Country": name_fixes})
corruption_data = corruption_data.replace({"Country": name_fixes})
life_exp_data = life_exp_data.replace({"Country": name_fixes})

# redo the merge starting from the original Log_GDP data
df_master = happiness_index_data_extended[['Country', 'Felicidad', 'log_GDP']].copy()

# merge all three additional features
df_master = df_master.merge(co2_data[['Country', 'Annual CO₂ emissions']], on='Country', how='left')
df_master = df_master.merge(corruption_data[['Country', 'Corruption_Index']], on='Country', how='left')
df_master = df_master.merge(life_exp_data[['Country', 'Life_Expectancy']], on='Country', how='left')

Visualizing Feature Distributions¶

Before constructing our final multivariate model, we must examine the "shape" of our three new variables ($X_2, X_3, X_4$) alongside our target ($y$). Using a grid of histograms, we can assess whether these features follow a normal distribution or if they suffer from significant mathematical skewness.

Just as we discovered with raw GDP, variables like CO₂ Emissions often show an "L-shaped" distribution, where a few high-emitting industrial nations act as extreme outliers. Identifying these differences helps us understand the magnitude of the resulting $\beta$ coefficients.

Linear regression performs best when the predictors and the target are relatively symmetrical; if any of them were as asymmetrical or skewed as our annual CO2 emissions, we would have to apply transformations.

From the histograms we can see the following:

  • CO2_emissions is heavily right-skewed: most nations are clustered near zero, while a few "super-polluters" stretch the X-axis out to $1 \times 10^{10}$.
  • Corruption Index shows a more spread-out distribution, though it has a slight peak in the lower-middle range (30-40), suggesting many countries still struggle with transparency.
  • Life Expectancy is left-skewed, meaning most countries in our dataset have achieved relatively high life expectancies (70+ years), with a smaller "tail" of countries falling behind.
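These visual impressions can be double-checked numerically with scipy's `skew` function (positive values mean a right tail, negative a left tail). A small sketch on synthetic stand-in samples, not the actual merged columns:

```python
import numpy as np
from scipy.stats import skew

rng = np.random.default_rng(0)

# Synthetic stand-ins (not the real data): a log-normal sample mimics the
# "L-shaped" CO2 distribution, a normal sample mimics a symmetric feature
right_skewed = rng.lognormal(mean=10, sigma=2, size=140)
symmetric = rng.normal(loc=70, scale=5, size=140)

print(f"log-normal sample skewness: {skew(right_skewed):.2f}")
print(f"normal sample skewness:     {skew(symmetric):.2f}")
```

Applied to the real columns, the same call would quantify how far each feature is from symmetry before and after any transformation.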

In [137]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# CO2 Distribution
axes[0, 0].hist(happiness_index_data_extended['Annual CO₂ emissions'], bins=25, alpha=0.7, color='lightblue')
axes[0, 0].set_xlabel('CO₂ Emissions (tons per capita)', fontsize=11)
axes[0, 0].set_ylabel('Frequency', fontsize=11)
axes[0, 0].set_title('Distribution of CO₂ Emissions')

# corruption Distribution
axes[0, 1].hist(happiness_index_data_extended['Corruption_Index'], bins=25, alpha=0.7, color='lightpink')
axes[0, 1].set_xlabel('Corruption Perceptions Index (0-100)', fontsize=11)
axes[0, 1].set_ylabel('Frequency', fontsize=11)
axes[0, 1].set_title('Distribution of Corruption Index')

# Life Expectancy Distribution
axes[1, 0].hist(happiness_index_data_extended['Life_Expectancy'], bins=25, alpha=0.7, color='#b19cd9')
axes[1, 0].set_xlabel('Life Expectancy (years)', fontsize=11)
axes[1, 0].set_ylabel('Frequency', fontsize=11)
axes[1, 0].set_title('Distribution of Life Expectancy')

# Happiness (for comparison)
axes[1, 1].hist(happiness_index_data_extended['Felicidad'], bins=25, alpha=0.7, color='#fff192')
axes[1, 1].set_xlabel('Happiness Score', fontsize=11)
axes[1, 1].set_ylabel('Frequency', fontsize=11)
axes[1, 1].set_title('Distribution of Happiness (Target)')

plt.show()
[Figure: 2x2 histogram grid of CO₂ Emissions, Corruption Index, Life Expectancy, and Happiness]

Ensuring Statistical Integrity¶

In our final step, having confirmed that Hong Kong is the record with genuinely missing data, we execute df_master.dropna(inplace=True). This is a critical decision for the statistical health of the model. Unlike Ivory Coast, data for Hong Kong was simply not present in the supplementary environmental or corruption sources provided for this specific year.

Why we must drop it: linear regression algorithms cannot perform mathematical operations on "Null" or "NaN" values. Every observation ($n$) must have a valid value for every feature ($X_1, X_2, X_3, X_4$) to calculate the coefficients, and keeping a row with missing values would break the matrix calculation (the Normal Equation). While a larger sample size is always preferable, it is better to have 140 countries with complete data than 141 countries where one entry "breaks" the mathematical logic of the model.
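A tiny numpy illustration of why NaNs are fatal: a single missing value contaminates the $X^TX$ matrix at the heart of the Normal Equation. The matrix below is a toy example, purely for demonstration:

```python
import numpy as np

# Toy design matrix with one missing value in the second row
X = np.array([[1.0, 2.0],
              [1.0, np.nan],
              [1.0, 4.0]])

# The normal equations require X'X; a single NaN poisons the whole product,
# so no coefficient involving that column can be computed
gram = X.T @ X
print(gram)
print("Any NaN in X'X?", np.isnan(gram).any())
```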

In [138]:
# drop incomplete rows (e.g., Hong Kong) to ensure a complete case basis
df_master.dropna(inplace=True)

print(f"Final observation count (n): {len(df_master)} countries.")
df_master.head()
Final observation count (n): 140 countries.
Out[138]:
Country Felicidad log_GDP Annual CO₂ emissions Corruption_Index Life_Expectancy
0 Finland 7.8210 26.328468 36337000.0 87.0 81.243
1 Denmark 7.6362 26.598435 29094320.0 90.0 81.291
2 Iceland 7.5575 23.801411 3597685.0 74.0 81.588
3 Switzerland 7.5116 27.346332 32950562.0 82.0 83.200
4 Netherlands 7.4149 27.540949 127503144.0 80.0 81.912

Transformations to the CO₂ Scale¶

From the data-distribution graphs we can see that the CO2 emissions data is heavily right-skewed, with most countries at the bottom and a few "giant" emitters stretching the scale (very similar to what happened with GDP).

In linear regression, when a variable is that skewed, the model struggles to find a pattern. Transforming it into log_CO2 (just as we did with GDP) is the natural way to test whether environmental impact actually affects happiness: the logarithm "compresses" the extreme outliers, letting us see whether a normalized environmental variable provides better explanatory power.

If we are confident that our dataset contains only positive values (no zeros), we can proceed with a standard natural logarithm. However, we should always perform a quick "safety check" first to avoid producing an error, so we inspect the column's minimum value to rule out any potential zeros.

In [139]:
min_co2 = df_master['Annual CO₂ emissions'].min()
print(f"The lowest emission value in our data is: {min_co2}")
The lowest emission value in our data is: 531671.0

With the lowest annual-emissions value being over 500 thousand, we can safely apply the log transformation (without having to resort to log1p). Unlike socio-economic variables that might occasionally hit a "zero" baseline, industrial emissions, even in developing nations, remain strictly positive.
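For reference, this is the difference the safety check guards against: `np.log` maps zero to negative infinity, while `np.log1p` (which computes $\log(1+x)$) handles it gracefully and is practically identical to `np.log` for values as large as ours. A toy demonstration, not our actual column:

```python
import numpy as np

values = np.array([0.0, 1.0, 531671.0])

# log(0) yields -inf, which would break the regression matrix algebra
with np.errstate(divide="ignore"):
    print("np.log:  ", np.log(values))

# log1p shifts by 1, so zero maps cleanly to 0.0; for large x the
# difference from log(x) is negligible
print("np.log1p:", np.log1p(values))
```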

We will now visualize the distribution of the transformed data. The following graphs show the before and after of the logarithmic transformation applied to the column: the skew is removed, which we hope will improve our model's accuracy. We used np.log() to transform the column, plotted with matplotlib, and placed the two histograms on a single figure.

In [140]:
# apply  log transformation
df_master['log_CO2'] = np.log(df_master['Annual CO₂ emissions'])

# helps visualize the Before vs After
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Original
ax1.hist(df_master['Annual CO₂ emissions'], bins=25, color='lightblue')
ax1.set_title('Original CO₂ (Heavy Skew)')

# Transformed
ax2.hist(df_master['log_CO2'], bins=25, color='skyblue')
ax2.set_title('Log-Transformed CO₂ (Normalized)')

plt.show()
[Figure: side-by-side histograms of CO₂ emissions before and after the log transformation]
In [141]:
# drop the annual co2 emissions
df_master.drop('Annual CO₂ emissions', axis=1, inplace=True)

df_master.head()
Out[141]:
Country Felicidad log_GDP Corruption_Index Life_Expectancy log_CO2
0 Finland 7.8210 26.328468 87.0 81.243 17.408347
1 Denmark 7.6362 26.598435 90.0 81.291 17.186054
2 Iceland 7.5575 23.801411 74.0 81.588 15.095801
3 Switzerland 7.5116 27.346332 82.0 83.200 17.310519
4 Netherlands 7.4149 27.540949 80.0 81.912 18.663652

Multivariate Linear Regression Implementation¶

With our extended dataset ready, we now aim to model happiness ($Y$) as a function of four predictors:

  • Log-GDP ($X_1$)
  • CO2 Emissions ($X_2$)
  • Corruption ($X_3$)
  • Life Expectancy ($X_4$).

The model follows the equation: $$Y = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + \beta_4X_4 + \epsilon$$

Before modeling, we must separate our data to ensure the model can generalize to new observations. We use an 80/20 split: 80% to train the model and 20% to validate its predictive power.

In [142]:
from sklearn.model_selection import train_test_split

# We split the 'df_master' created in the previous step
train, test = train_test_split(df_master, train_size=0.8, random_state=42)

# print the numbers for general curiosity and reinforcement
print(f"Training set size: {train.shape[0]} countries")
print(f"Validation set size: {test.shape[0]} countries")
Training set size: 112 countries
Validation set size: 28 countries
In [143]:
train.head()
Out[143]:
Country Felicidad log_GDP Corruption_Index Life_Expectancy log_CO2
16 United Kingdom 6.9425 28.645128 73.0 81.074 19.555683
18 Belgium 6.8050 26.980314 73.0 81.159 18.303191
10 Austria 7.1630 26.794599 71.0 81.296 17.933802
112 Uganda 4.6026 24.350280 26.0 67.675 15.628420
102 Gabon 4.9583 23.452218 29.0 67.713 15.570477

We utilize the Ordinary Least Squares (OLS) method from the statsmodels library. This toolkit contains all the specialized math routines needed to calculate the "best-fit" line in a multidimensional space.

Key Steps

  1. Feature selection ($X$): we isolate our predictors by dropping the non-numeric Country column and our target Felicidad from the training dataset using the drop() function.
  2. Intercept: we use add_constant(X) to ensure the model includes a baseline intercept ($\beta_0$), preventing the line from being forced through the origin.
  3. The model is then defined with the OLS() function, which takes the isolated target variable (y_train) and the constant-augmented feature matrix as arguments.
  4. The fit() function executes the optimization algorithm to determine the mathematical weight of each feature, saving these in the results variable.

To see the obtained results, we use the summary() function to print them to the console.

In [148]:
import statsmodels.api as sm

# drop the target column and the non-numeric Country column from the training data
X = train.drop(['Felicidad', 'Country'], axis=1)

# define our target variable
y_train = train['Felicidad']

# define the kind of model (OLS)
model = sm.OLS(y_train, sm.add_constant(X))

# fit the model
results = model.fit()

print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              Felicidad   R-squared:                       0.647
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     49.09
Date:                Mon, 26 Jan 2026   Prob (F-statistic):           2.19e-23
Time:                        19:56:53   Log-Likelihood:                -110.10
No. Observations:                 112   AIC:                             230.2
Df Residuals:                     107   BIC:                             243.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -3.4957      1.320     -2.649      0.009      -6.111      -0.880
log_GDP              0.2207      0.111      1.996      0.048       0.001       0.440
Corruption_Index     0.0119      0.006      2.137      0.035       0.001       0.023
Life_Expectancy      0.0899      0.013      6.801      0.000       0.064       0.116
log_CO2             -0.2070      0.103     -2.016      0.046      -0.411      -0.003
==============================================================================
Omnibus:                       20.399   Durbin-Watson:                   2.032
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               25.259
Skew:                          -1.015   Prob(JB):                     3.27e-06
Kurtosis:                       4.138   Cond. No.                     1.95e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.95e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Results¶

Once the model is built, we need to know whether the results are meaningful or just random noise. The summary above reports many aspects of the model, from the number of observations to the more relevant statistics discussed next.

1. R-Squared ($R^2$) The model's explanatory power jumped from the 22% we had in the simple linear regression to 64.7%. With our multivariate regression we can say that by adding Health, Environment, and Transparency we are now capturing the majority of the "behavior" of global happiness: 64.7% of the variance in the happiness data is explained by the combination of our features (63.4% after adjusting for the number of predictors).

2. F-statistic The F-statistic is a ratio that compares the variance the model explains to the variance it cannot explain. It tells us how much better our multivariate model (Log-GDP, Log_CO2, Corruption, and Life Expectancy) is at predicting happiness than a model with zero variables that simply guesses the average. The larger the F-statistic, the more likely it is that our group of variables has a real effect on happiness. Our value of 49.09 is a strong indicator of model strength, but the p-value provides the definitive confirmation.

3. P-value This is Prob(F-statistic), and it answers the question of how much "luck" was involved. With our value of 2.19e-23 (nearly zero), the probability that these results occurred purely by random chance is essentially nil; a "fluke" is all but impossible.

Using a significance threshold of 0.05, we can now see a much more balanced contribution from all our predictors:

  • Life Expectancy (p = 0.000): This remains our strongest "MVP" variable. It is the most reliable predictor of happiness in our dataset, showing that physical health is the primary pillar of well-being.
  • Corruption Index (p = 0.035): By being below 0.05, this variable is statistically significant. It confirms that institutional trust and transparency are critical for a happy citizenry.
  • log_GDP (p = 0.048): Previously, GDP was "dead weight" in the multivariate model. However, with the log transformation, it has crossed the finish line into significance. This tells us that wealth does impact happiness, but in a non-linear way.
  • log_CO2 (p = 0.046): This is the most interesting result. By transforming the CO2 data, we found a significant relationship. The negative coefficient (-0.2070) indicates that as carbon footprints increase (often a proxy for pollution or rapid, unregulated industrialization), happiness tends to decrease, which makes intuitive sense.

Manual Model Verification¶

After building our optimized multivariate model, we perform a manual "under the hood" check. This code serves as a mathematical audit to verify that the automated results from our OLS summary are correct and to calculate exactly how much "Signal" our model generates compared to random "Noise".

The code implements the Decomposition of Variance, breaking down the total behavior of Happiness into two parts. in the following steps:

  1. Predicting and Averaging (yhat & ybar): We generate the model's "guesses" and compare them to the simple average happiness of all countries.
  2. The Explained Variation (ESS & EMS): We calculate the Explained Sum of Squares (ESS) to see how much better our model is than a simple average. We then divide this by our number of variables ($m=4$) to get the Explained Mean Square (EMS), which is the "Strength per Variable".
  3. The Error (RSS & RMS): We calculate the Residual Sum of Squares (RSS), which represents the total error or "noise" the model couldn't capture. Dividing this by the remaining degrees of freedom ($n - m - 1$) gives us the Residual Mean Square (RMS)—the "Average Noise".
  4. The Final Ratio (F & p-value): The F-Statistic is simply the ratio of our Signal ($EMS$) divided by our Noise ($RMS$).

Analysis of Results: our manual calculations yielded specific values that tell a clear story about our current model:

  • RSS = 46.83: This is our total "Unexplained Error." While this number seems large, it is significantly lower than our previous models, indicating that our features (Log-GDP, log-CO2, Corruption, and Life Expectancy) are getting much closer to the truth.
  • F = 49.09: This is our Signal-to-Noise Ratio. It tells us that the patterns found by our model are 49 times stronger than the random noise in the data. In statistics, any F-value significantly higher than 1 is good; 49 is exceptionally strong.
  • p-value = 2.19e-23: This is our final "Reality Check." Because this number is virtually zero, we can be confident that our model's success is not a "fluke" or a result of lucky random numbers.
In [145]:
import scipy.stats as st
import numpy as np

# get the predicted y
yhat = results.predict(sm.add_constant(X))

# get the mean y
ybar = np.mean(y_train)

# get the ess
ESS = sum((yhat - ybar)**2)

# get the m
m = X.shape[1]

# get the EMS
EMS = ESS / m

# RSS calculation with the previous values defines
RSS = sum((y_train - yhat)**2)

# get the number of observations
n = X.shape[0]

# RMS and the F statistic
RMS = RSS / (n - m - 1)
F = EMS / RMS

# Finaly get the p-value
pval = st.f.sf(F, m, n - m - 1)

#print
print("RSS =", RSS)
print("F =", F)
print("p-value =", pval)
RSS = 46.83382692162639
F = 49.086006365049826
p-value = 2.1892007201308535e-23

Reduced Model¶

To evaluate the specific impact of an individual variable, we implement a Reduced Model. By using the .drop() function to remove a single predictor and re-fitting the OLS model, we can measure the change in the Residual Sum of Squares (RSS). Since annual CO2 emissions was my "test" variable, chosen mostly out of personal curiosity, and its individual p-value (0.046) was only borderline significant alongside the other features, I decided to drop this column. This allows us to perform a Partial F-Test, determining whether the excluded variable provided unique information that cannot be explained by the other features in the set.

In [147]:
# to test its significance we drop the log_CO2 column
XNew = X.drop('log_CO2', axis=1)

# create the new model and feed it the parameters
modelNew = sm.OLS(y_train, sm.add_constant(XNew))

# fit the new model
resultsNew = modelNew.fit()

# get the summary of the results
print(resultsNew.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              Felicidad   R-squared:                       0.634
Model:                            OLS   Adj. R-squared:                  0.624
Method:                 Least Squares   F-statistic:                     62.32
Date:                Mon, 26 Jan 2026   Prob (F-statistic):           1.83e-23
Time:                        19:52:59   Log-Likelihood:                -112.18
No. Observations:                 112   AIC:                             232.4
Df Residuals:                     108   BIC:                             243.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -1.7314      1.002     -1.729      0.087      -3.717       0.254
log_GDP              0.0122      0.040      0.307      0.760      -0.066       0.091
Corruption_Index     0.0180      0.005      3.804      0.000       0.009       0.027
Life_Expectancy      0.0848      0.013      6.446      0.000       0.059       0.111
==============================================================================
Omnibus:                       18.961   Durbin-Watson:                   2.080
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               22.722
Skew:                          -0.978   Prob(JB):                     1.16e-05
Kurtosis:                       4.020   Cond. No.                     1.43e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.43e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Comparison of the Two Candidate Multivariable Regression Models¶

By purposely dropping the CO2 variable, we performed a "Sensitivity Analysis" to see if it was actually helping us predict happiness. The results are super interesting when you look at the numbers side-by-side.

  1. The $R^2$ stayed almost exactly the same: it went from $R^2 = 0.647$ to $R^2 = 0.634$. What this tells us is that **deleting CO2 only changed our accuracy by 0.013**. This is strong evidence that CO2 was close to "dead weight": it wasn't telling the model much about happiness that GDP or Life Expectancy hadn't already covered.
  2. The F-statistic jumped up, from $F = 49.09$ to $F = 62.32$. In statistics this indicates a "tighter", more efficient model: by removing the noisier variable (CO2), the smaller group of variables left is actually statistically stronger per predictor than the bigger group we had before.
  3. The "P > |t|" (individual p-values) looking at the new table, Corruption_Index and Life_Expectancy are still the absolute "MVPs" with p-values of 0.000.
    • A critical observation in our refined model is the jump of the log_GDP p-value to 0.760 after removing the CO₂ emissions variable. This high p-value indicates that GDP lacks 'unique explanatory power' when Life Expectancy and Corruption are present. This supports the theory that wealth is a 'distal' cause of happiness. Money primarily matters because it allows a nation to invest in healthcare and institutional transparency.
  4. It is also worth noting that, despite all these changes, the Prob(F-statistic) remained essentially the same.

Final Comparison of the Models¶

In this final section, we contrast our initial Simple Linear Regression with the optimized Multivariate Model. This comparison allows us to evaluate whether the increased complexity of the model translates into a more accurate and meaningful understanding of global happiness.

| Metric | Simple Regression (GDP only) | Multivariate Model |
|---|---|---|
| $R^2$ | 0.222 | 0.647 (64.7%) |
| Typical error (Happiness points) | 0.97 | 0.67 |
| F-statistic | 39.58 | 49.09 |
| p-value | Significant ($p < 0.05$) | Extremely significant ($p < 0.01$) |
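If the "Happiness points" figures are read as typical prediction errors (an RMSE-style measure — an assumption on my part, since the metric is not named above), they can be computed with a small helper:

```python
import numpy as np

# Root-mean-squared error: the typical size of a prediction miss,
# expressed in the units of the target variable (Happiness points).
def rmse(y_true, y_pred):
    resid = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(resid ** 2)))

# Toy check: predictions off by exactly 0.5 everywhere give an RMSE of 0.5.
y = np.array([7.0, 6.0, 5.0])
print(rmse(y, y + 0.5))  # → 0.5
```

In practice `y_pred` would come from each fitted model's `.predict()`, giving one error figure per model for a like-for-like comparison.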

Were the Additional Variables Worth It?¶

The inclusion of additional variables significantly improved the model's capacity to explain national well-being. Moving from a single-variable to a multi-variable approach nearly tripled our $R^2$, jumping from 22% to roughly 64%. This confirms that happiness is a multifaceted phenomenon that cannot be captured by economic output alone.

What about GDP? In the univariate model, Log-GDP was a strong, significant predictor. However, in the multivariate model, its individual significance vanished ($p = 0.760$). This suggests that wealth acts as a "proxy": money matters primarily because it facilitates access to healthcare (Life Expectancy) and honest institutions (Corruption Index), both of which remained highly significant ($p = 0.000$).

Limitations¶

Despite the model's success, several constraints must be acknowledged:

  • Missing Data: To maintain statistical integrity, we had to drop observations like Hong Kong due to missing values in the environmental and corruption datasets. This slightly reduces the global representativeness of our findings.
  • Variable Selection: While we added three relevant variables, others such as Social Support, Personal Freedom, and Cultural Factors were not included. Their absence accounts for the remaining ~36% of unexplained variance.
  • Linearity Assumption: The model assumes a linear relationship. As seen in the initial GDP plots, some variables exhibit exponential or skewed behavior that might be better captured by non-linear models.
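As a quick illustration of the last point, a log transform can tame the right skew of a GDP-like variable, which is exactly why log_GDP behaves better than raw GDP in a linear model. The lognormal sample below is simulated, not the real GDP column:

```python
import numpy as np

rng = np.random.default_rng(2)
gdp = rng.lognormal(mean=24, sigma=1.5, size=141)  # heavily right-skewed

# Sample skewness: mean of standardized values cubed (0 = symmetric).
def skewness(x):
    x = np.asarray(x, dtype=float)
    z = (x - x.mean()) / x.std()
    return float(np.mean(z ** 3))

print(f"skew raw: {skewness(gdp):.2f}")            # large positive skew
print(f"skew log: {skewness(np.log(gdp)):.2f}")    # near zero after transform
```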

Future Work¶

To build upon this, future studies should consider:

  • Non-Linear Modeling: Implementing polynomial regression or logarithmic transformations for features like CO2 emissions to see if accuracy improves.
  • Regional Categorization: Adding "dummy variables" for geographic regions (e.g., Scandinavia vs. Sub-Saharan Africa) to account for regional happiness baselines.
  • Environmental factors: investigating which, if any, genuinely relate to happiness.
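A minimal sketch of the proposed regional dummy variables using `pd.get_dummies`; the `Region` labels here are hypothetical placeholders and would need to come from an external country-to-region mapping:

```python
import pandas as pd

# Illustrative frame: countries with an assumed region label.
df = pd.DataFrame({
    "Pais": ["Finland", "Denmark", "Nigeria", "Kenya"],
    "Region": ["Scandinavia", "Scandinavia",
               "Sub-Saharan Africa", "Sub-Saharan Africa"],
})

# drop_first=True avoids the dummy-variable trap (perfect collinearity
# between the dummies and the regression intercept).
dummies = pd.get_dummies(df["Region"], prefix="Region", drop_first=True)
encoded = pd.concat([df, dummies], axis=1)
print(encoded)
```

The resulting 0/1 columns can be appended to the feature matrix so each region gets its own happiness baseline.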

Conclusion¶

At the outset of this study, we hypothesized that "Happiness is complex and multifaceted" and that GDP alone would be insufficient. Our results have mathematically validated this expectation. While economic stability provides the necessary groundwork, physical health and institutional integrity are the true pillars of a happy society. By expanding our model, we have moved from a simple economic observation to a more profound socio-political inference: a nation's path to well-being lies not just in its production, but in the transparency of its government and the longevity of its people.

References¶

  1. https://www.britannica.com/topic/Why-Is-Finland-the-Happiest-Country-in-the-World
  2. https://www.co2meter.com/en-mx/blogs/news/dangers-of-co2-what-you-need-to-know?srsltid=AfmBOoprWZbBS3bifvuXEzEsmyweYOcJPPoyYJo-UxMt8zqMAxQuKOIO
  3. https://baselgovernance.org/blog/culture-and-corruption-complex-relationship
  4. https://www.sciencedirect.com/science/article/abs/pii/S0749379718316817
  5. https://ourworldindata.org/co2-emissions
  6. https://ourworldindata.org/corruption
  7. https://ourworldindata.org/life-expectancy