What effect does immigration have on unemployment of native-born citizens?
Over the past decade the popularity of far-right political parties with anti-immigration standpoints has risen significantly in numerous well-established democracies; examples are the German NPD, the British UKIP and the French Front National. Cesáreo Rodríguez-Aguilera summarizes the three core far-right ideas in the following list: "chauvinistic and ethnic exaltation of the nation; anti-immigrant xenophobia; and “anti-politician”, anti-establishment populism". One of their key talking points is thus opposition to immigration, partly because of its supposed impact on the labor market. An example of this is the following quote by Donald Trump: “jobs are being stolen … like candy from a baby.” However, immigration can also be necessary in some cases, such as stagnating population growth or an aging population. A negative view of immigration, if it leads to anti-immigration policies, might therefore have far-reaching long-term consequences for pension and welfare systems. We dislike political discourse that is built not on facts but merely on fearmongering. Thus in this assignment we would like to research what the relationship between immigration and native-born employment actually is.
Sources:
https://www.iemed.org/publication/the-rise-of-the-far-right-in-europe/
https://www.eyes-on-europe.eu/explaining-the-main-drivers-of-anti-immigration-attitudes-in-europe/
To answer our research question this notebook is structured as follows: we first read in and merge the relevant OECD datasets and explore them, then validate our regression methods on simulated data, and finally apply a lasso regression, a fixed effects regression and a Bayesian model to the real data before comparing the findings of the three methods and drawing a conclusion.
# importing packages
import pandas as pd
import datetime as dt
import pandas_datareader.data as web
import os
import pandasdmx as pdmx
import numpy as np
# Data
oecd = pdmx.Request("OECD")
#Reading in Employment dataset from OECD
data = oecd.data(resource_id = "MIG_NUP_RATES_GENDER", key ="AUS+AUT+BEL+CAN+CHL+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+USA.NB.TOT.N_RATE+U_RATE+P_RATE/all?startTime=2000&endTime=2019").to_pandas()
df_employment = pd.DataFrame(data).reset_index()
#Reading in Population dataset from OECD
data = oecd.data(resource_id = "HISTPOP", key ="AUS+AUT+BEL+CAN+CHL+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+USA.W+M+T.TOTAL/all?startTime=2000&endTime=2019").to_pandas()
df_population = pd.DataFrame(data).reset_index()
#Reading in Migration dataset from OECD
data = oecd.data(resource_id = "MIG", key ="TOT.B11.TOT.AUS+AUT+BEL+CAN+CHL+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+USA/all?startTime=2000&endTime=2019").to_pandas()
df_migration = pd.DataFrame(data).reset_index()
#Reading in GDP per capita dataset from OECD
data = oecd.data(resource_id = "SNA_TABLE1", key ="AUS+AUT+BEL+CAN+CHL+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+USA.B1_GE.HVPVOB/all?startTime=2000&endTime=2019").to_pandas()
df_GDPpc = pd.DataFrame(data).reset_index()
#Reading in inflation dataset from OECD
data = oecd.data(resource_id = "PRICES_CPI", key ="AUS+AUT+BEL+CAN+CHL+CZE+DNK+EST+FIN+FRA+DEU+GRC+HUN+ISL+IRL+ISR+ITA+LVA+LTU+LUX+MEX+NLD+NZL+NOR+POL+PRT+SVK+SVN+ESP+SWE+CHE+TUR+GBR+USA.CPALTT01+CP18ALTT.GY.A/all?startTime=2000&endTime=2019").to_pandas()
df_inflation = pd.DataFrame(data).reset_index()
# Data selection and merging
#Selection of only native born unemployment
df_employment = df_employment[df_employment["RATE"] == "U_RATE"]
#selection of only Total population
df_population = df_population[df_population["SEX"] == "T"]
#Dropping irrelevant columns
df_employment.drop(["BIRTH","GENDER","RATE"], axis=1, inplace = True)
df_population.drop(["SEX","AGE"], axis=1, inplace = True)
df_migration.drop(["CO2","VAR","GEN"], axis=1, inplace = True)
df_GDPpc.drop(["TRANSACT","MEASURE"], axis=1, inplace = True)
df_inflation.drop(["SUBJECT","MEASURE","FREQUENCY"], axis=1, inplace = True)
#Renaming columns
df_employment.rename(columns = {"COUNTRY": "Country", "TIME_PERIOD": "Year", "value": "Unemployment"}, inplace = True)
df_population.rename(columns = {"LOCATION": "Country", "TIME_PERIOD": "Year", "value": "Population"}, inplace = True)
df_migration.rename(columns = {"COU": "Country", "TIME_PERIOD": "Year", "value": "Migration"}, inplace = True)
df_GDPpc.rename(columns = {"LOCATION": "Country", "TIME_PERIOD": "Year", "value": "GDPpc"}, inplace = True)
df_inflation.rename(columns = {"LOCATION": "Country", "TIME_PERIOD": "Year", "value":"Inflation"}, inplace = True)
#Combining all datasets
df_combined = df_employment.merge(df_population, how = "left", on = ["Country", "Year"]).merge(df_migration, how = "left", on = ["Country", "Year"])
df_combined = df_combined.merge(df_GDPpc, how = "left", on = ["Country", "Year"]).merge(df_inflation, how = "left", on = ["Country", "Year"])
#Exploring the main dataset
df_combined.head()
|   | Country | Year | Unemployment | Population | Migration | GDPpc | Inflation |
|---|---|---|---|---|---|---|---|
| 0 | AUS | 2000 | 6.3 | 19028802.0 | 107148.0 | 37933.185775 | 4.457435 |
| 1 | AUS | 2001 | 6.7 | 19274701.0 | 127877.0 | 38952.657236 | 4.407135 |
| 2 | AUS | 2002 | 6.3 | 19495210.0 | 119080.0 | 39709.935147 | 2.981575 |
| 3 | AUS | 2003 | 5.9 | 19720737.0 | 123411.0 | 40906.749065 | 2.732596 |
| 4 | AUS | 2004 | 5.5 | 19932722.0 | 146441.0 | 41750.715304 | 2.343255 |
#Creating the variable migration as a share of population
df_combined["Migration"] = df_combined["Migration"]/df_combined["Population"]
#Dropping the population column
df_combined.drop(["Population"], axis = 1, inplace = True)
data = df_combined
# Checking missing values
print(data.isna().sum())
print(data.shape)
Country          0
Year             0
Unemployment     0
Migration       23
GDPpc            0
Inflation        0
dtype: int64
(592, 6)
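All 23 missing values sit in the Migration column. As a minimal sketch of how they are handled later in this notebook: the lasso and fixed effects regressions simply drop the incomplete rows, while the Bayesian model masks them so that PyMC3 can impute them during sampling.
# Sketch of the two strategies used later for the 23 missing Migration values.
# (1) Listwise deletion for the lasso and fixed effects regressions:
df_complete = df_combined.dropna()
# (2) Masking for the Bayesian model: NaNs become masked entries that
#     PyMC3 treats as parameters to impute during sampling.
migration_masked = np.ma.masked_invalid(df_combined["Migration"].values)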
The fixed effects regression finds a negative relationship between GDP per capita, Inflation and Migration and the outcome variable Native Born Unemployment.
The lasso regression finds that Inflation does not influence Native Born Unemployment. It also finds a negative relationship between GDP per capita and Migration and the outcome variable Native Born Unemployment.
The Bayesian model finds a negative relationship between GDP per capita, Inflation and Migration and the outcome variable Native Born Unemployment.
The negative relationship is strongest for GDP per capita; the sizes of the negative relationships for Migration and Inflation are quite similar.
We assume that migration into a country influences the unemployment of native-born people through the labor market channel. The labor market is too complex to capture in a single measure and is therefore unobserved. We also assume that other important factors influencing the labor market are inflation and GDP per capita. Finally we assume that all these effects work relatively quickly, meaning that changes in the input variables are reflected in the outcome variable within the same year.
We visualize our view of this relationship in the DAG below.
# Import necessary packages
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize=(20,10))
plt.title("DAG describing relationship between migration and native born unemployment levels")
# Create 5 nodes for the content of the DAG and 3 extra nodes to use as a legend, indicating whether a variable is the relationship of interest, unobserved or a covariate.
DAG = nx.DiGraph()
variables = np.arange(0, 8).tolist()
DAG.add_nodes_from(variables)
# Add arrows between nodes
DAG.add_edges_from([(0,3), (1,3),
(2,3), (3, 4)])
# Set node colors
colors = ["lime", "mistyrose","mistyrose", "coral",
"lime",
"lime",
"coral",
"mistyrose"]
# Set node positions
pos = {0:(1, 8),
1:(1, 6), 2:(1, 4),
3:(5, 6), 4:(8, 6),5:(8, 9),
6:(8, 8.5), 7:(8, 8)}
# Set node labels
labels = {0:"Migration",
1:"Inflation", 2: "GDPpc",
3: "Labor \nMarket \nConditions", 4: "Unemployment\n Native Born", 5:"Variables of interest",6:"Unobserved",7:"Covariates"}
# Set node sizes
sizes = [10000, 10000, 10000, 10000, 10000,1000,1000,1000]
# Draw DAG
nx.draw(DAG, pos = pos, labels = labels, arrows = True,
node_shape = "s", node_size=sizes,node_color = colors)
plt.savefig("DAG.png", format="PNG")
plt.show()
Below we plot the unemployment of native-born people against migration into the relevant country in that same year. We see no obvious visual relationship, except that high migration does not coincide with high unemployment: there are no data points in the top right of the graph.
# Import necessary packages
import matplotlib.colors as colors
import matplotlib as mpl
import matplotlib.cm as mplcm
import statsmodels.api as sm
from matplotlib import pyplot as plt
# Give every country a different color
plt.style.use('seaborn-dark-palette')
NUM_COLORS = 34
cm = plt.get_cmap('tab20b')
cNorm = colors.Normalize(vmin=0, vmax=NUM_COLORS-1)
scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=([scalarMap.to_rgba(i) for i in range(NUM_COLORS)]))
# Create figure
fig, ax1 = plt.subplots(figsize=(10, 10))
# Plot Native Born Unemployment on the y-axis and Migration on the x-axis
for country in df_combined.Country.unique():
    mask = (df_combined.Country == country)
    ax1.scatter(df_combined[mask]['Migration'], df_combined[mask]['Unemployment'], label = country)
ax1.set_xlabel('Migration')
ax1.set_ylabel("Native Born Unemployment(%)")
plt.tight_layout()
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.);
To check whether the regressions we use work correctly, we create a dataset with the same structure as the real dataset, except that the coefficients on the different determinants are known.
df_simulated = df_combined.copy()
df_simulated.head()
df_simulated.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 592 entries, 0 to 591
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Country       592 non-null    object
 1   Year          592 non-null    object
 2   Unemployment  592 non-null    float64
 3   Migration     569 non-null    float64
 4   GDPpc         592 non-null    float64
 5   Inflation     592 non-null    float64
dtypes: float64(4), object(2)
memory usage: 32.4+ KB
Here we create data for the three determinants and the target, giving us three feature columns with known coefficients on the target column. This mirrors our real data, where we have the unemployment of the native born and the three determinants used in the regression: migration as a share of the population, GDP per capita and inflation.
#creating simulated data
from sklearn import datasets
features, output, coef = datasets.make_regression(n_samples = 592, n_features = 3,
                                                  n_informative = 3, n_targets = 1,
                                                  noise = 0.0, coef = True)
#replacing columns of original dataset with simulated data
features = pd.DataFrame(features, columns=['Migration', 'GDPpc', 'Inflation'])
target = pd.DataFrame(output, columns=['Unemployment'])
#Dropping real data columns and merging the simulated data columns to df_simulated
df_simulated.drop(['Migration', 'GDPpc', 'Inflation','Unemployment'], axis = 1, inplace = True)
df_simulated = pd.concat([df_simulated,features,target], axis=1)
Below we can see that the real dataset and the simulated dataset are similar in structure, but the data points themselves differ because they are simulated randomly.
print("Simulated dataframe: \n",df_simulated.head())
print("\n\n Real dataframe: \n",df_combined.head())
Simulated dataframe:
   Country  Year  Migration     GDPpc  Inflation  Unemployment
0      AUS  2000   1.733243  0.000080   0.478075     67.662168
1      AUS  2001   0.332955 -0.834127   0.299409    -21.361405
2      AUS  2002  -1.312477 -1.925291   1.567559    -40.344644
3      AUS  2003  -0.441943 -1.607506  -0.819247   -151.702276
4      AUS  2004   1.131345 -2.052216   0.446066    -63.792617

Real dataframe:
   Country  Year  Unemployment  Migration         GDPpc  Inflation
0      AUS  2000           6.3   0.005631  37933.185775   4.457435
1      AUS  2001           6.7   0.006634  38952.657236   4.407135
2      AUS  2002           6.3   0.006108  39709.935147   2.981575
3      AUS  2003           5.9   0.006258  40906.749065   2.732596
4      AUS  2004           5.5   0.007347  41750.715304   2.343255
In the simulated-data regression we do not scale the data, so that the coefficients found by the regression can be compared directly with the coefficients used when simulating the data. We use train_test_split from sklearn to divide our data into a train and a test set. For the regression we use both the Lasso and the LassoCV function from sklearn; the LassoCV function uses cross-validation to find the optimal alpha of the model.
Below we set this optimal alpha as the alpha of the lasso regression we will do on the full dataset. The value of alpha is quite low, meaning that most of the features are important and are therefore kept in the lasso regression.
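For reference, the objective that sklearn's Lasso minimizes is

$$\min_{\beta}\; \frac{1}{2n}\,\lVert y - X\beta\rVert_2^2 + \alpha\,\lVert \beta\rVert_1$$

where $n$ is the number of samples and $\alpha$ controls how strongly the coefficients are shrunk toward zero: at $\alpha = 0$ the problem reduces to ordinary least squares, while a large $\alpha$ forces unimportant coefficients to exactly zero.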
# Set the dependent variable as y and the explanatory variables as X
df_Lasso = df_simulated
y = pd.DataFrame(df_Lasso["Unemployment"])
X = df_Lasso.drop(["Unemployment", "Country", "Year"], axis = 1)
#Check data for y and X datasets
#print(y.head())
#print(X.head())
# Import and run the function to split our data in a train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
# Import lasso models and fit the model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
# Lasso with 10 fold cross-validation
model = LassoCV(cv=10, random_state=0, max_iter=1000000)
# Fit model
model.fit(X_train, y_train)
# Print optimal alpha for our model
print(model.alpha_)
# Set the best alpha determined using cross-validation for our new lasso model
lasso_best = Lasso(alpha=model.alpha_)
lasso_best.fit(X_train, y_train)
# Store the coefficients from the lasso regression on the simulated data, to compare later with the values used to simulate the data
coef_sim = list(zip(lasso_best.coef_, X))
print(coef_sim)
# Print model R squareds
print('R squared training set', round(lasso_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(lasso_best.score(X_test, y_test)*100, 2))
#comparing true coefficients with the lasso coefficients:
coef_df = pd.DataFrame(coef, columns=['True coefficient values'])
coef_sim_df = pd.DataFrame(coef_sim, columns=['Simulated data lasso coefficients','Column'])
df_coefficients = coef_df.join(coef_sim_df)
df_coefficients = df_coefficients[['Column', 'Simulated data lasso coefficients', 'True coefficient values']]
print(df_coefficients.to_string(index=False))
0.057027570442860255
[(21.825187464590677, 'Migration'), (56.60128660649159, 'GDPpc'), (62.12052127946433, 'Inflation')]
R squared training set 100.0
R squared test set 100.0
Column Simulated data lasso coefficients True coefficient values
Migration 21.825187 21.883924
GDPpc 56.601287 56.664626
Inflation 62.120521 62.181616
Above we have printed the lasso regression coefficients on the simulated dataset. The R squared values are 100, meaning that the variance of the dependent variable is fully explained by the variance of the independent variables; this is logical because the data was simulated without noise. Beneath that is the comparison of the true coefficients in the simulated data and the coefficients the lasso regression found. From this comparison we can conclude that our lasso regression method works as intended, because the lasso coefficients on the simulated data are very close to the true coefficient values used during simulation.
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
unit_names = ['AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'CHL', 'CZE', 'DEU', 'DNK',
'ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GRC', 'HUN', 'IRL', 'ISL',
'ISR', 'ITA', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD', 'NOR', 'NZL',
'POL', 'PRT', 'SVK', 'SVN', 'SWE', 'TUR', 'USA']
unit_col_name='Country'
time_period_col_name='Year'
#Define y and X
y_var_name = 'Unemployment'
X_var_names = ['Migration','GDPpc','Inflation']
Here again we do not scale the data, so that the coefficients that are found are the same as the coefficients that were put into the data when it was simulated.
#selecting data
df_panel = df_simulated.dropna()
#looking at the correlation
print(df_panel.corr())
#Create the dummy variables, one for each country
df_dummies = pd.get_dummies(df_panel[unit_col_name])
#Join the dummies Dataframe with the panel data set
df_panel_with_dummies = df_panel.join(df_dummies)
              Migration     GDPpc  Inflation  Unemployment
Migration      1.000000 -0.019727  -0.006861      0.236185
GDPpc         -0.019727  1.000000  -0.016705      0.647171
Inflation     -0.006861 -0.016705   1.000000      0.707814
Unemployment   0.236185  0.647171   0.707814      1.000000
In the next piece of code we construct the regression equation, leaving out one of the dummy variables to prevent multicollinearity.
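Written out, the least squares dummy variable (LSDV) specification constructed below is

$$\text{Unemployment}_{it} = \beta_0 + \beta_1\,\text{Migration}_{it} + \beta_2\,\text{GDPpc}_{it} + \beta_3\,\text{Inflation}_{it} + \sum_{c} \gamma_c D_{c} + \varepsilon_{it}$$

where $D_c$ is the dummy for country $c$ and the last country in unit_names (USA) is omitted, acting as the reference category absorbed by the intercept.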
lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
print('Regression expression for OLS with dummies=' + lsdv_expr)
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies)
lsdv_model_results = lsdv_model.fit()
print('===============================================================================')
print('============================== OLSR With Dummies ==============================')
print(lsdv_model_results.summary())
print('LSDV='+str(lsdv_model_results.ssr))
Regression expression for OLS with dummies=Unemployment ~ Migration + GDPpc + Inflation + AUS + AUT + BEL + CAN + CHE + CHL + CZE + DEU + DNK + ESP + EST + FIN + FRA + GBR + GRC + HUN + IRL + ISL + ISR + ITA + LTU + LUX + LVA + MEX + NLD + NOR + NZL + POL + PRT + SVK + SVN + SWE + TUR
===============================================================================
============================== OLSR With Dummies ==============================
OLS Regression Results
==============================================================================
Dep. Variable: Unemployment R-squared: 1.000
Model: OLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 1.931e+29
Date: Fri, 26 Aug 2022 Prob (F-statistic): 0.00
Time: 19:18:21 Log-Likelihood: 15685.
No. Observations: 592 AIC: -3.130e+04
Df Residuals: 555 BIC: -3.113e+04
Df Model: 36
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 1.137e-13 1.75e-13 0.650 0.516 -2.3e-13 4.57e-13
Migration 21.8839 3.4e-14 6.44e+14 0.000 21.884 21.884
GDPpc 56.6646 3.3e-14 1.72e+15 0.000 56.665 56.665
Inflation 62.1816 3.37e-14 1.85e+15 0.000 62.182 62.182
AUS 7.105e-14 2.48e-13 0.287 0.774 -4.16e-13 5.58e-13
AUT -9.948e-14 2.48e-13 -0.402 0.688 -5.86e-13 3.87e-13
BEL -1.066e-14 2.47e-13 -0.043 0.966 -4.96e-13 4.75e-13
CAN -3.908e-14 2.85e-13 -0.137 0.891 -5.99e-13 5.21e-13
CHE -8.527e-14 2.51e-13 -0.340 0.734 -5.78e-13 4.08e-13
CHL 3.02e-14 3.63e-13 0.083 0.934 -6.82e-13 7.43e-13
CZE -4.885e-14 2.5e-13 -0.196 0.845 -5.39e-13 4.42e-13
DEU 2.665e-14 2.48e-13 0.107 0.915 -4.61e-13 5.15e-13
DNK 3.197e-14 2.49e-13 0.129 0.898 -4.56e-13 5.2e-13
ESP 2.132e-14 2.46e-13 0.087 0.931 -4.62e-13 5.04e-13
EST 1.776e-15 2.47e-13 0.007 0.994 -4.84e-13 4.87e-13
FIN 0 2.47e-13 0 1.000 -4.85e-13 4.85e-13
FRA 6.395e-14 2.47e-13 0.259 0.796 -4.22e-13 5.5e-13
GBR 4.974e-14 2.47e-13 0.201 0.840 -4.35e-13 5.35e-13
GRC 1.421e-14 2.46e-13 0.058 0.954 -4.69e-13 4.97e-13
HUN 3.197e-14 2.47e-13 0.129 0.897 -4.54e-13 5.18e-13
IRL 8.171e-14 2.47e-13 0.331 0.741 -4.04e-13 5.67e-13
ISL 6.75e-14 2.47e-13 0.273 0.785 -4.18e-13 5.53e-13
ISR -9.237e-14 3.25e-13 -0.284 0.777 -7.32e-13 5.47e-13
ITA 3.109e-14 2.47e-13 0.126 0.900 -4.54e-13 5.16e-13
LTU 5.684e-14 5.8e-13 0.098 0.922 -1.08e-12 1.2e-12
LUX 1.066e-14 2.46e-13 0.043 0.966 -4.74e-13 4.95e-13
LVA -3.908e-14 4.82e-13 -0.081 0.935 -9.85e-13 9.07e-13
MEX -5.684e-14 2.67e-13 -0.213 0.832 -5.82e-13 4.68e-13
NLD 6.928e-14 2.47e-13 0.280 0.779 -4.16e-13 5.55e-13
NOR 8.704e-14 2.46e-13 0.353 0.724 -3.97e-13 5.71e-13
NZL 7.816e-14 2.54e-13 0.308 0.758 -4.2e-13 5.76e-13
POL -6.217e-14 2.49e-13 -0.249 0.803 -5.52e-13 4.28e-13
PRT 2.132e-14 2.47e-13 0.086 0.931 -4.63e-13 5.06e-13
SVK -7.816e-14 2.51e-13 -0.312 0.755 -5.7e-13 4.14e-13
SVN 1.066e-14 2.47e-13 0.043 0.966 -4.75e-13 4.96e-13
SWE 3.819e-14 2.46e-13 0.155 0.877 -4.45e-13 5.22e-13
TUR 5.773e-14 2.86e-13 0.202 0.840 -5.04e-13 6.2e-13
==============================================================================
Omnibus: 2.879 Durbin-Watson: 1.861
Prob(Omnibus): 0.237 Jarque-Bera (JB): 2.643
Skew: -0.094 Prob(JB): 0.267
Kurtosis: 2.733 Cond. No. 33.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
LSDV=3.3580091823871685e-22
In the above regression results the coefficients of Migration, GDP per capita and Inflation match the true coefficients that were put into the simulated data. For comparison, below are the coefficients the lasso regression found on the same simulated data:
print(coef_sim_df)
   Simulated data lasso coefficients     Column
0                          21.825187  Migration
1                          56.601287      GDPpc
2                          62.120521  Inflation
To test whether the fixed effects model is actually better than a pooled OLSR model, we perform an F-test.
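The F-statistic computed in the code below is the standard restricted-versus-unrestricted comparison, with the pooled OLSR model as the restricted model and the fixed effects (LSDV) model as the unrestricted one:

$$F = \frac{(SSR_{restricted} - SSR_{unrestricted})/(k_2 - k_1)}{SSR_{unrestricted}/(N - k_2)}$$

where $k_1$ and $k_2$ are the numbers of parameters of the restricted and unrestricted models and $N$ is the total number of observations.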
#First build and fit a Pooled OLSR model on the panel data set
pooled_y=df_panel[y_var_name]
pooled_X=df_panel[X_var_names]
pooled_X = sm.add_constant(pooled_X)
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
pooled_olsr_model_results = pooled_olsr_model.fit()
#Setup the variables for calculating the F-test
#n=number of groups
n=len(unit_names)
#T=number of time periods per unit
T=df_panel.shape[0]/n
#N=total number of rows in the panel data set
N=n*T
#k=number of regression variables of the Pooled OLS model
k=len(X_var_names)+1
#Get the Residual Sum of Squares for the Pooled OLS model
ssr_restricted_model = pooled_olsr_model_results.ssr
#Get the Residual Sum of Squares for the Fixed Effects model
ssr_unrestricted_model = lsdv_model_results.ssr
#Get the degrees of freedom of the Pooled OLSR model
k1 = len(pooled_olsr_model_results.params)
#Get the degrees of freedom of the Fixed Effects model
k2 = len(lsdv_model_results.params)
#Calculate the F statistic
f_statistic = ((ssr_restricted_model - ssr_unrestricted_model)/ssr_unrestricted_model)*((N-k2)/(
k2-k1))
print('F-statistic for FE model='+str(f_statistic))
#Calculate the critical value at alpha=.05
alpha=0.05
f_critical_value=st.f.ppf((1.0-alpha), (k2-k1), (N-k2))
print('F test critical value at alpha=0.05='+str(f_critical_value))
F-statistic for FE model=-16.8069857309734
F test critical value at alpha=0.05=1.4575225366557338
Here the F-statistic of the fixed effects model is negative, which is an artifact of the noise-free simulated data: the fixed effects model fits it essentially perfectly (its residual sum of squares is ≈ 3.4e-22, see above), so the ratio is numerically unstable and the F-test is not informative on this dataset. We repeat the test on the real data further below.
Below we start with a lasso regression on the whole dataset. After that we run a fixed effects regression for panel data to take country fixed effects into account. We end with a Bayesian analysis.
The lasso regression starts by selecting X and y from the dataset. As we research the effect of multiple determinants on the unemployment of the native born, the y variable is native-born unemployment and X contains the other variables. We drop Country and Year, as those are descriptive columns, not measured variables.
We start with the lasso regression on the complete dataset, pooling all years together.
#For the pooled Lasso regression we delete the rows that contain NaN values
df_Lasso = df_combined.dropna()
y = pd.DataFrame(df_Lasso["Unemployment"])
X = df_Lasso.drop(["Unemployment", "Country", "Year"], axis = 1)
#Check data for y and X datasets
print(y.head())
print(X.head())
   Unemployment
0           6.3
1           6.7
2           6.3
3           5.9
4           5.5
   Migration         GDPpc  Inflation
0   0.005631  37933.185775   4.457435
1   0.006634  38952.657236   4.407135
2   0.006108  39709.935147   2.981575
3   0.006258  40906.749065   2.732596
4   0.007347  41750.715304   2.343255
Below we import the train_test_split function to split the dataset into train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
Here we import the StandardScaler function to scale the train and test datasets. The standard scaler standardizes the data by removing the mean and scaling the result to unit variance.
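Concretely, each feature $x$ is transformed as

$$z = \frac{x - \mu_{train}}{\sigma_{train}}$$

where the mean $\mu_{train}$ and standard deviation $\sigma_{train}$ are estimated on the training set only and then reused for the test set, so that no information from the test set leaks into the scaling.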
from sklearn.preprocessing import StandardScaler
#scaling y
scaler = StandardScaler().fit(np.asarray(y_train['Unemployment']).reshape(-1,1))
y_train['Unemployment'] = scaler.transform(np.asarray(y_train['Unemployment']).reshape(-1,1))
y_test['Unemployment'] = scaler.transform(np.asarray(y_test['Unemployment']).reshape(-1,1))
#scaling X
#List of numerical columns
list_numerical = [ 'Migration',
'GDPpc',
'Inflation']
scaler = StandardScaler().fit(X_train[list_numerical])
X_train[list_numerical] = scaler.transform(X_train[list_numerical])
X_test[list_numerical] = scaler.transform(X_test[list_numerical])
The code continues with the lasso regression itself. For this we use both the Lasso and LassoCV functions from sklearn. The LassoCV function uses cross-validation to find the optimal alpha of the model.
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
# Lasso with 5 fold cross-validation
model = LassoCV(cv=5, random_state=0, max_iter=1000000)
# Fit model
model.fit(X_train, y_train)
LassoCV(cv=5, max_iter=1000000, random_state=0)
Optimal model alpha:
print(model.alpha_)
0.0729890061384064
Below we set the optimal alpha as the alpha of the lasso regression we will do on the full dataset. This alpha is not low, meaning that there are features in the data which are not important.
# Set best alpha
lasso_best = Lasso(alpha=model.alpha_)
lasso_best.fit(X_train, y_train)
Lasso(alpha=0.0729890061384064)
Lasso regression coefficients on the pooled dataset:
coef_ = list(zip(lasso_best.coef_, X))
print(coef_)
[(-0.11925072109542673, 'Migration'), (-0.28712408837860676, 'GDPpc'), (-0.0, 'Inflation')]
Here the coefficient value of Inflation is 0, meaning that with the optimal value of alpha this feature is not important for the regression.
print('R squared training set', round(lasso_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(lasso_best.score(X_test, y_test)*100, 2))
R squared training set 20.64
R squared test set 16.79
The R squared values are low, meaning that the variance of the dependent variable (Unemployment of the native born) is not well explained by the variance of the independent variables (Migration, GDPpc and Inflation).
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
import seaborn as sns
unit_names = ['AUS', 'AUT', 'BEL', 'CAN', 'CHE', 'CHL', 'CZE', 'DEU', 'DNK',
'ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GRC', 'HUN', 'IRL', 'ISL',
'ISR', 'ITA', 'LTU', 'LUX', 'LVA', 'MEX', 'NLD', 'NOR', 'NZL',
'POL', 'PRT', 'SVK', 'SVN', 'SWE', 'TUR', 'USA']
unit_col_name='Country'
time_period_col_name='Year'
#Define y and X
y_var_name = 'Unemployment'
X_var_names = ['Migration','GDPpc','Inflation']
#selecting data
df_panel = df_combined.dropna()
#Scaling the features and the outcome variable
from sklearn.preprocessing import StandardScaler
features_to_scale = ['Migration','GDPpc','Inflation','Unemployment']
df_panel[features_to_scale] = StandardScaler().fit_transform(df_panel[features_to_scale])
#looking at the correlation
print(df_panel.corr())
#Create the dummy variables, one for each country
df_dummies = pd.get_dummies(df_panel[unit_col_name])
#Join the dummies Dataframe with the panel data set
df_panel_with_dummies = df_panel.join(df_dummies)
              Unemployment  Migration     GDPpc  Inflation
Unemployment      1.000000  -0.382632 -0.441784  -0.050550
Migration        -0.382632   1.000000  0.738889  -0.083024
GDPpc            -0.441784   0.738889  1.000000  -0.214777
Inflation        -0.050550  -0.083024 -0.214777   1.000000
In the next piece of code we again construct the regression equation, leaving out one of the dummy variables to prevent multicollinearity.
lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
print('Regression expression for OLS with dummies=' + lsdv_expr)
lsdv_model = smf.ols(formula=lsdv_expr, data=df_panel_with_dummies)
lsdv_model_results = lsdv_model.fit()
print('===============================================================================')
print('============================== OLSR With Dummies ==============================')
print(lsdv_model_results.summary())
print('LSDV='+str(lsdv_model_results.ssr))
Regression expression for OLS with dummies=Unemployment ~ Migration + GDPpc + Inflation + AUS + AUT + BEL + CAN + CHE + CHL + CZE + DEU + DNK + ESP + EST + FIN + FRA + GBR + GRC + HUN + IRL + ISL + ISR + ITA + LTU + LUX + LVA + MEX + NLD + NOR + NZL + POL + PRT + SVK + SVN + SWE + TUR
===============================================================================
============================== OLSR With Dummies ==============================
OLS Regression Results
==============================================================================
Dep. Variable: Unemployment R-squared: 0.711
Model: OLS Adj. R-squared: 0.691
Method: Least Squares F-statistic: 36.29
Date: Fri, 26 Aug 2022 Prob (F-statistic): 1.43e-119
Time: 19:18:22 Log-Likelihood: -454.59
No. Observations: 569 AIC: 983.2
Df Residuals: 532 BIC: 1144.
Df Model: 36
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.3576 0.166 2.159 0.031 0.032 0.683
Migration -0.1745 0.061 -2.853 0.004 -0.295 -0.054
GDPpc -1.0470 0.126 -8.316 0.000 -1.294 -0.800
Inflation -0.2095 0.031 -6.716 0.000 -0.271 -0.148
AUS -0.5743 0.201 -2.860 0.004 -0.969 -0.180
AUT -0.6320 0.206 -3.064 0.002 -1.037 -0.227
BEL -0.4677 0.203 -2.304 0.022 -0.866 -0.069
CAN -0.4028 0.225 -1.786 0.075 -0.846 0.040
CHE -0.0372 0.204 -0.182 0.856 -0.439 0.364
CHL -1.4256 0.361 -3.946 0.000 -2.135 -0.716
CZE -1.4376 0.249 -5.784 0.000 -1.926 -0.949
DEU -0.4057 0.211 -1.919 0.056 -0.821 0.010
DNK -0.6018 0.185 -3.246 0.001 -0.966 -0.238
ESP 1.1528 0.248 4.646 0.000 0.665 1.640
EST -1.0155 0.266 -3.824 0.000 -1.537 -0.494
FIN -0.1777 0.196 -0.907 0.365 -0.562 0.207
FRA -0.3754 0.204 -1.836 0.067 -0.777 0.026
GBR -0.8665 0.208 -4.169 0.000 -1.275 -0.458
GRC 1.1085 0.267 4.156 0.000 0.585 1.632
HUN -1.3301 0.275 -4.840 0.000 -1.870 -0.790
IRL 0.7852 0.187 4.197 0.000 0.418 1.153
ISL -0.4262 0.224 -1.899 0.058 -0.867 0.015
ISR -1.4347 0.271 -5.302 0.000 -1.966 -0.903
ITA -0.1424 0.213 -0.669 0.504 -0.561 0.276
LTU -1.0701 0.441 -2.424 0.016 -1.937 -0.203
LUX 3.2768 0.394 8.308 0.000 2.502 4.052
LVA -1.1580 0.394 -2.943 0.003 -1.931 -0.385
MEX -2.5281 0.317 -7.971 0.000 -3.151 -1.905
NLD -0.7843 0.185 -4.237 0.000 -1.148 -0.421
NOR -0.2509 0.183 -1.373 0.170 -0.610 0.108
NZL -1.0172 0.300 -3.389 0.001 -1.607 -0.428
POL -0.7752 0.287 -2.700 0.007 -1.339 -0.211
PRT -0.7019 0.256 -2.737 0.006 -1.206 -0.198
SVK -0.0708 0.267 -0.265 0.791 -0.596 0.454
SVN -1.0328 0.272 -3.796 0.000 -1.567 -0.498
SWE -0.6196 0.199 -3.113 0.002 -1.011 -0.229
TUR 0.6481 0.362 1.788 0.074 -0.064 1.360
==============================================================================
Omnibus: 39.922 Durbin-Watson: 0.365
Prob(Omnibus): 0.000 Jarque-Bera (JB): 99.234
Skew: 0.353 Prob(JB): 2.83e-22
Kurtosis: 4.921 Cond. No. 64.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
LSDV=164.65613850356738
To test whether the fixed effects model is actually better than a pooled OLSR model, we again perform an F-test.
#First build and fit a Pooled OLSR model on the panel data set
pooled_y=df_panel[y_var_name]
pooled_X=df_panel[X_var_names]
pooled_X = sm.add_constant(pooled_X)
pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
pooled_olsr_model_results = pooled_olsr_model.fit()
#Setup the variables for calculating the F-test
#n=number of groups
n=len(unit_names)
#T=number of time periods per unit
T=df_panel.shape[0]/n
#N=total number of rows in the panel data set
N=n*T
#k=number of regression variables of the Pooled OLS model
k=len(X_var_names)+1
#Get the Residual Sum of Squares for the Pooled OLS model
ssr_restricted_model = pooled_olsr_model_results.ssr
#Get the Residual Sum of Squares for the Fixed Effects model
ssr_unrestricted_model = lsdv_model_results.ssr
#Get the degrees of freedom of the Pooled OLSR model
k1 = len(pooled_olsr_model_results.params)
#Get the degrees of freedom of the Fixed Effects model
k2 = len(lsdv_model_results.params)
#Calculate the F statistic
f_statistic = ((ssr_restricted_model - ssr_unrestricted_model)/ssr_unrestricted_model)*((N-k2)/(
k2-k1))
print('F-statistic for FE model='+str(f_statistic))
#Calculate the critical value at alpha=.05
alpha=0.05
f_critical_value=st.f.ppf((1.0-alpha), (k2-k1), (N-k2))
print('F test critical value at alpha=0.05='+str(f_critical_value))
F-statistic for FE model=27.23218251543961
F test critical value at alpha=0.05=1.4584349540049237
Here the F-statistic of the fixed effects model is higher than the F-test critical value, meaning that the fixed effects model has a better goodness of fit than the pooled OLSR model.
# Import packages
import pymc3 as pm
import numpy as np
# We define two different functions to standardize our input data.
#The first is for the variables that do not have missing values, the second one can handle missing values, which is necessary for the Migration data.
def standardize(x):
    return (x - x.mean()) / x.std()

def standardize_ma(x):
    x_ma = np.ma.masked_invalid(x)
    return (x_ma - x_ma.mean()) / x_ma.std()
# Run functions to standardize inputs
Unemployment = standardize(df_combined['Unemployment'])
GDPpc = standardize(df_combined['GDPpc'])
Inflation = standardize(df_combined['Inflation'])
Migration = standardize_ma(df_combined['Migration'])
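Written out, the model we declare below is

$$\begin{aligned} \beta_{Inflation},\, \beta_{GDPpc},\, \beta_{Migration} &\sim \mathcal{N}(0,\, 0.1^2), \qquad c \sim \mathcal{N}(0,\, 1), \qquad \sigma \sim \text{HalfNormal}(1),\\ \mu_i &= c + \beta_{Inflation}\,\text{Inflation}_i + \beta_{GDPpc}\,\text{GDPpc}_i + \beta_{Migration}\,\text{Migration}_i,\\ \text{Unemployment}_i &\sim \mathcal{N}(\mu_i,\, \sigma^2), \end{aligned}$$

with the missing Migration values treated as extra parameters with a standard normal prior, which PyMC3 imputes during sampling.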
# Declare Bayesian model
with pm.Model() as normal_missing:
    # Initialize priors on the coefficients; we set a low σ_prior to prevent overfitting.
    constant = pm.Normal('constant', mu = 0.0, sd = 1.0)
    σ_prior = 0.1
    # We define our coefficients as normally distributed with mean 0 and the aforementioned tight σ_prior
    b_inflation = pm.Normal('b_inflation', mu = 0, sd = σ_prior)
    b_GDPpc = pm.Normal('b_GDPpc', mu = 0, sd = σ_prior)
    b_Migration = pm.Normal('b_Migration', mu = 0, sd = σ_prior)
    # For Migration we use the observed values; the missing entries are initialized as also being normally distributed
    Migration = pm.Normal('Migration', mu = 0, sd = 1.0, observed = Migration)
    # Our model formula: Unemployment is explained by Inflation, GDP per capita and Migration
    μ = constant + b_inflation*Inflation + b_GDPpc*GDPpc + b_Migration*Migration
    σ = pm.HalfNormal('σ', 1)
    unemployment = pm.Normal('Unemployment', μ, σ, observed=Unemployment)
    trace_normal_missing = pm.sample(cores = 1)

# For the missing values we sample Migration from the posterior predictive distribution
with normal_missing:
    ppc_normal_missing = pm.sample_posterior_predictive(trace_normal_missing, var_names=['Migration'])
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [σ, Migration_missing, b_Migration, b_GDPpc, b_inflation, constant]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 6574 seconds.
import arviz as az
# Plot the trace plots and a summary of the model's results
data_posterior_normal_missing = az.from_pymc3(trace_normal_missing, posterior_predictive = ppc_normal_missing)
variables_unemployment = ['b_inflation','b_GDPpc', 'b_Migration']
az.plot_trace(trace_normal_missing, var_names = variables_unemployment)
az.summary(data_posterior_normal_missing.posterior ,variables_unemployment)
|   | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| b_inflation | -0.106 | 0.034 | -0.171 | -0.044 | 0.001 | 0.000 | 3861.0 | 1663.0 | 1.0 |
| b_GDPpc | -0.348 | 0.045 | -0.431 | -0.263 | 0.001 | 0.001 | 3497.0 | 1680.0 | 1.0 |
| b_Migration | -0.119 | 0.044 | -0.202 | -0.036 | 0.001 | 0.001 | 4094.0 | 1712.0 | 1.0 |
data_posterior_normal_missing
[InferenceData output truncated. The repr lists five groups:
posterior (chain: 2, draw: 1000) with constant, b_inflation, b_GDPpc, b_Migration, σ and the 23 imputed Migration_missing values;
posterior_predictive with Migration (592 observations);
log_likelihood with Migration and Unemployment;
sample_stats (energy, lp, acceptance_rate, tree_depth, step_size, n_steps, diverging, ...);
observed_data with Migration and Unemployment (592 observations each).]
import matplotlib.colors as colors
# Bayesian analysis Graph
percentiles = np.percentile(data_posterior_normal_missing.posterior_predictive.Migration,[2.5,97.5],axis=[0,1])
# Create different colors for all countries
plt.style.use('seaborn-dark-palette')
NUM_COLORS = 34
cm = plt.get_cmap('tab20b')
cNorm = colors.Normalize(vmin=0, vmax=NUM_COLORS-1)
scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=([scalarMap.to_rgba(i) for i in range(NUM_COLORS)]))
# Create figure
fig, ax = plt.subplots(figsize=(13,5))
# Plot all 95% percentile ranges and data points
for i, country in enumerate(df_combined.Country.unique()):
    mask = (df_combined.Country == country)
    # Use the same color for each country's interval lines and scatter points
    color = scalarMap.to_rgba(i)
    ax.vlines(Migration[mask], percentiles[0,:][mask], percentiles[1,:][mask], color = color, alpha = 0.3)
    ax.scatter(Migration[mask], Unemployment[mask], label = country, color = color)
# Set labels
ax.set_xlabel('Migration')
ax.set_ylabel("Native Born Unemployment(%)")
# Add legend
plt.tight_layout()
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.);
Most of the points lie within the lines, meaning that most of the observations fall within the 95% posterior predictive interval; at the top left, however, we do have some observations that do not fit within their intervals. This could indicate that not all countries fit the model completely.
We have used three different methods to estimate our results. Two of them we tested on simulated data, where they recovered the coefficients the simulated dataset was created with. For the Bayesian analysis we looked at the trace plots and r_hat values to check that our results are valid. In the discussion we compare the findings from the different methods and reach a final conclusion.
# We once again print the results from our Lasso regression, Fixed effects regression and Bayesian model
print("Lasso regression results:")
print(coef_)
print('R squared training set', round(lasso_best.score(X_train, y_train)*100, 2))
print('R squared test set', round(lasso_best.score(X_test, y_test)*100, 2))
print("Fixed effects regression results:")
print(lsdv_model_results.summary())
Lasso regression results:
[(-0.11925072109542662, 'Migration'), (-0.2871240883786068, 'GDPpc'), (-0.0, 'Inflation')]
R squared training set 20.64
R squared test set 16.79
Fixed effects regression results:
OLS Regression Results
==============================================================================
Dep. Variable: Unemployment R-squared: 0.711
Model: OLS Adj. R-squared: 0.691
Method: Least Squares F-statistic: 36.29
Date: Fri, 26 Aug 2022 Prob (F-statistic): 1.43e-119
Time: 19:57:25 Log-Likelihood: -454.59
No. Observations: 569 AIC: 983.2
Df Residuals: 532 BIC: 1144.
Df Model: 36
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.3576 0.166 2.159 0.031 0.032 0.683
Migration -0.1745 0.061 -2.853 0.004 -0.295 -0.054
GDPpc -1.0470 0.126 -8.316 0.000 -1.294 -0.800
Inflation -0.2095 0.031 -6.716 0.000 -0.271 -0.148
AUS -0.5743 0.201 -2.860 0.004 -0.969 -0.180
AUT -0.6320 0.206 -3.064 0.002 -1.037 -0.227
BEL -0.4677 0.203 -2.304 0.022 -0.866 -0.069
CAN -0.4028 0.225 -1.786 0.075 -0.846 0.040
CHE -0.0372 0.204 -0.182 0.856 -0.439 0.364
CHL -1.4256 0.361 -3.946 0.000 -2.135 -0.716
CZE -1.4376 0.249 -5.784 0.000 -1.926 -0.949
DEU -0.4057 0.211 -1.919 0.056 -0.821 0.010
DNK -0.6018 0.185 -3.246 0.001 -0.966 -0.238
ESP 1.1528 0.248 4.646 0.000 0.665 1.640
EST -1.0155 0.266 -3.824 0.000 -1.537 -0.494
FIN -0.1777 0.196 -0.907 0.365 -0.562 0.207
FRA -0.3754 0.204 -1.836 0.067 -0.777 0.026
GBR -0.8665 0.208 -4.169 0.000 -1.275 -0.458
GRC 1.1085 0.267 4.156 0.000 0.585 1.632
HUN -1.3301 0.275 -4.840 0.000 -1.870 -0.790
IRL 0.7852 0.187 4.197 0.000 0.418 1.153
ISL -0.4262 0.224 -1.899 0.058 -0.867 0.015
ISR -1.4347 0.271 -5.302 0.000 -1.966 -0.903
ITA -0.1424 0.213 -0.669 0.504 -0.561 0.276
LTU -1.0701 0.441 -2.424 0.016 -1.937 -0.203
LUX 3.2768 0.394 8.308 0.000 2.502 4.052
LVA -1.1580 0.394 -2.943 0.003 -1.931 -0.385
MEX -2.5281 0.317 -7.971 0.000 -3.151 -1.905
NLD -0.7843 0.185 -4.237 0.000 -1.148 -0.421
NOR -0.2509 0.183 -1.373 0.170 -0.610 0.108
NZL -1.0172 0.300 -3.389 0.001 -1.607 -0.428
POL -0.7752 0.287 -2.700 0.007 -1.339 -0.211
PRT -0.7019 0.256 -2.737 0.006 -1.206 -0.198
SVK -0.0708 0.267 -0.265 0.791 -0.596 0.454
SVN -1.0328 0.272 -3.796 0.000 -1.567 -0.498
SWE -0.6196 0.199 -3.113 0.002 -1.011 -0.229
TUR 0.6481 0.362 1.788 0.074 -0.064 1.360
==============================================================================
Omnibus: 39.922 Durbin-Watson: 0.365
Prob(Omnibus): 0.000 Jarque-Bera (JB): 99.234
Skew: 0.353 Prob(JB): 2.83e-22
Kurtosis: 4.921 Cond. No. 64.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The fixed effects regression finds a negative relationship between GDP per capita, Inflation and Migration and the outcome variable Native Born Unemployment. All three coefficients found are statistically significant.
The lasso regression finds that Inflation does not influence Native Born Unemployment. It also finds a negative relationship between GDP per capita and Migration and the outcome variable Native Born Unemployment. However, the R-squared values are quite low, indicating that the variance of the dependent variable (Unemployment of the native born) is not well explained by the variance of the independent variables (Migration, GDPpc and Inflation).
The Bayesian model finds a negative relationship between GDP per capita, Inflation and Migration and the outcome variable Native Born Unemployment.
The negative relationship is strongest for GDP per capita, which makes sense, as a high GDP per capita indicates a strong economy, which can lead to low unemployment.
For the Bayesian model and the lasso regression the effect sizes of Migration are almost exactly equal at -0.119.
Overall we can conclude that while our coefficients are not exactly equal across the different methods, they all have the same negative sign. The negative relationship between Inflation and Native Born Unemployment is surprising, as inflation is not normally thought to have a positive effect; however, very low inflation can indicate economic stagnation, which can weaken the labor market and thus increase Native Born Unemployment.
Finally, we do find that more migration is correlated with lower Native Born Unemployment, but this does not necessarily imply that more migration decreases unemployment. Another factor could both attract immigrants and lower Native Born Unemployment, for example a country's economy doing well. To conclude: we do not find definitive evidence to disprove the far-right talking points mentioned in the introduction, but we certainly do not find evidence that proves their point either.
DAG code: https://mungingdata.com/python/dag-directed-acyclic-graph-networkx/
Bayesian analysis code: https://github.com/janboone/msc_datascience/blob/master/hacking_part_2.ipynb
Lasso regression code: https://www.kirenz.com/post/2019-08-12-python-lasso-regression-auto/#lasso-regression
Fixed effects regression: https://timeseriesreasoning.com/contents/the-fixed-effects-regression-model-for-panel-data-sets/