"""Wage regressions on the 2013 CPS MSA extract.

Steps:
  1. Convert the categorical ``educ92`` attainment labels to years of schooling.
  2. Build simple age/sex indicator variables.
  3. Build one indicator per ``wbho`` race level.
  4. Fit three OLS models of hourly wage and print them individually and
     side by side.
"""
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import requests
from io import BytesIO
from statsmodels.iolib.summary2 import summary_col

# Load data from the local copy of the file.
# Public URL of the same file, kept for reference:
# url = "https://bigblue.depaul.edu/jlee141/econdata/cps_data/cps_msa2013.dta"
DATA_PATH = "/var/www/html/jlee141/econdata/cps_data/cps_msa2013.dta"
data = pd.read_stata(DATA_PATH)

# 1. Creating continuous education variable
# Maps each educ92 label to an approximate number of years of schooling.
education_mapping = {
    "Less than 1st grade": 0,
    "1st-4th grade": 4,
    "5th-6th grade": 6,
    "7th-8th grade": 8,
    "9th grade": 9,
    "10th grade": 10,
    "11th grade": 11,
    "12th grade-no diploma": 12,
    "HS graduate, GED": 12,
    "Some college but no degree": 14,
    "Associate degree-occupational/vocational": 14,
    "Associate degree-academic program": 14,
    "Bachelor's degree": 16,
    "Master's degree": 18,
    "Professional school": 20,
    "Doctorate": 21,
}

# Frequency table of the raw labels, kept in a name so it can be inspected
# (the bare expression was previously computed and thrown away).
educ92_counts = data["educ92"].value_counts()

# Apply the mapping to create the 'educyr' column. Labels missing from the
# mapping become NaN. The explicit float cast guarantees a numeric dtype so
# the formula interface treats educyr as continuous rather than as a factor.
data["educyr"] = data["educ92"].map(education_mapping).astype(float)

frequency = data["perno"].value_counts()

# 2. Creating simple dummy variables
data["young"] = np.where(data["age"] < 25, 1, 0)
# NOTE: the two retirement indicators below are boolean columns, whereas
# 'young' is stored as 0/1 integers.
data["retired_age"] = data["age"] > 65
data["retired_man"] = (data["age"] > 65) & (data["female"] == 0)

# 3. Creating multiple dummy variables for race
# Assuming 'wbho' is coded as a categorical variable with levels for race
race_levels = {
    "race1": "White",
    "race2": "Black",
    "race3": "Hispanic",
    "race4": "Other",
}
for dummy_name, level in race_levels.items():
    data[dummy_name] = np.where(data["wbho"] == level, 1, 0)

# Filter and select data
model_columns = [
    "hrwage",
    "educ92",
    "educyr",
    "female",
    "race1",
    "race2",
    "race3",
    "race4",
    "wbho",
]
indata = data[model_columns].copy()
# Keep hourly wages below 1000; rows with a missing hrwage fail the
# comparison and are dropped as well.
indata = indata[indata["hrwage"] < 1000]
# indata = indata.dropna()
# NOTE(review): without the dropna above, each model relies on the formula
# interface's own handling of missing values, so the estimation samples may
# differ between models (e.g. rows with a missing 'wbho') -- confirm.

# 4. Regression using dummy and factor variables
# Model 1: Simple regression of hourly wage (in levels, not logs) on
# education and the female indicator
model1 = smf.ols("hrwage ~ educyr + female", data=indata).fit()
print(model1.summary())

# Model 2: Including dummy variables for race (race1 / "White" is the
# omitted reference group)
model2 = smf.ols(
    "hrwage ~ educyr + female + race2 + race3 + race4", data=indata
).fit()
print(model2.summary())

# Model 3: Using factor for race (the formula interface expands 'wbho'
# into indicators itself)
model3 = smf.ols("hrwage ~ educyr + female + wbho", data=indata).fit()
print(model3.summary())

# Side-by-side comparison table of the three models with significance stars
dfoutput = summary_col([model1, model2, model3], stars=True)
print(dfoutput)