import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import requests
from io import BytesIO

# Load the survey data from the Stata file on the local filesystem.
# A URL ending in the same econdata/cps_data/cps_msa2013.dta path is noted
# here for reference only; it is not read by this script:
#   https://bigblue.depaul.edu/jlee141/econdata/cps_data/cps_msa2013.dta
DATA_PATH = "/var/www/html/jlee141/econdata/cps_data/cps_msa2013.dta"
data = pd.read_stata(DATA_PATH)

# 1. Creating continuous education variable
# Years of schooling assigned to each 'educ92' attainment label.
education_mapping = {
    "Less than 1st grade": 0,
    "1st-4th grade": 4,
    "5th-6th grade": 6,
    "7th-8th grade": 8,
    "9th grade": 9,
    "10th grade": 10,
    "11th grade": 11,
    "12th grade-no diploma": 12,
    "HS graduate, GED": 12,
    "Some college but no degree": 14,
    "Associate degree-occupational/vocational": 14,
    "Associate degree-academic program": 14,
    "Bachelor's degree": 16,
    "Master's degree": 18,
    "Professional school": 20,
    "Doctorate": 21,
}

# Show how many rows carry each attainment label. A bare expression is
# silently discarded when this file runs as a script, so print it explicitly.
print(data['educ92'].value_counts())

# Apply the mapping to create the 'educyr' column; any label that is not a
# key of education_mapping becomes NaN.
data['educyr'] = data['educ92'].map(education_mapping)

# Row counts per 'perno' value. Not used by the education recode above; the
# name is kept at module level in case other code reads it.
frequency = data['perno'].value_counts()

# 2. Creating simple dummy variables
# 'young' is stored as 0/1 integers, while the two retirement flags are
# stored as booleans. Rows where a comparison is False (which includes rows
# with a missing value) get 0 / False.
over_65 = data['age'].gt(65)
data['young'] = data['age'].lt(25).astype(int)
data['retired_age'] = over_65
data['retired_man'] = over_65 & data['female'].eq(0)

# 3. Creating multiple dummy variables for race
# Assumes 'wbho' holds exactly these four labels -- TODO confirm against the
# data. Each dummy is 1 where 'wbho' equals its label and 0 otherwise.
race_labels = {
    'race1': "White",
    'race2': "Black",
    'race3': "Hispanic",
    'race4': "Other",
}
for dummy_name, race_label in race_labels.items():
    data[dummy_name] = data['wbho'].eq(race_label).astype(int)

# Filter and select data
# Columns carried into the regression sample, in this order.
regression_columns = [
    'hrwage', 'educ92', 'educyr', 'female',
    'race1', 'race2', 'race3', 'race4', 'wbho',
]
# Keep only rows whose hourly wage is below 1000. Rows for which the
# comparison is False, including rows with a missing wage, are dropped.
wage_below_cap = data['hrwage'].lt(1000)
indata = data.loc[wage_below_cap, regression_columns].copy()
# Missing values in the other columns are not removed at this step
# (no dropna is applied).

# 4. Regression using dummy and factor variables
#
# Every model below regresses the hourly wage in levels ('hrwage'); no log
# transform appears in any formula.
# NOTE(review): if a log-wage specification is intended, the dependent
# variable would have to be written as np.log(hrwage) and non-positive wages
# excluded first -- confirm with the author before changing it.
specifications = (
    # Model 1: education and the female indicator only
    "hrwage ~ educyr + female",
    # Model 2: adds the race dummies; race1 ("White") is left out, which
    # makes it the comparison group
    "hrwage ~ educyr + female + race2 + race3 + race4",
    # Model 3: race enters as the single term 'wbho' instead of hand-made
    # dummies (presumably expanded into category indicators by the formula
    # interface -- verify in the printed summary)
    "hrwage ~ educyr + female + wbho",
)

# Fit each specification on indata and print its summary right after fitting.
fitted_models = []
for formula in specifications:
    result = smf.ols(formula, data=indata).fit()
    print(result.summary())
    fitted_models.append(result)

model1, model2, model3 = fitted_models

# Side-by-side table of the three fits, with significance stars.
from statsmodels.iolib.summary2 import summary_col
dfoutput = summary_col([model1, model2, model3], stars=True)
print(dfoutput)