# loading packages
library(tidyverse)
library(ggplot2)
library(readxl)
library(stargazer)
# loading data and generating variables
# look at the hdma documentation for more information

loandata<- data.frame(read_excel('loandata.xlsx')) %>%
  mutate(deny=ifelse(s7==3,1,0), # =1 if the person's loan app was denied
         black=ifelse(s13==3,1,0), # =1 if the applicant was black, 0 if white
         pi_ratio=s46/100, # applicant's payment to income ratio
         married=ifelse(s23a=='M',1,0) ) %>% # =1 if applicant is married
  filter(s23a!='NA') %>% # removing missing values for married
  select('deny', 'black','pi_ratio', 'married') # keeping necessary variables
# plotting the data

ggplot(loandata,aes(x=pi_ratio, y=deny)) +
  geom_point() +
  theme_minimal() +
  labs(x="Payment/Income Ratio",
       y='=1 if Denied')

# estimating the models

# probit
probit <- glm(deny~pi_ratio+black+married,
                      family=binomial(link='probit'),
                      data=loandata)
# logit
logit <- glm(deny~pi_ratio+black+married,
                      family=binomial(link='logit'),
                      data=loandata)

# linear probability model
lpm <- lm(deny~pi_ratio+black+married,data=loandata)

# combining the results
stargazer(probit, logit, lpm, type='text')
## 
## ================================================================
##                                 Dependent variable:             
##                     --------------------------------------------
##                                         deny                    
##                      probit   logistic            OLS           
##                        (1)       (2)              (3)           
## ----------------------------------------------------------------
## pi_ratio            2.720***  5.296***          0.556***        
##                      (0.382)   (0.728)          (0.060)         
##                                                                 
## black               0.686***  1.229***          0.172***        
##                      (0.084)   (0.147)          (0.018)         
##                                                                 
## married             -0.183*** -0.324**         -0.035***        
##                      (0.071)   (0.133)          (0.013)         
##                                                                 
## Constant            -2.142*** -3.906***        -0.068***        
##                      (0.144)   (0.280)          (0.023)         
##                                                                 
## ----------------------------------------------------------------
## Observations          2,378     2,378            2,378          
## R2                                               0.079          
## Adjusted R2                                      0.077          
## Log Likelihood      -793.651  -792.632                          
## Akaike Inf. Crit.   1,595.303 1,593.265                         
## Residual Std. Error                        0.312 (df = 2374)    
## F Statistic                             67.518*** (df = 3; 2374)
## ================================================================
## Note:                                *p<0.1; **p<0.05; ***p<0.01
# predicted probabilities when black equals zero and one
# you can do the same thing for the logit results
# this is not necessary for the lpm model since the coefficients can be
# interpreted as the change in the predicted probability without additional steps
# set all variables except the variable of interest to their mean value
probit_predict <- predict(probit,
                          newdata = data.frame('black' = c(0,1),
                                               'pi_ratio' = mean(loandata$pi_ratio),
                                               'married'=mean(loandata$married)
                                               ),
                   type = "response")
# difference in the predicted probabilities
# compare this to the coefficient on black in the lpm
diff(probit_predict)
##         2 
## 0.1642222