# changing the directory 
# this is the path that contains the excel file you want to load
setwd('/Users/stp48131/Library/CloudStorage/Dropbox/WKU/Teaching/ECON_307/Class_Materials/Honors/Regression')

# loading packages
library(readxl) # loads excel files
library(ggplot2) # plotting
# loading the excel file and assigning it to the data frame named data

house_data <- read_excel("Regression_R.xlsx")

Assume we want to estimate a regression where the dependent variable is the house price and the independent variable is square footage. The regression equation is: \(Price=a+b*sqft+\epsilon\) where \(\epsilon\) contains all determinants of price except for square footage.

# this estimates the regression and stores the results as reg1
reg1<-lm(price~sqft, data=house_data)
# summarize the results
summary(reg1)
## 
## Call:
## lm(formula = price ~ sqft, data = house_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77683 -30522   2576  30912  73371 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30920.276   9336.991   3.312 0.000961 ***
## sqft           85.973      3.679  23.367  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33940 on 998 degrees of freedom
## Multiple R-squared:  0.3536, Adjusted R-squared:  0.353 
## F-statistic:   546 on 1 and 998 DF,  p-value: < 2.2e-16

The estimated regression equation is \(\hat{price}=30,920+85.97sqft\)

The intercept (a) and slope (b) came from the ``Estimate” column. The “Pr(>|t|)” column has the p-value for the test that the coefficient is equal to zero in the population. If this value is less than 0.05, we reject the null hypothesis and conclude that the coefficient is statistically different from zero in the population with 95% confidence. We can estimate a regression with multiple independent variables by adding them to the right side of the lm() function.

# multiple regression
reg2 <- lm(price ~ sqft + age + pool + fireplace + college , data = house_data)
# displaying the results
summary(reg2)
## 
## Call:
## lm(formula = price ~ sqft + age + pool + fireplace + college, 
##     data = house_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47971 -10411    198  10438  44759 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6911.880   4289.365   1.611 0.107410    
## sqft           83.183      1.672  49.759  < 2e-16 ***
## age          -192.991     51.567  -3.743 0.000193 ***
## pool         4352.570   1205.261   3.611 0.000320 ***
## fireplace    1398.810    976.807   1.432 0.152452    
## college     60196.233    971.531  61.960  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15330 on 994 degrees of freedom
## Multiple R-squared:  0.8686, Adjusted R-squared:  0.8679 
## F-statistic:  1314 on 5 and 994 DF,  p-value: < 2.2e-16

The estimated regression equation is: \(\hat{price}=6911.88+83.18*sqft-192.99*age+4352.57*pool+1398.81*fireplace+60196.23*college\)

Notice the p-value for fireplace is greater than 0.05. This means that, on average, the sales prices for homes with and without fireplaces are not statistically different after we have controlled for the other variables.

Plotting

We can plot prices against square footage and add the estimated regression line. This works best when there is only one continuous independent variable.

ggplot(house_data, aes(x=sqft, y=price)) +
  geom_point() +
  geom_line(aes(y=reg1$fitted.values))

We could estimate a regression with a continuous variable and a dummy variable and plot the results.

reg3<-lm(price~sqft+college, data=house_data)
ggplot(house_data, aes(x=sqft, y=price, 
                       group=college, # groups the variables together 
                       color=factor(college) # assigns a different color to each group
                       )
       ) +
  geom_point() +
  geom_line(aes(y=reg3$fitted.values))