# # # Please participate in the R&SS Client Feedback Survey. # https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9 # # ############### Basic and Robust Linear Regression ############### # # This script assumes you have worked through all the previous notes from # the web page and you have downloaded, installed, and updated all available # R packages. # Load the following libraries if you have not already; when an additional # library is needed, it will be listed in the script and loaded. library(foreign) library(Rcmdr) # Start by using the 'foreign' library to import 'Example Data 5.sav' (available # from the web page) assigning the name 'example5'. example5 <- read.spss("http://bayes.acs.unt.edu:8083:8083/BayesContent/class/Jon/R_SC/Module3/ExampleData5.sav", use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE) attach(example5) ###### Basic Linear Regression ###### # Gather the variables of interest. subset.1 <- data.frame(apt,prison,age,peyrs,bt) summary(subset.1) # "Omit" the missing values. subset.2 <- data.frame(na.omit(subset.1)) summary(subset.2) detach(example5) rm(subset.1) attach(subset.2) nrow(subset.2) # Standard linear regression (returns un-standardized or raw coefficients). reg1 <- lm(apt~prison+age+peyrs+bt) summary(reg1) # Standardized coefficients (using QuantPsyc library & lm.beta function). library(QuantPsyc) lm.beta(reg1) # Plot. scatterplotMatrix(~apt+prison+age+peyrs+bt, reg.line=lm, smooth=TRUE, span=0.5, diagonal = 'density', data=subset.2) # Influence plot. influencePlot(reg1) # Confidence intervals for the coefficients. confint(reg1, level=.95) # Get the confidence interval for the Adj. R-square; taken from the output of # summary(reg1) above (N=number of rows/cases; p=number of predictors in the # model). # Load the required library. library(MBESS) ci.R2(R2=.1286, conf.level=.95, N=726, p=4) ### Some diagnostic plots. oldpar <- par(oma=c(0,0,3,0), mfrow=c(2,2)) plot(reg1) par(oldpar) #### Mahalanobis distance. ## Robust Mahalanobis distance. # Using library(chemometrics) and Moutlier function; note: there are several dependent packages. library(chemometrics) # Get the robust Mahalanobis # distances for each case of our current data; the plot contains both robust # and classical Mahalanobis distances. r.md <- Moutlier(subset.2,quantile=0.975,plot=TRUE)$rd summary(r.md) # Q to Q plot of the robust Mahalanobis distance (used to look for outliers). par(mfrow=c(1,1)) qqPlot(r.md, dist= "norm", line="robust") legend("topleft", "Robust Mahalanobis Distances", pch=1, title="Quantile-Quantile Plot", col="red") # If desired; you can get the classic Mahalanobis distances for each case # of our current data. c.md <- Moutlier(subset.2,quantile=0.975,plot=FALSE)$md summary(c.md) par(mfrow=c(1,1)) qqPlot(c.md, dist= "norm", line="robust") legend("topleft", "Classic Mahalanobis Distances", pch=1, title="Quantile-Quantile Plot", col="red") par(oldpar) ## Comparison of Classic & Robust Mahalanobis Distances. scatterplot(r.md, c.md, smooth=TRUE, ellipse=TRUE, span =1, reg.line=lm, boxplots='xy') # Cases with extreme Mahalanobis distances (upper right) are likely to be # true multivariate outliers. ###### Robust regression using 'Robust' library/package. ###### library(robust) # Robust Regression. rob.reg <- lmRob(apt~prison+age+peyrs+bt, subset.2) summary(rob.reg) # The 'Test for Bias' output is a diagnostic for the fitting procedure and # is not direct summary output for the fitted model. # Get the confidence interval for the R-square. ci.R2(R2=.12996, conf.level=.95, N=726, p=4) # In the console, you will have to make number selections using the number # keys and then hit enter to get each graph; # THEN, you will have to hit the zero key or the 'escape' key to exit the # graphs dialogue. plot(rob.reg) ###### Relative Importance of Predictors to the Outcome in a model. ###### # Due to issues of multicolinearity (the so-called 'bouncing beta' problem); it # is often desirable to use the relative importance functions to gain a better # understanding of how each predictor contributes to the outcome variable in a # regression model (uses the 'relaimpo' library). library(relaimpo) # Calculate the relative importance of the 4 predictors; larger 'lmg' is # better (meaning more important). calc.relimp(reg1) # Conduct the bootstrapped relative importance function (b=number of bootstrapped # resamples). boot.reg1 <- boot.relimp(subset.2, b = 200, type = c("lmg"), rank = TRUE, diff = TRUE, rela = TRUE) booteval.relimp(boot.reg1) plot(booteval.relimp(boot.reg1)) # Conduct the bootstrapped relative importance function; this time using # ALL possible relative importance indices (lmg, last, first, pratt). boot.reg1.2<-boot.relimp(subset.2,b = 200,type = c("lmg","last","first", "pratt"), rank = TRUE, diff = TRUE, rela = TRUE) booteval.relimp(boot.reg1.2) plot(booteval.relimp(boot.reg1.2)) ###### Best Subsets Regression ###### # Another alternative (also based on relative importance) would be the Best # Possible Subsets function (leaps library), which calculates the best # predictor(s) for a given model with 1, 2, or 3 predictors (out of the 4). # Best possible subsets; with NO MORE than 3 predictors (of the 4) retained; # the asterisk in the output denotes which variable qualifies as best in a # given model with 1 predictor, 2 predictors, or 3 predictors. library(leaps) reg2 <- regsubsets(apt~prison+age+peyrs+bt, subset.2, intercept=T, nvmax=3) summary(reg2) # This time; isolating the best 2 predictors out of the 4 available. reg3 <- regsubsets(apt~prison+age+peyrs+bt, subset.2, intercept=T, nvmax=2) summary(reg3) ###### Direct comparison of two fitted models. ###### # Comparison of two models in terms of R-square change; standard regression. # Sometimes called sequential or hierarchical regression; often used to # evaluate co-variates or different predictors (not to be confused with # step-wise regression). RegMod.1 <- lm(apt~prison+age+peyrs) summary(RegMod.1) RegMod.2 <- lm(apt~prison+age+peyrs+bt) summary(RegMod.2) anova(RegMod.1,RegMod.2) # The anova is testing for R-square change from model 1 to model 2 (R-square # change different from zero?). Here; the F-value is relatively small (and the # p-value is larger than .05) so we could safely say that R-square change is # not significantly different from zero. # However; we can calculate more useful (and less biased) statistics for # comparing fitted models. In both cases (AIC & BIC) smaller values indicate # better fit. # The Akaike Information Criterion (AIC). AIC(RegMod.1) AIC(RegMod.2) # The Bayes Information Criterion (BIC) AIC(RegMod.1, k = log(nrow(subset.2))) AIC(RegMod.2, k = log(nrow(subset.2))) ## See also: http://www.unt.edu/benchmarks/archives/2008/june08/rss.htm # # Please participate in the R&SS Client Feedback Survey. # https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9 # # End: December 2010.