# Please participate in the DSA Client Feedback Survey.
# https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9
#
# The information below is also available in article form at:
# http://bayes.acs.unt.edu:8083/BayesContent/class/Jon/Benchmarks/RefCatLogReg_L_JDS_Nov2018.pdf
#
###### Simple script for setting a reference category and interpreting coefficients. ######

# Changing the 'levels' of a factor to reflect specific values so you can
# force one 'level' to be the reference category in 'lm' or 'glm'.
x1 <- factor(rep(letters[1:4], 3))
x1
summary(x1)
levels(x1)

y <- c(1.1, 2.1, 3.1, 4.1, 1.5, 2.5, 3.5, 4.5, 1.9, 2.9, 3.9, 4.9)
x3 <- y  # copy of 'y'; reused later as a numeric predictor in the logistic models.

summary(lm(y ~ x1))
plot(x1, y)

# How to set a specific 'reference category.' We order the levels so whatever
# level (or category) is first will be the one we want as a reference.
x2 <- factor(x1, levels = c("d", "a", "b", "c"))
x2
summary(x2)
levels(x2)

summary(lm(y ~ x2))
plot(x2, y)

# So, looking at the 'x2' model, directly above; we see that the mean (y-value)
# of "a" is 3.0 units *less than* the mean (y-value) of "d" (which is listed as
# the intercept). Likewise, the mean of "b" is 2.0 units *less than* the mean
# of "d" and the mean of "c" is 1.0 units *less than* the mean of "d" --
# because "d" is the 'reference category' in the linear regression and the
# negative coefficients represent the *less than* in the interpretation.

# The t-test is simply testing if the difference between, say the category 'a'
# coefficient and the reference category, 'd', is different from zero.
# 1.50 - 4.50 = -3.00; is that 'difference' different from zero; yes, p = 0.0000159.

# To 'see' all the coefficients, we can run a 'no-intercept' model.
summary(lm(y ~ 0 + x2))

# To interpret all four coefficients (listed in the 'no-intercept' model)...we
# would say that all cases with a value of "d" on 'x2' would be predicted to
# have a value of 4.5 on 'y'.
# The t-tests are simply testing whether the coefficients are different than
# zero (here, all four are).

########### LOGISTIC REGRESSION ###########
# See: https://stats.stackexchange.com/questions/60817/significance-of-categorical-predictor-in-logistic-regression
# And: https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/

# Build a small reference table that shows how probability, odds, and
# log odds (the logit) relate to one another.
prob <- c(.001, .01, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5,
          .55, .6, .65, .7, .75, .8, .85, .9, .95, .99, .999)
odds <- round(prob / (1 - prob), 4)
logodds <- round(log(odds), 4)
logit.df <- data.frame(prob, odds, logodds)
rm(prob, odds, logodds)
logit.df

# Now, let's look at a binomial GLM.
f <- factor(c("0", "0", "s", "s", "0", "0", "s", "0", "0", "0", "0", "s"),
            levels = c("0", "s"))
f
summary(f)
levels(f)

# First, start with the simplest version, an intercept only model.
summary(glm(f ~ 1, family = "binomial"))
exp(-.6931) / (1 + exp(-.6931))  # probability of "s"; 4/12 = .3333.

# Next, what happens when we introduce a numeric predictor term?
summary(glm(f ~ x3, family = "binomial"))

# The intercept coefficient is the log odds of someone with a "x3" value of
# zero having a "f" value of "s". We can translate log odds into just odds, or
# even probability (of "s"). Both of which are extremely small.
exp(-4.9954)
exp(-4.9954) / (1 + exp(-4.9954))

# A typical logistic regression coefficient (i.e. the coefficient for a numeric
# variable) is the expected amount of change in the logit for each unit change
# in the predictor. The logit is what is being predicted; it is the log odds of
# membership in the NON-reference category of the outcome variable value (here
# a 's', rather than '0'). The closer a logistic coefficient is to zero, the
# less influence it has in predicting the logit. So, for every unit change in
# "x3", we expect the log odds (or logit) to increase by 1.3182.
# So, to put the logistic coefficient in context, consider the equation of
# our model:
# logit = -4.9954 + (1.3182*x3)
# If we give "x3" a value, say 2.0, then we get:
-4.9954 + (1.3182 * 2.0)
# and if we give it 3.0:
-4.9954 + (1.3182 * 3.0)
# The difference between the two equations above is the coefficient:
-1.0408 - -2.359
# Similar to other forms of regression coefficients, the logistic coefficient
# is the amount of change in the outcome (i.e. the logit) for every unit
# change in our predictor ("x3"). So, for every unit of "x3" we would expect a
# greater probability of "s" on the outcome. Keep in mind, the reason for
# using the log odds is to attempt to represent the linear form (i.e. linear
# equation in the logit). If there is a non-linear relationship between a
# predictor and the outcome, then the standard logistic model is
# inappropriate. For example there may be a situation where low values of the
# predictor decrease the log odds of the logit and medium values of the
# predictor increase the log odds and high values of the predictor decrease
# the log odds -- the popular "U-shaped" curve.

# Next we explore the binomial logistic regression with a single categorical
# predictor.
summary(glm(f ~ x1, family = "binomial"))

# The first thing to state is that the model above has the same outcome
# variable ("f") as the previous two models. So, we know we are predicting
# membership in the non-reference category of "f" (which is the log odds,
# odds, or probability of the value "s"). However, the coefficients in this
# model are a little different. The coefficient for the intercept term is
# actually the coefficient of the reference category of the "x1" predictor,
# "a" = -19.57. That number is the log odds of "s" for cases with "a" on the
# "x1" predictor.
# Again, we can translate that value into odds or probability, as was done
# previously, both of which are incredibly small (but not the same value):
exp(-19.57)
exp(-19.57) / (1 + exp(-19.57))

# The other 3 coefficients (for "b", "c", & "d") are not actually coefficients
# but, instead they represent the difference between their respective
# coefficients and the reference category coefficient: "a" = -19.57. Notice
# that the difference (Estimate), standard error, z-value, and p-value is the
# same for both "c" and "d". That may seem strange, but it will be explained
# in the next paragraph.

# The p-values (and Z-test values) are used to determine if these 'difference
# scores' ("Estimate" for "b", "c", & "d") are statistically different than
# zero. Note, the p-value for the reference category ("a") is testing if that
# (true) coefficient is different than zero. Naturally, one would now be
# wondering what the (true) coefficients are for "b", "c", and "d". If we run
# a no-intercept model, we are provided with all the coefficients (and their
# associated tests to determine if they are different from zero):
summary(glm(f ~ 0 + x1, family = "binomial"))

# Most of the output is identical to what was produced with an intercept term;
# only the 3 rows in the "Coefficients" table for "b", "c", and "d" are
# different. It may seem strange that we have two sets of identical
# coefficients; but this can happen and there is a perfectly simple
# explanation for it. You can see why those coefficients are the same (for the
# two pairs of categories) if you review a cross-tabulation of the two
# variables "f" and "x1".
xtabs( ~ f + x1)

# Categories "a" and "b" have the same number of 'zero' and 's' and categories
# "c" and "d" have the same number of 'zero' and 's'. We can also see why "a"
# and "b" have such large negative coefficients (and corresponding tiny odds
# and probabilities); because, they have no values of 's'.
# The p-value(s) for "a" and "b" are extremely misleading in this example.
# First of all, the tiny sample size contributes to large standard error(s).
# To illustrate, replicate the 12-row data set so there are 1001 copies
# (12,012 rows) -- indexing with 'rep' builds the enlarged data frame in one
# step rather than growing it with 'rbind' inside a loop.
df.1 <- data.frame(f, x1)
df.2 <- df.1[rep(seq_len(nrow(df.1)), 1001), ]
rm(df.1)
nrow(df.2)
summary(glm(f ~ 0 + x1, df.2, family = "binomial"))

# Second, even with a large sample the p-value might mislead, because logistic
# regression is all about predicting the logit (log odds, odds, probability of
# the non-reference category of the outcome variable). So, if the probability
# of "s" on the outcome is nearly zero (0.000000003168524; see above
# intercept version of the model)...
exp(-19.57) / (1 + exp(-19.57))
# ...then the probability of "0" (zero) on the outcome variable for a
# particular predictor or category (e.g. "a" & "b") is...
1 - (exp(-19.57) / (1 + exp(-19.57)))

# The coefficients for "c" and "d" are the log odds of "s" for those two
# categories of the predictor. We can translate that coefficient to
# probability using the formula from above...
exp(.69315) / (1 + exp(.69315))
# ...or, because of the simplicity of the data, we can simply look again at
# the cell counts.
xtabs( ~ f + x1)

# With 2 out of 3 values being "s" (for both "c" and "d"), 2/3rds = 0.6666.
# So we might interpret the result as: An observation of "c" has a probability
# of 0.66 of displaying a value of "s" and a probability of 0.33 of displaying
# a value of "0" on the outcome variable; when all other variables are held
# constant.

# Next we introduce one categorical and one numeric predictor.
# The interpretation of predictor coefficients does not change with the
# addition of more than one predictor; which is why the caveat at the end of
# the previous paragraph is critically important.
summary(glm(f ~ x1 + x3, family = "binomial"))

# Retrieving the predicted probabilities (i.e. fitted values).
fit <- glm(f ~ x1 + x3, family = "binomial")
summary(fit)
fit$fitted.values

# Using the likelihood ratio test to compare models and determine whether a
# variable needs to stay in the model (i.e. it is important) or not. This is
# a rudimentary way of checking variable importance.
mod.1 <- glm(f ~ x1 + x3, family = "binomial")
mod.2 <- glm(f ~ x3, family = "binomial")
anova(mod.1, mod.2, test = "LRT")

# The LRT is NON-significant, so we could leave out variable 'x1'; meaning
# there is no significant difference (in model fit) between the two models
# (with and without the "x1" variable).

# Please participate in the DSA Client Feedback Survey.
# https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9

# End script.