# Please participate in the DSA Client Feedback Survey.
# https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9
#
# The information below is also available in article form at:
# http://bayes.acs.unt.edu:8083/BayesContent/class/Jon/Benchmarks/RefCatLogReg_L_JDS_Nov2018.pdf
#
###### Simple script for setting a reference category and interpreting coefficients. ######

# Changing the 'levels' of a factor to reflect specific values so you can
# force one 'level' to be the reference category in 'lm' or 'glm'.
x1 <- factor(rep(letters[1:4], 3))
x1
summary(x1)
levels(x1)

y <- c(1.1, 2.1, 3.1, 4.1, 1.5, 2.5, 3.5, 4.5, 1.9, 2.9, 3.9, 4.9)
x3 <- y  # copy of 'y'; reused later as a numeric predictor in the logistic models.

summary(lm(y ~ x1))
plot(x1, y)

# How to set a specific 'reference category.' We order the levels so whatever
# level (or category) is first will be the one we want as a reference.
x2 <- factor(x1, levels = c("d", "a", "b", "c"))
x2
summary(x2)
levels(x2)

summary(lm(y ~ x2))
plot(x2, y)

# So, looking at the 'x2' model, directly above; we see that the mean (y-value)
# of "a" is 3.0 units *less than* the mean (y-value) of "d" (which is listed as
# the intercept). Likewise, the mean of "b" is 2.0 units *less than* the mean
# of "d" and the mean of "c" is 1.0 units *less than* the mean of "d" --
# because "d" is the 'reference category' in the linear regression and the
# negative coefficients represent the *less than* in the interpretation.

# The t-test is simply testing if the difference between, say the category 'a'
# coefficient and the reference category, 'd', is different from zero.
# 1.50 - 4.50 = -3.00; is that 'difference' different from zero; yes, p = 0.0000159.

# To 'see' all the coefficients, we can run a 'no-intercept' model.
summary(lm(y ~ 0 + x2))

# To interpret all four coefficients (listed in the 'no-intercept' model)...we
# would say that all cases with a value of "d" on 'x2' would be predicted to
# have a value of 4.5 on 'y'.
# The t-tests are simply testing whether the coefficients are different than
# zero (here, all four are).

########### LOGISTIC REGRESSION ###########
# See: https://stats.stackexchange.com/questions/60817/significance-of-categorical-predictor-in-logistic-regression
# And: https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/

# Build a small reference table that shows how probability, odds, and
# log odds (the logit) relate to one another.
prob <- c(.001, .01, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5,
          .55, .6, .65, .7, .75, .8, .85, .9, .95, .99, .999)
odds <- round(prob / (1 - prob), 4)
logodds <- round(log(odds), 4)
logit.df <- data.frame(prob, odds, logodds)
rm(prob, odds, logodds)
logit.df

# Now, let's look at a binomial GLM.
f <- factor(c("0", "0", "s", "s", "0", "0", "s", "0", "0", "0", "0", "s"),
            levels = c("0", "s"))
f
summary(f)
levels(f)

# First, start with the simplest version, an intercept only model.
summary(glm(f ~ 1, family = "binomial"))
exp(-.6931) / (1 + exp(-.6931))  # probability of "s"; 4/12 = .3333.

# Next, what happens when we introduce a numeric predictor term?
summary(glm(f ~ x3, family = "binomial"))

# The intercept coefficient is the log odds of someone with a "x3" value of
# zero having a "f" value of "s". We can translate log odds into just odds, or
# even probability (of "s"). Both of which are extremely small.
exp(-4.9954)
exp(-4.9954) / (1 + exp(-4.9954))

# A typical logistic regression coefficient (i.e. the coefficient for a numeric
# variable) is the expected amount of change in the logit for each unit change
# in the predictor. The logit is what is being predicted; it is the log odds of
# membership in the NON-reference category of the outcome variable value (here
# a 's', rather than '0'). The closer a logistic coefficient is to zero, the
# less influence it has in predicting the logit. So, for every unit change in
# "x3", we expect the log odds (or logit) to increase by 1.3182.
# So, to put the logistic coefficient in context, consider the equation of
# our model:
# logit = -4.9954 + (1.3182*x3)
# If we give "x3" a value, say 2.0, then we get:
-4.9954 + (1.3182 * 2.0)
# and if we give it 3.0:
-4.9954 + (1.3182 * 3.0)
# The difference between the two equations above is the coefficient:
-1.0408 - -2.359
# Similar to other forms of regression coefficients, the logistic coefficient
# is the amount of change in the outcome (i.e. the logit) for every unit
# change in our predictor ("x3"). So, for every unit of "x3" we would expect a
# greater probability of "s" on the outcome. Keep in mind, the reason for
# using the log odds is to attempt to represent the linear form (i.e. linear
# equation in the logit). If there is a non-linear relationship between a
# predictor and the outcome, then the standard logistic model is
# inappropriate. For example there may be a situation where low values of the
# predictor decrease the log odds of the logit and medium values of the
# predictor increase the log odds and high values of the predictor decrease
# the log odds -- the popular "U-shaped" curve.

# Next we explore the binomial logistic regression with a single categorical
# predictor.
summary(glm(f ~ x1, family = "binomial"))

# The first thing to state is that the model above has the same outcome
# variable ("f") as the previous two models. So, we know we are predicting
# membership in the non-reference category of "f" (which is the log odds,
# odds, or probability of the value "s"). However, the coefficients in this
# model are a little different. The coefficient for the intercept term is
# actually the coefficient of the reference category of the "x1" predictor,
# "a" = -19.57. That number is the log odds of "s" for cases with "a" on the
# "x1" predictor.
# Again, we can translate that value into odds or probability, as was done
# previously, both of which are incredibly small (but not the same value):
exp(-19.57)
exp(-19.57) / (1 + exp(-19.57))

# The other 3 coefficients (for "b", "c", & "d") are not actually coefficients
# but, instead they represent the difference between their respective
# coefficients and the reference category coefficient: "a" = -19.57. Notice
# that the difference (Estimate), standard error, z-value, and p-value is the
# same for both "c" and "d". That may seem strange, but it will be explained
# in the next paragraph.

# The p-values (and Z-test values) are used to determine if these 'difference
# scores' ("Estimate" for "b", "c", & "d") are statistically different than
# zero. Note, the p-value for the reference category ("a") is testing if that
# (true) coefficient is different than zero. Naturally, one would now be
# wondering what the (true) coefficients are for "b", "c", and "d". If we run
# a no-intercept model, we are provided with all the coefficients (and their
# associated tests to determine if they are different from zero):
summary(glm(f ~ 0 + x1, family = "binomial"))

# Most of the output is identical to what was produced with an intercept term;
# only the 3 rows in the "Coefficients" table for "b", "c", and "d" are
# different. It may seem strange that we have two sets of identical
# coefficients; but this can happen and there is a perfectly simple
# explanation for it. You can see why those coefficients are the same (for the
# two pairs of categories) if you review a cross-tabulation of the two
# variables "f" and "x1".
xtabs( ~ f + x1)

# Categories "a" and "b" have the same number of 'zero' and 's' and categories
# "c" and "d" have the same number of 'zero' and 's'. We can also see why "a"
# and "b" have such large negative coefficients (and corresponding tiny odds
# and probabilities); because, they have no values of 's'.
# The p-value(s) for "a" and "b" are extremely misleading in this example.
# First of all, the tiny sample size contributes to large standard error(s).
# To illustrate, replicate the 12-row data set so there are 1001 copies
# (12,012 rows) -- indexing with 'rep' builds the enlarged data frame in one
# step rather than growing it with 'rbind' inside a loop.
df.1 <- data.frame(f, x1)
df.2 <- df.1[rep(seq_len(nrow(df.1)), 1001), ]
rm(df.1)
nrow(df.2)
summary(glm(f ~ 0 + x1, df.2, family = "binomial"))

# Second, even with a large sample the p-value might mislead, because logistic
# regression is all about predicting the logit (log odds, odds, probability of
# the non-reference category of the outcome variable). So, if the probability
# of "s" on the outcome is nearly zero (0.000000003168524; see above
# intercept version of the model)...
exp(-19.57) / (1 + exp(-19.57))
# ...then the probability of "0" (zero) on the outcome variable for a
# particular predictor or category (e.g. "a" & "b") is...
1 - (exp(-19.57) / (1 + exp(-19.57)))

# The coefficients for "c" and "d" are the log odds of "s" for those two
# categories of the predictor. We can translate that coefficient to
# probability using the formula from above...
exp(.69315) / (1 + exp(.69315))
# ...or, because of the simplicity of the data, we can simply look again at
# the cell counts.
xtabs( ~ f + x1)

# With 2 out of 3 values being "s" (for both "c" and "d"), 2/3rds = 0.6666.
# So we might interpret the result as: An observation of "c" has a probability
# of 0.66 of displaying a value of "s" and a probability of 0.33 of displaying
# a value of "0" on the outcome variable; when all other variables are held
# constant.

# Next we introduce one categorical and one numeric predictor.
# The interpretation of predictor coefficients does not change with the
# addition of more than one predictor; which is why the caveat at the end of
# the previous paragraph is critically important.
summary(glm(f ~ x1 + x3, family = "binomial"))

# Retrieving the predicted probabilities (i.e. fitted values).
fit <- glm(f ~ x1 + x3, family = "binomial")
summary(fit)
fit$fitted.values

# Using the likelihood ratio test to compare models and determine whether a
# variable needs to stay in the model (i.e. it is important) or not. This is
# a rudimentary way of checking variable importance.
mod.1 <- glm(f ~ x1 + x3, family = "binomial")
mod.2 <- glm(f ~ x3, family = "binomial")
anova(mod.1, mod.2, test = "LRT")

# The LRT is NON-significant, so we could leave out variable 'x1'; meaning
# there is no significant difference (in model fit) between the two models
# (with and without the "x1" variable).

# Please participate in the DSA Client Feedback Survey.
# https://unt.az1.qualtrics.com/SE/?SID=SV_diLLVU9iuJZN8C9

# End script.