4. 1 Main reported model.
Presence in Social care survey as outcome. Independent variables and interactions as commented in the code chunk below. R automatically fits each individual variable seperately before applying interactions. Both years of data with those living in a care home removed.
4.1.1. Fit model
Model data. Logistic regression with Average Marginal Effects (AME) calculated. To do this over both years of data the dataframe is “nested” and then the model function is applied over the column containing data for each year. New columns are created with model coefficients and AMEs in a tidy fashion so they can be easily tabulated and plotted.
df %<>%
#keep only required variables for the model
select(index, year, scs_flag, sex, age_grp, simd, total_ch)
df$simd <- fct_rev(df$simd) #Reverse order of SIMD so Decile 10 is reference
start <- Sys.time() #Set timer start
#Create a function for the model
sc_model <- function(my_df){
glm(scs_flag ~ #The formula for the model
sex*age_grp + #Interaction: Sex and age group
simd*total_ch + #Interaction: SIMD and total BNF chapters
age_grp*total_ch, #Interaction: Age group and BNF chapters
family = binomial(), #Logistic regression
data = my_df) #identify which data to use
}
sc_mm_mod_1 <-
df %>% #Main dataframe
group_by(year) %>%
#Nest data for each year
nest() %>%
#Create model column using above function
mutate(mod_var = map(data, sc_model),
#Tidy coefficients using broom::tidy into a column
tidy_var = map(mod_var, tidy),
#compute AMEs and add as a column
marginals = map2(mod_var, data, ~margins_summary(.x, data = .y)))
print(Sys.time()-start)
The above code chunk took 12hrs to run - mostly in calculating variances for marginal effects.
4.1.2 Main analysis - Tabulate Odds Ratios
This next chunk unnests the tidied model coefficients from both years of data, calculates odds-ratios (and CIs) for all coefficients, then tidies up labels for the resulting table.
sc_mm_mod_1 %>%
select(year, tidy_var) %>%
unnest(cols = tidy_var, names_repair = tidyr_legacy) %>%
#exponentiate the coefficents for OR
mutate(odds_ratio = exp(estimate),
#Calculate Lower CI and exponentiate
conf_int_low = exp(estimate - 1.96*std.error),
#Calculate Upper CI and exponentiate
conf_int_high = exp(estimate + 1.96*std.error)) %>%
#round to three decimal places
round_df(., dig = 3) %>%
#Tidy up p-values so they don't show as "0"
mutate(p.value = as.character(p.value),
p.value = if_else(p.value == "0", "<0.001", p.value)) -> tidy_tab
#Tidy up labels
prefixes <- c("sex", "scs_flag")
tidy_tab$term <- prefix_strip(tidy_tab$term, prefixes)
tidy_tab$term <- prefix_replace(tidy_tab$term, "Total_ch",
"BNF chpts: ")
tidy_tab$term <- prefix_replace(tidy_tab$term,
"Age_grp", "Age: ")
tidy_tab$term <- prefix_replace(tidy_tab$term,
"Simd", "SIMD Decile: ",
toTitle = FALSE)
tidy_tab %<>%
mutate(year = case_when(!!!year_char))
my_datatable(tidy_tab)
4.1.3. Main analysis - Tabulate goodness-of-fit
Here (utilising broom::glance
) an additional column is added to the main model object with fit statistics for both models.
sc_mm_mod_1 %<>%
mutate(glance_var = map(mod_var, glance))
Now manually calcluate McFadden’s pseudo \(R^{2}\) which has the formula:
\[R^{2}_{McFadden} = 1 - \frac {ln(LM_{1})}{ln(LM_{0})} \]
Where \(ln(LM_{1})\) is the log likelihood value for the fitted model and \(ln(LM_{0})\) is the log likelihood for the null model with only an intercept as a predictor.
Calculate the pseudo \(R^2\), then rearrange variable order- Print this out.
sc_mm_mod_1 %>%
ungroup %>%
select(year, glance_var) %>%
unnest(cols = glance_var, names_repair = tidyr_legacy) %>%
mutate(pseud_r2 = 1 - (logLik / (null.deviance / -2)), #Calculate McFadden's
year = case_when(!!!year_char),
null_LogLik = null.deviance/-2) %>%
select(year, pseud_r2, deviance, logLik, df.residual,
null.deviance, null_LogLik, df.null, AIC, BIC) %>%
#round to three sig figs
round_df(dig = 3) -> gof_tab
my_datatable(gof_tab)
Anything with pseudo \(R^2\) 0.2-0.4 is considered a very good fit
4.1.4 Main analysis - Tabulate and Plot AMEs
Now look at AMEs. Unnest the values from the model object and then print out the table
sc_mm_mod_1 %>%
select(year, marginals) %>%
unnest(marginals, names_repair = tidyr_legacy) %>%
#round to 3 sig fig
round_df(dig = 3) %>%
arrange(year) -> sc_mm_marg_1
#Tidy up labels
prefixes <- c("sex")
sc_mm_marg_1$factor <- prefix_strip(sc_mm_marg_1$factor, prefixes)
sc_mm_marg_1$factor <- prefix_replace(sc_mm_marg_1$factor, "Total_ch",
"BNF chpts: ")
sc_mm_marg_1$factor <- prefix_replace(sc_mm_marg_1$factor,
"Age_grp", "Age: ")
sc_mm_marg_1$factor <- prefix_replace(sc_mm_marg_1$factor,
"Simd", "SIMD Decile: ",
toTitle = FALSE)
sc_mm_marg_1 %<>%
mutate(year = case_when(!!!year_char),
p = as.character(p),
p = if_else(p == "0", "<0.001", p))
my_datatable(sc_mm_marg_1)
Rather than comparing lots of figures over two models it is always easier to visualise these values….
sc_mm_marg_1 %>%
ggplot(aes(factor(factor), AME,
ymin = lower, ymax = upper,
colour = factor(year))) +
geom_hline(yintercept = 0, colour = "#555555") +
geom_pointrange(fatten = 6, position = position_jitter(width = 0.3)) +
coord_flip() +
scale_colour_ptol(labels = year_labels) +
scale_y_continuous(limits = c(-0.01, 0.55),
breaks = scales::pretty_breaks(n = 8),
labels = scales::percent_format(accuracy = 1)) +
theme(legend.position = "top",
plot.title = element_text(hjust = 0.5)) +
labs(x = "",
y= "Average Marginal Effect",
colour = "",
title = "Influence of variables on receiving social care",
caption = "Ref group SIMD: Decile 10\nRef group BNF Chpts: 0\nRef group Age: 65-69") -> sc_mm_bnf_AME
sc_mm_bnf_AME