# Load needed packages
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --
## v broom 0.7.9 v recipes 0.1.17
## v dials 0.0.10 v rsample 0.1.0
## v dplyr 1.0.7 v tibble 3.1.5
## v ggplot2 3.3.5 v tidyr 1.1.4
## v infer 1.0.0 v tune 0.1.6
## v modeldata 0.1.1 v workflows 0.2.4
## v parsnip 0.1.7 v workflowsets 0.1.0
## v purrr 0.3.4 v yardstick 0.0.8
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
## * Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v readr 1.4.0 v forcats 0.5.1
## v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x readr::spec() masks yardstick::spec()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:purrr':
##
## set_names
library(rlang)
##
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
##
## set_names
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
library(dotwhisker) # for visualizing regression results
processdata <- readRDS(here::here("data","processed_data","processeddata.rds"))
### Linear Linear Model ###
#First, let's set engine for performing linear regression
linear_fit <-
linear_reg() %>%
set_engine("lm")
lm_1 <-
linear_fit %>%
fit(BodyTemp ~ RunnyNose, data = processdata)
lm_1
## parsnip model object
##
## Fit time: 0ms
##
## Call:
## stats::lm(formula = BodyTemp ~ RunnyNose, data = data)
##
## Coefficients:
## (Intercept) RunnyNoseYes
## 99.1431 -0.2926
#summarize the results
tidy(lm_1)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 99.1 0.0819 1210. 0
## 2 RunnyNoseYes -0.293 0.0971 -3.01 0.00268
lm_2 <- linear_fit %>%
fit(BodyTemp ~ ., data = processdata)
lm_2
## parsnip model object
##
## Fit time: 20ms
##
## Call:
## stats::lm(formula = BodyTemp ~ ., data = data)
##
## Coefficients:
## (Intercept) SwollenLymphNodesYes ChestCongestionYes
## 97.925243 -0.165302 0.087326
## ChillsSweatsYes NasalCongestionYes CoughYNYes
## 0.201266 -0.215771 0.313893
## SneezeYes FatigueYes SubjectiveFeverYes
## -0.361924 0.264762 0.436837
## HeadacheYes WeaknessMild WeaknessModerate
## 0.011453 0.018229 0.098944
## WeaknessSevere WeaknessYNYes CoughIntensityMild
## 0.373435 NA 0.084881
## CoughIntensityModerate CoughIntensitySevere CoughYN2Yes
## -0.061384 -0.037272 NA
## MyalgiaMild MyalgiaModerate MyalgiaSevere
## 0.164242 -0.024064 -0.129263
## MyalgiaYNYes RunnyNoseYes AbPainYes
## NA -0.080485 0.031574
## ChestPainYes DiarrheaYes EyePnYes
## 0.105071 -0.156806 0.131544
## InsomniaYes ItchyEyeYes NauseaYes
## -0.006824 -0.008016 -0.034066
## EarPnYes HearingYes PharyngitisYes
## 0.093790 0.232203 0.317581
## BreathlessYes ToothPnYes VisionYes
## 0.090526 -0.022876 -0.274625
## VomitYes WheezeYes
## 0.165272 -0.046665
#summarize the results
tidy(lm_2)
## # A tibble: 38 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 97.9 0.304 322. 0
## 2 SwollenLymphNodesYes -0.165 0.0920 -1.80 0.0727
## 3 ChestCongestionYes 0.0873 0.0975 0.895 0.371
## 4 ChillsSweatsYes 0.201 0.127 1.58 0.114
## 5 NasalCongestionYes -0.216 0.114 -1.90 0.0584
## 6 CoughYNYes 0.314 0.241 1.30 0.193
## 7 SneezeYes -0.362 0.0983 -3.68 0.000249
## 8 FatigueYes 0.265 0.161 1.65 0.0996
## 9 SubjectiveFeverYes 0.437 0.103 4.22 0.0000271
## 10 HeadacheYes 0.0115 0.125 0.0913 0.927
## # ... with 28 more rows
#generate a dot-and-whisker plot of linear regression results with all predictors
dotwhisker_cont_all_pred <- tidy(lm_2) %>%
dotwhisker::dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
#Compares the model results for the linear regression model with just the main predictor and all predictors.
##Analysis of deviance and model selection
res_lm_aov <-
anova(lm_1$fit, lm_2$fit, test = "Chisq")
res_lm_aov
## Analysis of Variance Table
##
## Model 1: BodyTemp ~ RunnyNose
## Model 2: BodyTemp ~ SwollenLymphNodes + ChestCongestion + ChillsSweats +
## NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever +
## Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 +
## Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea +
## EyePn + Insomnia + ItchyEye + Nausea + EarPn + Hearing +
## Pharyngitis + Breathless + ToothPn + Vision + Vomit + Wheeze
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 728 1030.53
## 2 695 909.12 33 121.41 1.357e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is smaller than 0.05, therefore the model with all predictors had better fit than the single variable model
# Let's set engine for performing linear regression
logistic_fit <-
logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
glm_1 <-
logistic_fit %>%
fit(Nausea ~ RunnyNose, data = processdata)
glm_1
## parsnip model object
##
## Fit time: 0ms
##
## Call: stats::glm(formula = Nausea ~ RunnyNose, family = stats::binomial,
## data = data)
##
## Coefficients:
## (Intercept) RunnyNoseYes
## -0.65781 0.05018
##
## Degrees of Freedom: 729 Total (i.e. Null); 728 Residual
## Null Deviance: 944.7
## Residual Deviance: 944.6 AIC: 948.6
#summarize the results
tidy(glm_1)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.658 0.145 -4.53 0.00000589
## 2 RunnyNoseYes 0.0502 0.172 0.292 0.770
glm_2 <- logistic_fit %>%
fit(Nausea ~ ., data = processdata)
glm_2
## parsnip model object
##
## Fit time: 30ms
##
## Call: stats::glm(formula = Nausea ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) SwollenLymphNodesYes ChestCongestionYes
## 0.222870 -0.251083 0.275554
## ChillsSweatsYes NasalCongestionYes CoughYNYes
## 0.274097 0.425817 -0.140423
## SneezeYes FatigueYes SubjectiveFeverYes
## 0.176724 0.229062 0.277741
## HeadacheYes WeaknessMild WeaknessModerate
## 0.331259 -0.121606 0.310849
## WeaknessSevere WeaknessYNYes CoughIntensityMild
## 0.823187 NA -0.220794
## CoughIntensityModerate CoughIntensitySevere CoughYN2Yes
## -0.362678 -0.950544 NA
## MyalgiaMild MyalgiaModerate MyalgiaSevere
## -0.004146 0.204743 0.120758
## MyalgiaYNYes RunnyNoseYes AbPainYes
## NA 0.045324 0.939304
## ChestPainYes DiarrheaYes EyePnYes
## 0.070777 1.063934 -0.341991
## InsomniaYes ItchyEyeYes EarPnYes
## 0.084175 -0.063364 -0.181719
## HearingYes PharyngitisYes BreathlessYes
## 0.323052 0.275364 0.526801
## ToothPnYes VisionYes VomitYes
## 0.480649 0.125498 2.458466
## WheezeYes BodyTemp
## -0.304435 -0.031246
##
## Degrees of Freedom: 729 Total (i.e. Null); 695 Residual
## Null Deviance: 944.7
## Residual Deviance: 751.5 AIC: 821.5
#summarize the results
tidy(glm_2)
## # A tibble: 38 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.223 7.83 0.0285 0.977
## 2 SwollenLymphNodesYes -0.251 0.196 -1.28 0.200
## 3 ChestCongestionYes 0.276 0.213 1.30 0.195
## 4 ChillsSweatsYes 0.274 0.288 0.952 0.341
## 5 NasalCongestionYes 0.426 0.255 1.67 0.0944
## 6 CoughYNYes -0.140 0.519 -0.271 0.787
## 7 SneezeYes 0.177 0.210 0.840 0.401
## 8 FatigueYes 0.229 0.372 0.616 0.538
## 9 SubjectiveFeverYes 0.278 0.225 1.23 0.218
## 10 HeadacheYes 0.331 0.285 1.16 0.245
## # ... with 28 more rows
#generate a dot-and-whisker plot of logistic regression results with all predictors
dotwhisker_binary_all_pred <- tidy(glm_2) %>%
dotwhisker::dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
#Compares the model results for the logistic regression model with just the main predictor and all predictors.
##Analysis of deviance and model selection
res_glm_aov <-
anova(glm_1$fit, glm_2$fit, test = "Chisq")
res_glm_aov
## Analysis of Deviance Table
##
## Model 1: Nausea ~ RunnyNose
## Model 2: Nausea ~ SwollenLymphNodes + ChestCongestion + ChillsSweats +
## NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever +
## Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 +
## Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea +
## EyePn + Insomnia + ItchyEye + EarPn + Hearing + Pharyngitis +
## Breathless + ToothPn + Vision + Vomit + Wheeze + BodyTemp
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 728 944.57
## 2 695 751.47 33 193.09 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is smaller than 0.05, therefore the model with all predictors had better fit than the single variable model