load required packages

# Load needed packages
library(tidymodels)

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --

## v broom        0.7.9      v recipes      0.1.17
## v dials        0.0.10     v rsample      0.1.0 
## v dplyr        1.0.7      v tibble       3.1.5 
## v ggplot2      3.3.5      v tidyr        1.1.4 
## v infer        1.0.0      v tune         0.1.6 
## v modeldata    0.1.1      v workflows    0.2.4 
## v parsnip      0.1.7      v workflowsets 0.1.0 
## v purrr        0.3.4      v yardstick    0.0.8

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Use suppressPackageStartupMessages() to eliminate package startup messages

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v readr   1.4.0     v forcats 0.5.1
## v stringr 1.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:purrr':
## 
##     set_names

library(rlang)

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:magrittr':
## 
##     set_names

## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice

library(dotwhisker)  # for visualizing regression results

Load data

processdata <- readRDS(here::here("data","processed_data","processeddata.rds"))

Model fitting

### Linear Linear Model ###

#First, let's set engine for performing linear regression
linear_fit <- 
    linear_reg() %>% 
    set_engine("lm")

Linear Model 1: regression body temperature on runny nose

lm_1 <- 
    linear_fit %>% 
    fit(BodyTemp ~ RunnyNose, data = processdata)
lm_1

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## stats::lm(formula = BodyTemp ~ RunnyNose, data = data)
## 
## Coefficients:
##  (Intercept)  RunnyNoseYes  
##      99.1431       -0.2926

#summarize the results
tidy(lm_1)

## # A tibble: 2 x 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    99.1      0.0819   1210.   0      
## 2 RunnyNoseYes   -0.293    0.0971     -3.01 0.00268

Linear Model 2: regression body temperature on all predictors

lm_2 <- linear_fit %>% 
    fit(BodyTemp ~ ., data = processdata)
lm_2

## parsnip model object
## 
## Fit time:  20ms 
## 
## Call:
## stats::lm(formula = BodyTemp ~ ., data = data)
## 
## Coefficients:
##            (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
##              97.925243               -0.165302                0.087326  
##        ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
##               0.201266               -0.215771                0.313893  
##              SneezeYes              FatigueYes      SubjectiveFeverYes  
##              -0.361924                0.264762                0.436837  
##            HeadacheYes            WeaknessMild        WeaknessModerate  
##               0.011453                0.018229                0.098944  
##         WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
##               0.373435                      NA                0.084881  
## CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
##              -0.061384               -0.037272                      NA  
##            MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
##               0.164242               -0.024064               -0.129263  
##           MyalgiaYNYes            RunnyNoseYes               AbPainYes  
##                     NA               -0.080485                0.031574  
##           ChestPainYes             DiarrheaYes                EyePnYes  
##               0.105071               -0.156806                0.131544  
##            InsomniaYes             ItchyEyeYes               NauseaYes  
##              -0.006824               -0.008016               -0.034066  
##               EarPnYes              HearingYes          PharyngitisYes  
##               0.093790                0.232203                0.317581  
##          BreathlessYes              ToothPnYes               VisionYes  
##               0.090526               -0.022876               -0.274625  
##               VomitYes               WheezeYes  
##               0.165272               -0.046665

#summarize the results
tidy(lm_2)

## # A tibble: 38 x 5
##    term                 estimate std.error statistic   p.value
##    <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)           97.9       0.304   322.     0        
##  2 SwollenLymphNodesYes  -0.165     0.0920   -1.80   0.0727   
##  3 ChestCongestionYes     0.0873    0.0975    0.895  0.371    
##  4 ChillsSweatsYes        0.201     0.127     1.58   0.114    
##  5 NasalCongestionYes    -0.216     0.114    -1.90   0.0584   
##  6 CoughYNYes             0.314     0.241     1.30   0.193    
##  7 SneezeYes             -0.362     0.0983   -3.68   0.000249 
##  8 FatigueYes             0.265     0.161     1.65   0.0996   
##  9 SubjectiveFeverYes     0.437     0.103     4.22   0.0000271
## 10 HeadacheYes            0.0115    0.125     0.0913 0.927    
## # ... with 28 more rows

plotting

#generate a dot-and-whisker plot of linear regression results with all predictors
dotwhisker_cont_all_pred <- tidy(lm_2) %>% 
    dotwhisker::dwplot(dot_args = list(size = 2, color = "black"),
                                         whisker_args = list(color = "black"),
                                         vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))

#Compares the model results for the linear regression model with just the main predictor and all predictors.
##Analysis of deviance and model selection
res_lm_aov <-
    anova(lm_1$fit, lm_2$fit, test = "Chisq")
res_lm_aov

## Analysis of Variance Table
## 
## Model 1: BodyTemp ~ RunnyNose
## Model 2: BodyTemp ~ SwollenLymphNodes + ChestCongestion + ChillsSweats + 
##     NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever + 
##     Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 + 
##     Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea + 
##     EyePn + Insomnia + ItchyEye + Nausea + EarPn + Hearing + 
##     Pharyngitis + Breathless + ToothPn + Vision + Vomit + Wheeze
##   Res.Df     RSS Df Sum of Sq  Pr(>Chi)    
## 1    728 1030.53                           
## 2    695  909.12 33    121.41 1.357e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is smaller than 0.05, therefore the model with all predictors had better fit than the single variable model

Logistic regression Model

# Let's set engine for performing linear regression
logistic_fit <- 
    logistic_reg() %>% 
    set_mode("classification") %>% 
    set_engine("glm")

Logistic Model 1: regression body temperature on runny nose

glm_1 <- 
    logistic_fit %>% 
    fit(Nausea ~ RunnyNose, data = processdata)
glm_1

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:  stats::glm(formula = Nausea ~ RunnyNose, family = stats::binomial, 
##     data = data)
## 
## Coefficients:
##  (Intercept)  RunnyNoseYes  
##     -0.65781       0.05018  
## 
## Degrees of Freedom: 729 Total (i.e. Null);  728 Residual
## Null Deviance:       944.7 
## Residual Deviance: 944.6     AIC: 948.6

#summarize the results
tidy(glm_1)

## # A tibble: 2 x 5
##   term         estimate std.error statistic    p.value
##   <chr>           <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)   -0.658      0.145    -4.53  0.00000589
## 2 RunnyNoseYes   0.0502     0.172     0.292 0.770

Logistic Model 2: regression body temperature on all predictors

glm_2 <- logistic_fit %>% 
    fit(Nausea ~ ., data = processdata)
glm_2

## parsnip model object
## 
## Fit time:  30ms 
## 
## Call:  stats::glm(formula = Nausea ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##            (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
##               0.222870               -0.251083                0.275554  
##        ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
##               0.274097                0.425817               -0.140423  
##              SneezeYes              FatigueYes      SubjectiveFeverYes  
##               0.176724                0.229062                0.277741  
##            HeadacheYes            WeaknessMild        WeaknessModerate  
##               0.331259               -0.121606                0.310849  
##         WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
##               0.823187                      NA               -0.220794  
## CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
##              -0.362678               -0.950544                      NA  
##            MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
##              -0.004146                0.204743                0.120758  
##           MyalgiaYNYes            RunnyNoseYes               AbPainYes  
##                     NA                0.045324                0.939304  
##           ChestPainYes             DiarrheaYes                EyePnYes  
##               0.070777                1.063934               -0.341991  
##            InsomniaYes             ItchyEyeYes                EarPnYes  
##               0.084175               -0.063364               -0.181719  
##             HearingYes          PharyngitisYes           BreathlessYes  
##               0.323052                0.275364                0.526801  
##             ToothPnYes               VisionYes                VomitYes  
##               0.480649                0.125498                2.458466  
##              WheezeYes                BodyTemp  
##              -0.304435               -0.031246  
## 
## Degrees of Freedom: 729 Total (i.e. Null);  695 Residual
## Null Deviance:       944.7 
## Residual Deviance: 751.5     AIC: 821.5

#summarize the results
tidy(glm_2)

## # A tibble: 38 x 5
##    term                 estimate std.error statistic p.value
##    <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)             0.223     7.83     0.0285  0.977 
##  2 SwollenLymphNodesYes   -0.251     0.196   -1.28    0.200 
##  3 ChestCongestionYes      0.276     0.213    1.30    0.195 
##  4 ChillsSweatsYes         0.274     0.288    0.952   0.341 
##  5 NasalCongestionYes      0.426     0.255    1.67    0.0944
##  6 CoughYNYes             -0.140     0.519   -0.271   0.787 
##  7 SneezeYes               0.177     0.210    0.840   0.401 
##  8 FatigueYes              0.229     0.372    0.616   0.538 
##  9 SubjectiveFeverYes      0.278     0.225    1.23    0.218 
## 10 HeadacheYes             0.331     0.285    1.16    0.245 
## # ... with 28 more rows

Plotting

#generate a dot-and-whisker plot of logistic regression results with all predictors
dotwhisker_binary_all_pred <- tidy(glm_2) %>% 
    dotwhisker::dwplot(dot_args = list(size = 2, color = "black"),
                                         whisker_args = list(color = "black"),
                                         vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))

#Compares the model results for the logistic regression model with just the main predictor and all predictors.
##Analysis of deviance and model selection
res_glm_aov <-
    anova(glm_1$fit, glm_2$fit, test = "Chisq")
res_glm_aov

## Analysis of Deviance Table
## 
## Model 1: Nausea ~ RunnyNose
## Model 2: Nausea ~ SwollenLymphNodes + ChestCongestion + ChillsSweats + 
##     NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever + 
##     Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 + 
##     Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea + 
##     EyePn + Insomnia + ItchyEye + EarPn + Hearing + Pharyngitis + 
##     Breathless + ToothPn + Vision + Vomit + Wheeze + BodyTemp
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       728     944.57                          
## 2       695     751.47 33   193.09 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is smaller than 0.05, therefore the model with all predictors had better fit than the single variable model

Module 8 exercise: Fitting Basic Statistical Models

Tzu-Chun Chu

10/12/2021

load required packages

Load data

Model fitting

Linear Model 1: regression body temperature on runny nose

Linear Model 2: regression body temperature on all predictors

plotting

Logistic regression Model

Logistic Model 1: regression body temperature on runny nose

Logistic Model 2: regression body temperature on all predictors

Plotting