load required packages

library(here) # set path to load data

## here() starts at /Users/savannahhammerton/Local Desktop/GitHub/MADA/Tzu-Chun-MADA-analysis3

library(tidymodels) # for the recipes package and other functions in tidymodels

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──

## ✓ broom        0.7.9      ✓ recipes      0.1.17
## ✓ dials        0.0.10     ✓ rsample      0.1.0 
## ✓ dplyr        1.0.7      ✓ tibble       3.1.5 
## ✓ ggplot2      3.3.5      ✓ tidyr        1.1.4 
## ✓ infer        1.0.0      ✓ tune         0.1.6 
## ✓ modeldata    0.1.1      ✓ workflows    0.2.4 
## ✓ parsnip      0.1.7      ✓ workflowsets 0.1.0 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(tidyverse) # data wrangling

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ✓ stringr 1.4.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()

library(parsnip) # for model fitting
library(rsample) # for splitting data
library(yardstick) # for creating roc curve

Load data

clean.data <- readRDS(here::here("data","processed_data","processeddata.rds"))

Data splitting:

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 
set.seed(1234)

# Put 3/4 of the data into the training set 
data_split <- initial_split(clean.data, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

Workflow creation and model fitting (categorical outcome: Nausea)

# Initiate new recipe
SymptAct_rec <- 
  recipe(Nausea ~ ., data = train_data) 

# Set a engine
glm_mod <- 
  logistic_reg() %>% 
  set_engine("glm")

# Pair a model and recipe together using model workflow
SymptAct_wflow <- 
  workflow() %>% 
  add_model(glm_mod) %>% 
  add_recipe(SymptAct_rec)

SymptAct_wflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

# Prepare recipe and train the model 
SymptAct_fit <- 
  SymptAct_wflow %>% 
  fit(data = train_data)

SymptAct_fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##            (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
##               -4.00932                -0.40039                 0.18669  
##        ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
##                0.34332                 0.38350                -0.25584  
##              SneezeYes              FatigueYes      SubjectiveFeverYes  
##                0.03839                 0.14822                 0.18493  
##            HeadacheYes            WeaknessMild        WeaknessModerate  
##                0.57108                -0.46993                 0.06382  
##         WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
##                0.50553                      NA                -0.09652  
## CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
##               -0.16693                -0.98402                      NA  
##            MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
##                0.16979                 0.19642                 0.25894  
##           MyalgiaYNYes            RunnyNoseYes               AbPainYes  
##                     NA                 0.15012                 1.02638  
##           ChestPainYes             DiarrheaYes                EyePnYes  
##                0.16010                 0.88960                -0.30056  
##            InsomniaYes             ItchyEyeYes                EarPnYes  
##                0.14692                 0.11801                -0.17251  
##             HearingYes          PharyngitisYes           BreathlessYes  
##                0.32065                 0.26024                 0.51853  
##             ToothPnYes               VisionYes                VomitYes  
##                0.47012                 0.13946                 2.51947  
##              WheezeYes                BodyTemp  
##               -0.30524                 0.01306  
## 
## Degrees of Freedom: 546 Total (i.e. Null);  512 Residual
## Null Deviance:       712.6 
## Residual Deviance: 561.5     AIC: 631.5

# Extract model objects from the workflow and get a tidy tibble of model coefficients
SymptAct_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()

## # A tibble: 38 × 5
##    term                 estimate std.error statistic p.value
##    <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept)           -4.01       8.97     -0.447  0.655 
##  2 SwollenLymphNodesYes  -0.400      0.229    -1.75   0.0802
##  3 ChestCongestionYes     0.187      0.247     0.755  0.450 
##  4 ChillsSweatsYes        0.343      0.328     1.05   0.296 
##  5 NasalCongestionYes     0.384      0.290     1.32   0.185 
##  6 CoughYNYes            -0.256      0.590    -0.434  0.664 
##  7 SneezeYes              0.0384     0.246     0.156  0.876 
##  8 FatigueYes             0.148      0.448     0.331  0.741 
##  9 SubjectiveFeverYes     0.185      0.257     0.719  0.472 
## 10 HeadacheYes            0.571      0.351     1.63   0.104 
## # … with 28 more rows

SymptAct_fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##            (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
##               -4.00932                -0.40039                 0.18669  
##        ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
##                0.34332                 0.38350                -0.25584  
##              SneezeYes              FatigueYes      SubjectiveFeverYes  
##                0.03839                 0.14822                 0.18493  
##            HeadacheYes            WeaknessMild        WeaknessModerate  
##                0.57108                -0.46993                 0.06382  
##         WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
##                0.50553                      NA                -0.09652  
## CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
##               -0.16693                -0.98402                      NA  
##            MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
##                0.16979                 0.19642                 0.25894  
##           MyalgiaYNYes            RunnyNoseYes               AbPainYes  
##                     NA                 0.15012                 1.02638  
##           ChestPainYes             DiarrheaYes                EyePnYes  
##                0.16010                 0.88960                -0.30056  
##            InsomniaYes             ItchyEyeYes                EarPnYes  
##                0.14692                 0.11801                -0.17251  
##             HearingYes          PharyngitisYes           BreathlessYes  
##                0.32065                 0.26024                 0.51853  
##             ToothPnYes               VisionYes                VomitYes  
##                0.47012                 0.13946                 2.51947  
##              WheezeYes                BodyTemp  
##               -0.30524                 0.01306  
## 
## Degrees of Freedom: 546 Total (i.e. Null);  512 Residual
## Null Deviance:       712.6 
## Residual Deviance: 561.5     AIC: 631.5

Model 1 evaluation

# Predict the unseen test data 
predict(SymptAct_fit, test_data)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## # A tibble: 183 × 1
##    .pred_class
##    <fct>      
##  1 Yes        
##  2 No         
##  3 No         
##  4 No         
##  5 No         
##  6 No         
##  7 No         
##  8 No         
##  9 No         
## 10 No         
## # … with 173 more rows

Outcome is binary variable (factor), hence the outcome class: Yes versus No was returned. In order to return the predicted probabilities for each nausea status instead. We will specify type = “prob” in the predict() or use augment() with the model plus test data to save them together.

# Returns predicted probabilities for outcome status on training and test data
SymptAct_aug_train <- 
    augment(SymptAct_fit, train_data)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

SymptAct_aug_train

## # A tibble: 547 × 35
##    SwollenLymphNodes ChestCongestion ChillsSweats NasalCongestion CoughYN Sneeze
##    <fct>             <fct>           <fct>        <fct>           <fct>   <fct> 
##  1 Yes               No              Yes          No              Yes     No    
##  2 No                Yes             Yes          No              Yes     No    
##  3 No                Yes             Yes          Yes             Yes     Yes   
##  4 Yes               Yes             Yes          Yes             Yes     Yes   
##  5 Yes               Yes             Yes          No              Yes     No    
##  6 Yes               No              No           Yes             Yes     Yes   
##  7 Yes               No              No           Yes             Yes     No    
##  8 No                No              No           No              No      Yes   
##  9 No                Yes             No           Yes             Yes     Yes   
## 10 Yes               Yes             Yes          Yes             Yes     Yes   
## # … with 537 more rows, and 29 more variables: Fatigue <fct>,
## #   SubjectiveFever <fct>, Headache <fct>, Weakness <fct>, WeaknessYN <fct>,
## #   CoughIntensity <fct>, CoughYN2 <fct>, Myalgia <fct>, MyalgiaYN <fct>,
## #   RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
## #   EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
## #   Hearing <fct>, Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>,
## #   Vision <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>, …

SymptAct_aug_test <- 
    augment(SymptAct_fit, test_data)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

SymptAct_aug_test

## # A tibble: 183 × 35
##    SwollenLymphNodes ChestCongestion ChillsSweats NasalCongestion CoughYN Sneeze
##    <fct>             <fct>           <fct>        <fct>           <fct>   <fct> 
##  1 Yes               Yes             Yes          Yes             No      Yes   
##  2 Yes               No              Yes          No              No      No    
##  3 No                No              Yes          No              Yes     Yes   
##  4 No                Yes             Yes          Yes             Yes     Yes   
##  5 Yes               Yes             Yes          Yes             Yes     No    
##  6 No                Yes             Yes          Yes             Yes     Yes   
##  7 No                No              Yes          No              Yes     No    
##  8 Yes               Yes             Yes          Yes             Yes     Yes   
##  9 No                Yes             No           No              Yes     Yes   
## 10 Yes               Yes             Yes          Yes             Yes     Yes   
## # … with 173 more rows, and 29 more variables: Fatigue <fct>,
## #   SubjectiveFever <fct>, Headache <fct>, Weakness <fct>, WeaknessYN <fct>,
## #   CoughIntensity <fct>, CoughYN2 <fct>, Myalgia <fct>, MyalgiaYN <fct>,
## #   RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
## #   EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
## #   Hearing <fct>, Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>,
## #   Vision <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>, …

# The data look like: 
SymptAct_aug_train %>%
  select(Nausea, .pred_class, .pred_Yes)

## # A tibble: 547 × 3
##    Nausea .pred_class .pred_Yes
##    <fct>  <fct>           <dbl>
##  1 No     No             0.166 
##  2 No     No             0.403 
##  3 Yes    No             0.478 
##  4 No     No             0.406 
##  5 No     No             0.381 
##  6 Yes    No             0.337 
##  7 No     No             0.0603
##  8 No     No             0.135 
##  9 No     No             0.277 
## 10 Yes    Yes            0.619 
## # … with 537 more rows

SymptAct_aug_test %>%
  select(Nausea, .pred_class, .pred_Yes)

## # A tibble: 183 × 3
##    Nausea .pred_class .pred_Yes
##    <fct>  <fct>           <dbl>
##  1 Yes    Yes             0.897
##  2 Yes    No              0.181
##  3 Yes    No              0.481
##  4 No     No              0.443
##  5 Yes    No              0.332
##  6 Yes    No              0.445
##  7 No     No              0.208
##  8 Yes    No              0.406
##  9 No     No              0.152
## 10 No     No              0.292
## # … with 173 more rows

ROC curve

# In yardstick, the default is to use the first level, and the first level of Nausea is "No", so event_level argument was used to specify the second level as the event
SymptAct_aug_train %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

SymptAct_aug_test %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

# estimate the auc
SymptAct_aug_train %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.781

SymptAct_aug_test %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.740

For training data, AUC = 0.78, demonstrates a good discriminative ability of the model (model performs better in training data as expected). For test data, AUC = 0.74, demonstrates a good discriminative ability of the model.

Alternative model (only fit runny nose as the main predictor to the categorical outcome)

# Set a new recipe with runny nose
new_SymptAct_rec <- 
  recipe(Nausea ~ RunnyNose, data = train_data) 

# New workflow with runny nose
new_SymptAct_wflow <- 
  workflow() %>% 
  add_model(glm_mod) %>% 
  add_recipe(new_SymptAct_rec)

new_SymptAct_wflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

# Fit the model with runny nose
new_SymptAct_fit <- 
  new_SymptAct_wflow %>% 
  fit(data = train_data)
new_SymptAct_fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
##  (Intercept)  RunnyNoseYes  
##      -0.6639        0.1013  
## 
## Degrees of Freedom: 546 Total (i.e. Null);  545 Residual
## Null Deviance:       712.6 
## Residual Deviance: 712.3     AIC: 716.3

Evaluate ROC on train data

new_SymptAct_aug_train <- 
    augment(new_SymptAct_fit, train_data)


# ROC curve
new_SymptAct_aug_train %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

# Estimate auc
new_SymptAct_aug_train %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.510

Evaluate ROC curves on test data

new_SymptAct_aug_test <- 
    augment(new_SymptAct_fit, test_data)


# ROC curve
new_SymptAct_aug_test %>% 
  roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>% 
  autoplot()

# Estimate auc
new_SymptAct_aug_test %>% 
  roc_auc(truth = Nausea, .pred_Yes, event_level = "second")

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.488

The AUC for the single variable model on both training and test data performed poorly (AUCs are close to 0.5).

Part Two - Savannah’s Contribution

Since the data is already split, I’ll start by creating a new recipe and workflow for the continuous outcome (body temperature).

Body Temperature and all Predictor Variables

#Initiate a new recipe 
Temp_rec <- recipe(BodyTemp ~ ., data = train_data)

#Set engine 
lm_mod <- linear_reg() %>% 
  set_engine('lm')

#Pair a model and recipe together using model workflow
Temp_Workflow <- workflow() %>% 
  add_model(lm_mod) %>% 
  add_recipe(Temp_rec)

Temp_Workflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

#Prepare recipe and train model 
Temp_Fit <- Temp_Workflow %>% 
  fit(data = train_data)

Temp_Fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##            (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
##              97.539863               -0.288629                0.135143  
##        ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
##               0.198567               -0.198702                0.327505  
##              SneezeYes              FatigueYes      SubjectiveFeverYes  
##              -0.335144                0.266783                0.458268  
##            HeadacheYes            WeaknessMild        WeaknessModerate  
##              -0.032819                0.241701                0.198816  
##         WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
##               0.575665                      NA                0.163017  
## CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
##               0.100006                0.192905                      NA  
##            MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
##               0.231368                0.070044               -0.232207  
##           MyalgiaYNYes            RunnyNoseYes               AbPainYes  
##                     NA               -0.039814                0.009099  
##           ChestPainYes             DiarrheaYes                EyePnYes  
##               0.058577               -0.124276                0.115814  
##            InsomniaYes             ItchyEyeYes               NauseaYes  
##              -0.116242               -0.015127                0.012763  
##               EarPnYes              HearingYes          PharyngitisYes  
##               0.121943                0.297368                0.446884  
##          BreathlessYes              ToothPnYes               VisionYes  
##               0.033949                0.068289               -0.267204  
##               VomitYes               WheezeYes  
##               0.078283               -0.132855

# Extract model objects from the workflow and get a tidy tibble of model coefficients
Temp_Fit %>% 
  extract_fit_parsnip() %>% 
  tidy() %>% 
  knitr::kable()

term	estimate	std.error	statistic	p.value
(Intercept)	97.5398634	0.3824408	255.0456285	0.0000000
SwollenLymphNodesYes	-0.2886291	0.1080718	-2.6707153	0.0078098
ChestCongestionYes	0.1351433	0.1146478	1.1787686	0.2390378
ChillsSweatsYes	0.1985666	0.1476811	1.3445634	0.1793616
NasalCongestionYes	-0.1987023	0.1300638	-1.5277294	0.1271973
CoughYNYes	0.3275052	0.2727054	1.2009485	0.2303265
SneezeYes	-0.3351436	0.1151488	-2.9105265	0.0037653
FatigueYes	0.2667828	0.1968260	1.3554244	0.1758800
SubjectiveFeverYes	0.4582678	0.1214332	3.7738255	0.0001796
HeadacheYes	-0.0328194	0.1510628	-0.2172566	0.8280950
WeaknessMild	0.2417006	0.2336466	1.0344708	0.3014045
WeaknessModerate	0.1988160	0.2486508	0.7995792	0.4243254
WeaknessSevere	0.5756650	0.2848415	2.0210009	0.0437996
WeaknessYNYes	NA	NA	NA	NA
CoughIntensityMild	0.1630171	0.3272016	0.4982161	0.6185457
CoughIntensityModerate	0.1000056	0.3526852	0.2835547	0.7768663
CoughIntensitySevere	0.1929047	0.3660682	0.5269639	0.5984469
CoughYN2Yes	NA	NA	NA	NA
MyalgiaMild	0.2313683	0.1913966	1.2088422	0.2272814
MyalgiaModerate	0.0700436	0.2045168	0.3424836	0.7321276
MyalgiaSevere	-0.2322075	0.2476795	-0.9375322	0.3489269
MyalgiaYNYes	NA	NA	NA	NA
RunnyNoseYes	-0.0398141	0.1264285	-0.3149136	0.7529556
AbPainYes	0.0090986	0.1638164	0.0555417	0.9557286
ChestPainYes	0.0585767	0.1249010	0.4689850	0.6392800
DiarrheaYes	-0.1242756	0.1576333	-0.7883843	0.4308368
EyePnYes	0.1158143	0.1502894	0.7706084	0.4412945
InsomniaYes	-0.1162423	0.1065901	-1.0905547	0.2759818
ItchyEyeYes	-0.0151273	0.1282112	-0.1179871	0.9061241
NauseaYes	0.0127633	0.1191778	0.1070948	0.9147557
EarPnYes	0.1219426	0.1324502	0.9206679	0.3576574
HearingYes	0.2973685	0.2594807	1.1460141	0.2523247
PharyngitisYes	0.4468845	0.1442171	3.0986926	0.0020506
BreathlessYes	0.0339486	0.1167120	0.2908746	0.7712651
ToothPnYes	0.0682892	0.1365546	0.5000875	0.6172283
VisionYes	-0.2672039	0.3203912	-0.8339928	0.4046740
VomitYes	0.0782832	0.1780406	0.4396932	0.6603448
WheezeYes	-0.1328550	0.1251465	-1.0615962	0.2889195

#Model Evaluation:
#Predict unseen test data 
predict(Temp_Fit, test_data)

## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading

## # A tibble: 183 × 1
##    .pred
##    <dbl>
##  1  98.7
##  2  99.0
##  3  99.1
##  4  98.6
##  5  98.7
##  6  98.7
##  7  99.1
##  8  98.9
##  9  99.2
## 10  99.1
## # … with 173 more rows

# Returns predicted probabilities for outcome status on training and test data
#Test
Temp_Aug_Test <- augment(Temp_Fit, test_data)

## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading

#View the data:
Temp_Aug_Test %>% 
  select(BodyTemp, .pred)

## # A tibble: 183 × 2
##    BodyTemp .pred
##       <dbl> <dbl>
##  1    101.   98.7
##  2    100.   99.0
##  3     98.4  99.1
##  4     98.4  98.6
##  5     98.5  98.7
##  6     99.2  98.7
##  7     99.3  99.1
##  8     98.2  98.9
##  9     97.8  99.2
## 10    100.   99.1
## # … with 173 more rows

#Train
Temp_Aug_Train <- augment(Temp_Fit, train_data)

## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading

#View the data:
Temp_Aug_Train %>% 
  select(BodyTemp, .pred)

## # A tibble: 547 × 2
##    BodyTemp .pred
##       <dbl> <dbl>
##  1     99.4  99.2
##  2     99.7  99.2
##  3     97.8  98.5
##  4     98.5  98.7
##  5     98.4  99.6
##  6     98.1  98.1
##  7     98.9  97.9
##  8     98    98.3
##  9    101.   98.5
## 10     98.5  98.4
## # … with 537 more rows

#Using RMSE to compare model fits  
#Test 
Temp_Aug_Test %>% 
  rmse(truth = BodyTemp, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.15

#Train
Temp_Aug_Train %>% 
  rmse(truth = BodyTemp, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.12

When comparing the two models, the model seems to fit the train data better. This can be seen in the lower RMSE value of the train data (1.12) than the test data (1.15).

Body Tempurature and Runny Nose

#Initiate a new recipe 
Temp_Run_rec <- recipe(BodyTemp ~ RunnyNose, data = train_data)

#Pair a model and recipe together using model workflow
Temp_Run_Workflow <- workflow() %>% 
  add_model(lm_mod) %>% 
  add_recipe(Temp_Run_rec)

Temp_Run_Workflow

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

#Prepare recipe and train model 
Temp_Run_Fit <- Temp_Run_Workflow %>% 
  fit(data = train_data)

Temp_Run_Fit

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##  (Intercept)  RunnyNoseYes  
##      99.0987       -0.2286

# Extract model objects from the workflow and get a tidy tibble of model coefficients
Temp_Run_Fit %>% 
  extract_fit_parsnip() %>% 
  tidy() %>% 
  knitr::kable()

term	estimate	std.error	statistic	p.value
(Intercept)	99.098693	0.0977719	1013.570032	0.0000000
RunnyNoseYes	-0.228642	0.1152019	-1.984707	0.0476781

#Model Evaluation:
#Predict unseen test data 
predict(Temp_Run_Fit, test_data)

## # A tibble: 183 × 1
##    .pred
##    <dbl>
##  1  98.9
##  2  99.1
##  3  99.1
##  4  98.9
##  5  98.9
##  6  98.9
##  7  99.1
##  8  98.9
##  9  99.1
## 10  98.9
## # … with 173 more rows

# Returns predicted probabilities for outcome status on training and test data
#Test
Temp_Run_Aug_Test <- augment(Temp_Run_Fit, test_data)
#View the data:
Temp_Run_Aug_Test %>% 
  select(BodyTemp, .pred)

## # A tibble: 183 × 2
##    BodyTemp .pred
##       <dbl> <dbl>
##  1    101.   98.9
##  2    100.   99.1
##  3     98.4  99.1
##  4     98.4  98.9
##  5     98.5  98.9
##  6     99.2  98.9
##  7     99.3  99.1
##  8     98.2  98.9
##  9     97.8  99.1
## 10    100.   98.9
## # … with 173 more rows

#Train
Temp_Run_Aug_Train <- augment(Temp_Run_Fit, train_data)
#View the data:
Temp_Run_Aug_Train %>% 
  select(BodyTemp, .pred)

## # A tibble: 547 × 2
##    BodyTemp .pred
##       <dbl> <dbl>
##  1     99.4  98.9
##  2     99.7  99.1
##  3     97.8  98.9
##  4     98.5  98.9
##  5     98.4  99.1
##  6     98.1  98.9
##  7     98.9  98.9
##  8     98    98.9
##  9    101.   98.9
## 10     98.5  98.9
## # … with 537 more rows

#Using RMSE to compare model fits  
#Test 
Temp_Run_Aug_Test %>% 
  rmse(truth = BodyTemp, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.13

#Train
Temp_Run_Aug_Train %>% 
  rmse(truth = BodyTemp, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.21

In the data with just the Runny Nose predictor, the model fit the test data (RMSE = 1.13) better than the train data (RMSE = 1.21).

Overall, the best model fit was the model including all predictor variables when fit to the training data, as this had the lowest RMSE error (1.12).

Module 9 exercise: Model Fitting and Evaluation

Tzu-Chun Chu

10/20/2021