library(here) # set path to load data
## here() starts at /Users/savannahhammerton/Local Desktop/GitHub/MADA/Tzu-Chun-MADA-analysis3
library(tidymodels) # for the recipes package and other functions in tidymodels
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.9 ✓ recipes 0.1.17
## ✓ dials 0.0.10 ✓ rsample 0.1.0
## ✓ dplyr 1.0.7 ✓ tibble 3.1.5
## ✓ ggplot2 3.3.5 ✓ tidyr 1.1.4
## ✓ infer 1.0.0 ✓ tune 0.1.6
## ✓ modeldata 0.1.1 ✓ workflows 0.2.4
## ✓ parsnip 0.1.7 ✓ workflowsets 0.1.0
## ✓ purrr 0.3.4 ✓ yardstick 0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse) # data wrangling
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x readr::spec() masks yardstick::spec()
library(parsnip) # for model fitting
library(rsample) # for splitting data
library(yardstick) # for creating roc curve
clean.data <- readRDS(here::here("data","processed_data","processeddata.rds"))
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible when random numbers are used
set.seed(1234)
# Put 3/4 of the data into the training set
data_split <- initial_split(clean.data, prop = 3/4)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Initiate new recipe
SymptAct_rec <-
recipe(Nausea ~ ., data = train_data)
# Set a engine
glm_mod <-
logistic_reg() %>%
set_engine("glm")
# Pair a model and recipe together using model workflow
SymptAct_wflow <-
workflow() %>%
add_model(glm_mod) %>%
add_recipe(SymptAct_rec)
SymptAct_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Prepare recipe and train the model
SymptAct_fit <-
SymptAct_wflow %>%
fit(data = train_data)
SymptAct_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) SwollenLymphNodesYes ChestCongestionYes
## -4.00932 -0.40039 0.18669
## ChillsSweatsYes NasalCongestionYes CoughYNYes
## 0.34332 0.38350 -0.25584
## SneezeYes FatigueYes SubjectiveFeverYes
## 0.03839 0.14822 0.18493
## HeadacheYes WeaknessMild WeaknessModerate
## 0.57108 -0.46993 0.06382
## WeaknessSevere WeaknessYNYes CoughIntensityMild
## 0.50553 NA -0.09652
## CoughIntensityModerate CoughIntensitySevere CoughYN2Yes
## -0.16693 -0.98402 NA
## MyalgiaMild MyalgiaModerate MyalgiaSevere
## 0.16979 0.19642 0.25894
## MyalgiaYNYes RunnyNoseYes AbPainYes
## NA 0.15012 1.02638
## ChestPainYes DiarrheaYes EyePnYes
## 0.16010 0.88960 -0.30056
## InsomniaYes ItchyEyeYes EarPnYes
## 0.14692 0.11801 -0.17251
## HearingYes PharyngitisYes BreathlessYes
## 0.32065 0.26024 0.51853
## ToothPnYes VisionYes VomitYes
## 0.47012 0.13946 2.51947
## WheezeYes BodyTemp
## -0.30524 0.01306
##
## Degrees of Freedom: 546 Total (i.e. Null); 512 Residual
## Null Deviance: 712.6
## Residual Deviance: 561.5 AIC: 631.5
# Extract model objects from the workflow and get a tidy tibble of model coefficients
SymptAct_fit %>%
extract_fit_parsnip() %>%
tidy()
## # A tibble: 38 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.01 8.97 -0.447 0.655
## 2 SwollenLymphNodesYes -0.400 0.229 -1.75 0.0802
## 3 ChestCongestionYes 0.187 0.247 0.755 0.450
## 4 ChillsSweatsYes 0.343 0.328 1.05 0.296
## 5 NasalCongestionYes 0.384 0.290 1.32 0.185
## 6 CoughYNYes -0.256 0.590 -0.434 0.664
## 7 SneezeYes 0.0384 0.246 0.156 0.876
## 8 FatigueYes 0.148 0.448 0.331 0.741
## 9 SubjectiveFeverYes 0.185 0.257 0.719 0.472
## 10 HeadacheYes 0.571 0.351 1.63 0.104
## # … with 28 more rows
SymptAct_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) SwollenLymphNodesYes ChestCongestionYes
## -4.00932 -0.40039 0.18669
## ChillsSweatsYes NasalCongestionYes CoughYNYes
## 0.34332 0.38350 -0.25584
## SneezeYes FatigueYes SubjectiveFeverYes
## 0.03839 0.14822 0.18493
## HeadacheYes WeaknessMild WeaknessModerate
## 0.57108 -0.46993 0.06382
## WeaknessSevere WeaknessYNYes CoughIntensityMild
## 0.50553 NA -0.09652
## CoughIntensityModerate CoughIntensitySevere CoughYN2Yes
## -0.16693 -0.98402 NA
## MyalgiaMild MyalgiaModerate MyalgiaSevere
## 0.16979 0.19642 0.25894
## MyalgiaYNYes RunnyNoseYes AbPainYes
## NA 0.15012 1.02638
## ChestPainYes DiarrheaYes EyePnYes
## 0.16010 0.88960 -0.30056
## InsomniaYes ItchyEyeYes EarPnYes
## 0.14692 0.11801 -0.17251
## HearingYes PharyngitisYes BreathlessYes
## 0.32065 0.26024 0.51853
## ToothPnYes VisionYes VomitYes
## 0.47012 0.13946 2.51947
## WheezeYes BodyTemp
## -0.30524 0.01306
##
## Degrees of Freedom: 546 Total (i.e. Null); 512 Residual
## Null Deviance: 712.6
## Residual Deviance: 561.5 AIC: 631.5
# Predict the unseen test data
predict(SymptAct_fit, test_data)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # A tibble: 183 × 1
## .pred_class
## <fct>
## 1 Yes
## 2 No
## 3 No
## 4 No
## 5 No
## 6 No
## 7 No
## 8 No
## 9 No
## 10 No
## # … with 173 more rows
Outcome is binary variable (factor), hence the outcome class: Yes versus No was returned. In order to return the predicted probabilities for each nausea status instead. We will specify type = “prob” in the predict() or use augment() with the model plus test data to save them together.
# Returns predicted probabilities for outcome status on training and test data
SymptAct_aug_train <-
augment(SymptAct_fit, train_data)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
SymptAct_aug_train
## # A tibble: 547 × 35
## SwollenLymphNodes ChestCongestion ChillsSweats NasalCongestion CoughYN Sneeze
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 Yes No Yes No Yes No
## 2 No Yes Yes No Yes No
## 3 No Yes Yes Yes Yes Yes
## 4 Yes Yes Yes Yes Yes Yes
## 5 Yes Yes Yes No Yes No
## 6 Yes No No Yes Yes Yes
## 7 Yes No No Yes Yes No
## 8 No No No No No Yes
## 9 No Yes No Yes Yes Yes
## 10 Yes Yes Yes Yes Yes Yes
## # … with 537 more rows, and 29 more variables: Fatigue <fct>,
## # SubjectiveFever <fct>, Headache <fct>, Weakness <fct>, WeaknessYN <fct>,
## # CoughIntensity <fct>, CoughYN2 <fct>, Myalgia <fct>, MyalgiaYN <fct>,
## # RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
## # EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
## # Hearing <fct>, Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>,
## # Vision <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>, …
SymptAct_aug_test <-
augment(SymptAct_fit, test_data)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
SymptAct_aug_test
## # A tibble: 183 × 35
## SwollenLymphNodes ChestCongestion ChillsSweats NasalCongestion CoughYN Sneeze
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 Yes Yes Yes Yes No Yes
## 2 Yes No Yes No No No
## 3 No No Yes No Yes Yes
## 4 No Yes Yes Yes Yes Yes
## 5 Yes Yes Yes Yes Yes No
## 6 No Yes Yes Yes Yes Yes
## 7 No No Yes No Yes No
## 8 Yes Yes Yes Yes Yes Yes
## 9 No Yes No No Yes Yes
## 10 Yes Yes Yes Yes Yes Yes
## # … with 173 more rows, and 29 more variables: Fatigue <fct>,
## # SubjectiveFever <fct>, Headache <fct>, Weakness <fct>, WeaknessYN <fct>,
## # CoughIntensity <fct>, CoughYN2 <fct>, Myalgia <fct>, MyalgiaYN <fct>,
## # RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
## # EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
## # Hearing <fct>, Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>,
## # Vision <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>, …
# The data look like:
SymptAct_aug_train %>%
select(Nausea, .pred_class, .pred_Yes)
## # A tibble: 547 × 3
## Nausea .pred_class .pred_Yes
## <fct> <fct> <dbl>
## 1 No No 0.166
## 2 No No 0.403
## 3 Yes No 0.478
## 4 No No 0.406
## 5 No No 0.381
## 6 Yes No 0.337
## 7 No No 0.0603
## 8 No No 0.135
## 9 No No 0.277
## 10 Yes Yes 0.619
## # … with 537 more rows
SymptAct_aug_test %>%
select(Nausea, .pred_class, .pred_Yes)
## # A tibble: 183 × 3
## Nausea .pred_class .pred_Yes
## <fct> <fct> <dbl>
## 1 Yes Yes 0.897
## 2 Yes No 0.181
## 3 Yes No 0.481
## 4 No No 0.443
## 5 Yes No 0.332
## 6 Yes No 0.445
## 7 No No 0.208
## 8 Yes No 0.406
## 9 No No 0.152
## 10 No No 0.292
## # … with 173 more rows
# In yardstick, the default is to use the first level, and the first level of Nausea is "No", so event_level argument was used to specify the second level as the event
SymptAct_aug_train %>%
roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>%
autoplot()
SymptAct_aug_test %>%
roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>%
autoplot()
# estimate the auc
SymptAct_aug_train %>%
roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.781
SymptAct_aug_test %>%
roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.740
For training data, AUC = 0.78, demonstrates a good discriminative ability of the model (model performs better in training data as expected). For test data, AUC = 0.74, demonstrates a good discriminative ability of the model.
# Set a new recipe with runny nose
new_SymptAct_rec <-
recipe(Nausea ~ RunnyNose, data = train_data)
# New workflow with runny nose
new_SymptAct_wflow <-
workflow() %>%
add_model(glm_mod) %>%
add_recipe(new_SymptAct_rec)
new_SymptAct_wflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Fit the model with runny nose
new_SymptAct_fit <-
new_SymptAct_wflow %>%
fit(data = train_data)
new_SymptAct_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) RunnyNoseYes
## -0.6639 0.1013
##
## Degrees of Freedom: 546 Total (i.e. Null); 545 Residual
## Null Deviance: 712.6
## Residual Deviance: 712.3 AIC: 716.3
new_SymptAct_aug_train <-
augment(new_SymptAct_fit, train_data)
# ROC curve
new_SymptAct_aug_train %>%
roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>%
autoplot()
# Estimate auc
new_SymptAct_aug_train %>%
roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.510
new_SymptAct_aug_test <-
augment(new_SymptAct_fit, test_data)
# ROC curve
new_SymptAct_aug_test %>%
roc_curve(truth = Nausea, .pred_Yes, event_level = "second") %>%
autoplot()
# Estimate auc
new_SymptAct_aug_test %>%
roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.488
The AUC for the single variable model on both training and test data performed poorly (AUCs are close to 0.5).
Since the data is already split, I’ll start by creating a new recipe and workflow for the continuous outcome (body temperature).
#Initiate a new recipe
Temp_rec <- recipe(BodyTemp ~ ., data = train_data)
#Set engine
lm_mod <- linear_reg() %>%
set_engine('lm')
#Pair a model and recipe together using model workflow
Temp_Workflow <- workflow() %>%
add_model(lm_mod) %>%
add_recipe(Temp_rec)
Temp_Workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
#Prepare recipe and train model
Temp_Fit <- Temp_Workflow %>%
fit(data = train_data)
Temp_Fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) SwollenLymphNodesYes ChestCongestionYes
## 97.539863 -0.288629 0.135143
## ChillsSweatsYes NasalCongestionYes CoughYNYes
## 0.198567 -0.198702 0.327505
## SneezeYes FatigueYes SubjectiveFeverYes
## -0.335144 0.266783 0.458268
## HeadacheYes WeaknessMild WeaknessModerate
## -0.032819 0.241701 0.198816
## WeaknessSevere WeaknessYNYes CoughIntensityMild
## 0.575665 NA 0.163017
## CoughIntensityModerate CoughIntensitySevere CoughYN2Yes
## 0.100006 0.192905 NA
## MyalgiaMild MyalgiaModerate MyalgiaSevere
## 0.231368 0.070044 -0.232207
## MyalgiaYNYes RunnyNoseYes AbPainYes
## NA -0.039814 0.009099
## ChestPainYes DiarrheaYes EyePnYes
## 0.058577 -0.124276 0.115814
## InsomniaYes ItchyEyeYes NauseaYes
## -0.116242 -0.015127 0.012763
## EarPnYes HearingYes PharyngitisYes
## 0.121943 0.297368 0.446884
## BreathlessYes ToothPnYes VisionYes
## 0.033949 0.068289 -0.267204
## VomitYes WheezeYes
## 0.078283 -0.132855
# Extract model objects from the workflow and get a tidy tibble of model coefficients
Temp_Fit %>%
extract_fit_parsnip() %>%
tidy() %>%
knitr::kable()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 97.5398634 | 0.3824408 | 255.0456285 | 0.0000000 |
SwollenLymphNodesYes | -0.2886291 | 0.1080718 | -2.6707153 | 0.0078098 |
ChestCongestionYes | 0.1351433 | 0.1146478 | 1.1787686 | 0.2390378 |
ChillsSweatsYes | 0.1985666 | 0.1476811 | 1.3445634 | 0.1793616 |
NasalCongestionYes | -0.1987023 | 0.1300638 | -1.5277294 | 0.1271973 |
CoughYNYes | 0.3275052 | 0.2727054 | 1.2009485 | 0.2303265 |
SneezeYes | -0.3351436 | 0.1151488 | -2.9105265 | 0.0037653 |
FatigueYes | 0.2667828 | 0.1968260 | 1.3554244 | 0.1758800 |
SubjectiveFeverYes | 0.4582678 | 0.1214332 | 3.7738255 | 0.0001796 |
HeadacheYes | -0.0328194 | 0.1510628 | -0.2172566 | 0.8280950 |
WeaknessMild | 0.2417006 | 0.2336466 | 1.0344708 | 0.3014045 |
WeaknessModerate | 0.1988160 | 0.2486508 | 0.7995792 | 0.4243254 |
WeaknessSevere | 0.5756650 | 0.2848415 | 2.0210009 | 0.0437996 |
WeaknessYNYes | NA | NA | NA | NA |
CoughIntensityMild | 0.1630171 | 0.3272016 | 0.4982161 | 0.6185457 |
CoughIntensityModerate | 0.1000056 | 0.3526852 | 0.2835547 | 0.7768663 |
CoughIntensitySevere | 0.1929047 | 0.3660682 | 0.5269639 | 0.5984469 |
CoughYN2Yes | NA | NA | NA | NA |
MyalgiaMild | 0.2313683 | 0.1913966 | 1.2088422 | 0.2272814 |
MyalgiaModerate | 0.0700436 | 0.2045168 | 0.3424836 | 0.7321276 |
MyalgiaSevere | -0.2322075 | 0.2476795 | -0.9375322 | 0.3489269 |
MyalgiaYNYes | NA | NA | NA | NA |
RunnyNoseYes | -0.0398141 | 0.1264285 | -0.3149136 | 0.7529556 |
AbPainYes | 0.0090986 | 0.1638164 | 0.0555417 | 0.9557286 |
ChestPainYes | 0.0585767 | 0.1249010 | 0.4689850 | 0.6392800 |
DiarrheaYes | -0.1242756 | 0.1576333 | -0.7883843 | 0.4308368 |
EyePnYes | 0.1158143 | 0.1502894 | 0.7706084 | 0.4412945 |
InsomniaYes | -0.1162423 | 0.1065901 | -1.0905547 | 0.2759818 |
ItchyEyeYes | -0.0151273 | 0.1282112 | -0.1179871 | 0.9061241 |
NauseaYes | 0.0127633 | 0.1191778 | 0.1070948 | 0.9147557 |
EarPnYes | 0.1219426 | 0.1324502 | 0.9206679 | 0.3576574 |
HearingYes | 0.2973685 | 0.2594807 | 1.1460141 | 0.2523247 |
PharyngitisYes | 0.4468845 | 0.1442171 | 3.0986926 | 0.0020506 |
BreathlessYes | 0.0339486 | 0.1167120 | 0.2908746 | 0.7712651 |
ToothPnYes | 0.0682892 | 0.1365546 | 0.5000875 | 0.6172283 |
VisionYes | -0.2672039 | 0.3203912 | -0.8339928 | 0.4046740 |
VomitYes | 0.0782832 | 0.1780406 | 0.4396932 | 0.6603448 |
WheezeYes | -0.1328550 | 0.1251465 | -1.0615962 | 0.2889195 |
#Model Evaluation:
#Predict unseen test data
predict(Temp_Fit, test_data)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
## # A tibble: 183 × 1
## .pred
## <dbl>
## 1 98.7
## 2 99.0
## 3 99.1
## 4 98.6
## 5 98.7
## 6 98.7
## 7 99.1
## 8 98.9
## 9 99.2
## 10 99.1
## # … with 173 more rows
# Returns predicted probabilities for outcome status on training and test data
#Test
Temp_Aug_Test <- augment(Temp_Fit, test_data)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
#View the data:
Temp_Aug_Test %>%
select(BodyTemp, .pred)
## # A tibble: 183 × 2
## BodyTemp .pred
## <dbl> <dbl>
## 1 101. 98.7
## 2 100. 99.0
## 3 98.4 99.1
## 4 98.4 98.6
## 5 98.5 98.7
## 6 99.2 98.7
## 7 99.3 99.1
## 8 98.2 98.9
## 9 97.8 99.2
## 10 100. 99.1
## # … with 173 more rows
#Train
Temp_Aug_Train <- augment(Temp_Fit, train_data)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
#View the data:
Temp_Aug_Train %>%
select(BodyTemp, .pred)
## # A tibble: 547 × 2
## BodyTemp .pred
## <dbl> <dbl>
## 1 99.4 99.2
## 2 99.7 99.2
## 3 97.8 98.5
## 4 98.5 98.7
## 5 98.4 99.6
## 6 98.1 98.1
## 7 98.9 97.9
## 8 98 98.3
## 9 101. 98.5
## 10 98.5 98.4
## # … with 537 more rows
#Using RMSE to compare model fits
#Test
Temp_Aug_Test %>%
rmse(truth = BodyTemp, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.15
#Train
Temp_Aug_Train %>%
rmse(truth = BodyTemp, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.12
When comparing the two models, the model seems to fit the train data better. This can be seen in the lower RMSE value of the train data (1.12) than the test data (1.15).
#Initiate a new recipe
Temp_Run_rec <- recipe(BodyTemp ~ RunnyNose, data = train_data)
#Pair a model and recipe together using model workflow
Temp_Run_Workflow <- workflow() %>%
add_model(lm_mod) %>%
add_recipe(Temp_Run_rec)
Temp_Run_Workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
#Prepare recipe and train model
Temp_Run_Fit <- Temp_Run_Workflow %>%
fit(data = train_data)
Temp_Run_Fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) RunnyNoseYes
## 99.0987 -0.2286
# Extract model objects from the workflow and get a tidy tibble of model coefficients
Temp_Run_Fit %>%
extract_fit_parsnip() %>%
tidy() %>%
knitr::kable()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 99.098693 | 0.0977719 | 1013.570032 | 0.0000000 |
RunnyNoseYes | -0.228642 | 0.1152019 | -1.984707 | 0.0476781 |
#Model Evaluation:
#Predict unseen test data
predict(Temp_Run_Fit, test_data)
## # A tibble: 183 × 1
## .pred
## <dbl>
## 1 98.9
## 2 99.1
## 3 99.1
## 4 98.9
## 5 98.9
## 6 98.9
## 7 99.1
## 8 98.9
## 9 99.1
## 10 98.9
## # … with 173 more rows
# Returns predicted probabilities for outcome status on training and test data
#Test
Temp_Run_Aug_Test <- augment(Temp_Run_Fit, test_data)
#View the data:
Temp_Run_Aug_Test %>%
select(BodyTemp, .pred)
## # A tibble: 183 × 2
## BodyTemp .pred
## <dbl> <dbl>
## 1 101. 98.9
## 2 100. 99.1
## 3 98.4 99.1
## 4 98.4 98.9
## 5 98.5 98.9
## 6 99.2 98.9
## 7 99.3 99.1
## 8 98.2 98.9
## 9 97.8 99.1
## 10 100. 98.9
## # … with 173 more rows
#Train
Temp_Run_Aug_Train <- augment(Temp_Run_Fit, train_data)
#View the data:
Temp_Run_Aug_Train %>%
select(BodyTemp, .pred)
## # A tibble: 547 × 2
## BodyTemp .pred
## <dbl> <dbl>
## 1 99.4 98.9
## 2 99.7 99.1
## 3 97.8 98.9
## 4 98.5 98.9
## 5 98.4 99.1
## 6 98.1 98.9
## 7 98.9 98.9
## 8 98 98.9
## 9 101. 98.9
## 10 98.5 98.9
## # … with 537 more rows
#Using RMSE to compare model fits
#Test
Temp_Run_Aug_Test %>%
rmse(truth = BodyTemp, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.13
#Train
Temp_Run_Aug_Train %>%
rmse(truth = BodyTemp, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.21
In the data with just the Runny Nose predictor, the model fit the test data (RMSE = 1.13) better than the train data (RMSE = 1.21).
Overall, the best model fit was the model including all predictor variables when fit to the training data, as this had the lowest RMSE error (1.12).