load required packages

load data

# read the cleaned data set; the path is resolved relative to the project root
processdata <- readRDS(
  here::here("data", "processed_data", "processeddata.rds")
)

# print the compact structure of the loaded data frame
processdata %>% str()
## 'data.frame':    730 obs. of  32 variables:
##  $ SwollenLymphNodes: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 2 1 ...
##  $ ChestCongestion  : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 2 2 ...
##  $ ChillsSweats     : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 2 2 1 ...
##  $ NasalCongestion  : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 2 2 ...
##  $ CoughYN          : Factor w/ 2 levels "No","Yes": 2 2 1 2 1 2 2 2 2 2 ...
##  $ Sneeze           : Factor w/ 2 levels "No","Yes": 1 1 2 2 1 2 1 2 1 1 ...
##  $ Fatigue          : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SubjectiveFever  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Headache         : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ Weakness         : Factor w/ 4 levels "None","Mild",..: 2 4 4 4 3 3 2 4 3 3 ...
##  $ WeaknessYN       : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ CoughIntensity   : Factor w/ 4 levels "None","Mild",..: 4 4 2 3 1 3 4 3 3 3 ...
##  $ CoughYN2         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ Myalgia          : Factor w/ 4 levels "None","Mild",..: 2 4 4 4 2 3 2 4 3 2 ...
##  $ MyalgiaYN        : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ RunnyNose        : Factor w/ 2 levels "No","Yes": 1 1 2 2 1 1 2 2 2 2 ...
##  $ AbPain           : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ ChestPain        : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 2 1 1 1 ...
##  $ Diarrhea         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
##  $ EyePn            : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ Insomnia         : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 2 2 2 ...
##  $ ItchyEye         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Nausea           : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ EarPn            : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 1 1 1 1 1 ...
##  $ Hearing          : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Pharyngitis      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 1 1 ...
##  $ Breathless       : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 2 ...
##  $ ToothPn          : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 1 ...
##  $ Vision           : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Vomit            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ Wheeze           : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 1 1 ...
##  $ BodyTemp         : num  98.3 100.4 100.8 98.8 100.5 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 133 243 363 577 585
##   ..- attr(*, "names")= chr [1:5] "133" "243" "363" "577" ...
# column-wise preview of the same data frame
processdata %>% glimpse()
## Rows: 730
## Columns: 32
## $ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
## $ ChestCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
## $ ChillsSweats      <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
## $ NasalCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
## $ CoughYN           <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
## $ Sneeze            <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
## $ Fatigue           <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ SubjectiveFever   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
## $ Headache          <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
## $ Weakness          <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
## $ WeaknessYN        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ CoughIntensity    <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
## $ CoughYN2          <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
## $ Myalgia           <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
## $ MyalgiaYN         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ RunnyNose         <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
## $ AbPain            <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
## $ ChestPain         <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
## $ Diarrhea          <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
## $ EyePn             <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
## $ Insomnia          <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
## $ ItchyEye          <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
## $ Nausea            <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
## $ EarPn             <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
## $ Hearing           <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
## $ Pharyngitis       <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
## $ Breathless        <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
## $ ToothPn           <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
## $ Vision            <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
## $ Vomit             <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
## $ Wheeze            <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
## $ BodyTemp          <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~

For this week’s exercise, the main continuous outcome of interest is body temperature and the categorical outcome of interest is nausea. I will focus on the six covariates listed below and explore their correlation and association with the outcomes of interest.

List of covariates:

* Swollen Lymph Nodes (Yes/No)
* Fatigue (Yes/No)
* Runny Nose (Yes/No)
* Diarrhea (Yes/No)
* Breathless (Yes/No)
* Wheeze (Yes/No)

Descriptive summary table

# overall descriptive table: keep the two outcomes and the six predictors,
# then summarise every retained column
processdata %>%
  select(
    SwollenLymphNodes, Fatigue, RunnyNose, Diarrhea, Breathless, Wheeze,
    BodyTemp, Nausea
  ) %>%
  tbl_summary() %>%
  # rename the label column header and embolden the variable names
  modify_header(label ~ "**Variable**") %>%
  bold_labels() %>%
  # convert to a flextable object for rendering
  as_flex_table()

Variable

N = 7301

SwollenLymphNodes

312 (43%)

Fatigue

666 (91%)

RunnyNose

519 (71%)

Diarrhea

99 (14%)

Breathless

294 (40%)

Wheeze

220 (30%)

BodyTemp

98.50 (98.20, 99.30)

Nausea

255 (35%)

1n (%); Median (IQR)

The first table contains a descriptive summary of all outcome variables and predictors of interest. Now, I would like to explore whether the distribution of these clinical signs differs between individuals with and without nausea.

Descriptive summary table stratified by presence of nausea

# analysis data set: the six predictors plus the two outcomes
finaldata <- select(
  processdata,
  SwollenLymphNodes, Fatigue, RunnyNose, Diarrhea, Breathless, Wheeze,
  BodyTemp, Nausea
)

# descriptive table with one column per level of Nausea
tbl_summary(finaldata, by = Nausea) %>%
  modify_header(label ~ "**Variable**") %>%
  bold_labels() %>%
  as_flex_table()

Variable

No, N = 4751

Yes, N = 2551

SwollenLymphNodes

200 (42%)

112 (44%)

Fatigue

424 (89%)

242 (95%)

RunnyNose

336 (71%)

183 (72%)

Diarrhea

40 (8.4%)

59 (23%)

Breathless

164 (35%)

130 (51%)

Wheeze

137 (29%)

83 (33%)

BodyTemp

98.50 (98.20, 99.30)

98.60 (98.20, 99.30)

1n (%); Median (IQR)

As shown in the table, the prevalence of clinical symptoms such as fatigue, diarrhea and breathlessness appeared to be higher among individuals with nausea than among those without nausea.

plotting for the continuous outcome, body temperature

# Histogram of the continuous outcome, body temperature.
# The visible x range is set with coord_cartesian() rather than with
# scale limits: limits on the scale censor the binned bars that touch the
# edges of the range (ggplot2 then warns about removed rows), whereas
# coord_cartesian() only zooms and keeps every bar.
finaldata %>%
  ggplot(aes(x = BodyTemp)) +
  geom_histogram(bins = 50) +
  scale_x_continuous(breaks = seq(97, 103, 1)) +
  coord_cartesian(xlim = c(97, 104)) +
  xlab("Body temperature") +
  ylab("Frequency") +
  ggtitle("Histogram of body temperature") +
  theme_bw()
## Warning: Removed 2 rows containing missing values (geom_bar).

The range of body temperature spans from around 97 to 103 degrees, and peaks between 98 and 99 degrees. The distribution is right-skewed.

plotting for the categorical outcome, nausea

# bar chart of the categorical outcome, one bar per level of Nausea
ggplot(finaldata, aes(x = Nausea, fill = Nausea)) +
  geom_bar(width = 0.7) +
  # one fill colour per level of Nausea
  scale_fill_manual(name = "Nausea", values = c("orange", "brown")) +
  labs(
    x = "Presence of nausea",
    y = "Frequency",
    title = "Bar plot of presence of nausea"
  ) +
  theme_bw()

From the summary table, we know that 35% of individuals had nausea.

plotting for all predictors of interest

# Bar plot of every predictor of interest, one facet per predictor.
finaldata %>%
  # create a subject ID for reshaping; row_number() follows the actual number
  # of rows instead of a hard-coded sample size
  mutate(ID = row_number()) %>%
  # select predictors of interest
  select(
    "ID", "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea",
    "Breathless", "Wheeze"
  ) %>%
  # long format: one row per subject and predictor
  pivot_longer(!ID, names_to = "predictor", values_to = "count") %>%
  # create bar plot for each predictor of interest
  ggplot(aes(x = count, fill = count)) +
  geom_bar() +
  xlab("") +
  ylab("Frequency") +
  scale_fill_manual("Presence of symptom", values = c("#8ABB80", "#E66767")) +
  ggtitle("Stacked barplot for each predictor of interest") +
  # wrap multiple plots together
  facet_wrap(~predictor) +
  theme_bw()

It looks like the majority of individuals had signs of fatigue and runny nose (>50%), and diarrhea is rare. So far, we have looked at the distribution of each outcome variable and predictor of interest separately. Now, I am interested to know whether the prevalence of nausea differs between those with and without each predictor of interest. Hence, I will create a stacked barplot showing the frequency of each symptom grouped by presence of nausea.

plotting for all predictors of interest by nausea (Stacked barplot)

# stacked counts of each symptom, coloured by nausea status, one facet per
# predictor
finaldata %>%
  # keep the outcome and the six predictors
  select(
    Nausea, SwollenLymphNodes, Fatigue, RunnyNose, Diarrhea, Breathless,
    Wheeze
  ) %>%
  # long format: one row per subject and predictor
  pivot_longer(
    cols = !Nausea,
    names_to = "predictor",
    values_to = "symptom"
  ) %>%
  ggplot(aes(x = symptom, fill = Nausea)) +
  geom_bar() +
  scale_fill_manual(name = "Nausea", values = c("#8ABB80", "#E66767")) +
  labs(
    x = "Presence of symptom",
    y = "Frequency",
    title = "Stacked barplot for each predictor of interest stratefied by nausea"
  ) +
  # one panel per predictor
  facet_wrap(~predictor) +
  theme_bw()

The presentation of the plots makes it a bit difficult to compare the percentage of nausea between groups. Let’s try again and this time I will create a percent stacked barplot.

plotting for all predictors of interest by nausea (Percent stacked barplot)

# Percent stacked barplot: within each symptom level, the share of subjects
# with and without nausea, one facet per predictor.
finaldata %>%
  # select predictors of interest
  select(
    "Nausea", "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea",
    "Breathless", "Wheeze"
  ) %>%
  pivot_longer(!Nausea, names_to = "predictor", values_to = "count") %>%
  # create bar plot for each predictor of interest
  ggplot(aes(x = count, fill = Nausea)) +
  # position = "fill" rescales every bar to height 1
  geom_bar(position = "fill") +
  xlab("Presence of symptom") +
  # the y axis shows a proportion (0 to 1), not a frequency
  ylab("Proportion") +
  scale_fill_manual("Nausea", values = c("#8ABB80", "#E66767")) +
  ggtitle("Percent stacked barplot for each predictor of interest stratefied by nausea") +
  # wrap multiple plots together
  facet_wrap(~predictor) +
  theme_bw()

The barplot shows that the presence of nausea was higher among individuals with signs of breathlessness, diarrhea and fatigue. However, the presence of nausea appears to be similar between those with and without runny nose, swollen lymph nodes or wheeze. In order to explore the relation between the continuous outcome variable, body temperature, and each predictor, I will also create a boxplot for each predictor.

# Grouped boxplots of body temperature by symptom status and nausea, one
# facet per predictor.
finaldata %>%
  # select predictors of interest
  select(
    "BodyTemp", "Nausea", "SwollenLymphNodes", "Fatigue", "RunnyNose",
    "Diarrhea", "Breathless", "Wheeze"
  ) %>%
  # reshape every column except the two outcomes; selecting by name keeps
  # this correct if the column order in select() changes
  pivot_longer(
    !c(BodyTemp, Nausea),
    names_to = "predictor",
    values_to = "count"
  ) %>%
  # grouped boxplot
  ggplot(aes(x = count, y = BodyTemp, fill = Nausea)) +
  geom_boxplot() +
  xlab("") +
  # spell out `scales` in full rather than relying on partial argument matching
  facet_wrap(~predictor, scales = "free")