# Load the processed data set; here::here() builds the path to
# data/processed_data/processeddata.rds
processdata <- readRDS(
  file = here::here("data", "processed_data", "processeddata.rds")
)
# Inspect the structure: column types and the first few values of each column
str(processdata)
## 'data.frame': 730 obs. of 32 variables:
## $ SwollenLymphNodes: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 2 1 ...
## $ ChestCongestion : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 2 2 ...
## $ ChillsSweats : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 2 2 1 ...
## $ NasalCongestion : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 2 2 ...
## $ CoughYN : Factor w/ 2 levels "No","Yes": 2 2 1 2 1 2 2 2 2 2 ...
## $ Sneeze : Factor w/ 2 levels "No","Yes": 1 1 2 2 1 2 1 2 1 1 ...
## $ Fatigue : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ SubjectiveFever : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
## $ Headache : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ Weakness : Factor w/ 4 levels "None","Mild",..: 2 4 4 4 3 3 2 4 3 3 ...
## $ WeaknessYN : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ CoughIntensity : Factor w/ 4 levels "None","Mild",..: 4 4 2 3 1 3 4 3 3 3 ...
## $ CoughYN2 : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ Myalgia : Factor w/ 4 levels "None","Mild",..: 2 4 4 4 2 3 2 4 3 2 ...
## $ MyalgiaYN : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ RunnyNose : Factor w/ 2 levels "No","Yes": 1 1 2 2 1 1 2 2 2 2 ...
## $ AbPain : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ ChestPain : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 2 1 1 1 ...
## $ Diarrhea : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
## $ EyePn : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ Insomnia : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 2 2 2 ...
## $ ItchyEye : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Nausea : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ EarPn : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 1 1 1 1 1 ...
## $ Hearing : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ Pharyngitis : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 1 1 ...
## $ Breathless : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 2 ...
## $ ToothPn : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 2 1 ...
## $ Vision : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Vomit : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ Wheeze : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 1 1 ...
## $ BodyTemp : num 98.3 100.4 100.8 98.8 100.5 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 133 243 363 577 585
## ..- attr(*, "names")= chr [1:5] "133" "243" "363" "577" ...
# Compact preview of the same data: one line per column with its type
processdata %>%
  glimpse()
## Rows: 730
## Columns: 32
## $ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
## $ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
## $ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
## $ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
## $ CoughYN <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
## $ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
## $ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
## $ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
## $ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
## $ WeaknessYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
## $ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
## $ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
## $ MyalgiaYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
## $ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
## $ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
## $ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
## $ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
## $ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
## $ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
## $ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
## $ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
## $ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
## $ Hearing <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
## $ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
## $ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
## $ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
## $ Vision <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
## $ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
## $ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
## $ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~
For this week’s exercise, the main continuous outcome of interest is body temperature and the main categorical outcome of interest is nausea. I will focus on the six covariates listed below and explore their associations with the outcomes of interest.

List of covariates:

* Swollen Lymph Nodes (Yes/No)
* Fatigue (Yes/No)
* Runny Nose (Yes/No)
* Diarrhea (Yes/No)
* Breathless (Yes/No)
* Wheeze (Yes/No)
# Descriptive summary table of the six predictors and the two outcomes
# (BodyTemp and Nausea)
processdata %>%
  # keep only the variables used in this analysis
  select(
    all_of(c(
      "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea", "Breathless",
      "Wheeze", "BodyTemp", "Nausea"
    ))
  ) %>%
  tbl_summary() %>%
  # rename the label column header and render labels in bold
  modify_header(label ~ "**Variable**") %>%
  bold_labels() %>%
  # convert to a flextable for rendering
  as_flex_table()
Variable | N = 7301 |
SwollenLymphNodes | 312 (43%) |
Fatigue | 666 (91%) |
RunnyNose | 519 (71%) |
Diarrhea | 99 (14%) |
Breathless | 294 (40%) |
Wheeze | 220 (30%) |
BodyTemp | 98.50 (98.20, 99.30) |
Nausea | 255 (35%) |
1n (%); Median (IQR) |
The first table contains a descriptive summary of all outcome variables and predictors of interest. Next, I would like to explore whether the distribution of these clinical signs differs between individuals with and without nausea.
# Analysis data set: the six predictors of interest plus both outcomes
finaldata <- select(
  processdata,
  all_of(c(
    "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea", "Breathless",
    "Wheeze", "BodyTemp", "Nausea"
  ))
)

# Summary table with one column per level of Nausea
finaldata %>%
  tbl_summary(by = Nausea) %>%
  # rename the label column header and render labels in bold
  modify_header(label ~ "**Variable**") %>%
  bold_labels() %>%
  # convert to a flextable for rendering
  as_flex_table()
Variable | No, N = 4751 | Yes, N = 2551 |
SwollenLymphNodes | 200 (42%) | 112 (44%) |
Fatigue | 424 (89%) | 242 (95%) |
RunnyNose | 336 (71%) | 183 (72%) |
Diarrhea | 40 (8.4%) | 59 (23%) |
Breathless | 164 (35%) | 130 (51%) |
Wheeze | 137 (29%) | 83 (33%) |
BodyTemp | 98.50 (98.20, 99.30) | 98.60 (98.20, 99.30) |
1n (%); Median (IQR) |
As shown in the table, the prevalence of fatigue, diarrhea, and breathlessness appeared to be higher among individuals with nausea than among those without nausea.
# Histogram of body temperature (50 bins)
finaldata %>%
  ggplot(aes(x = BodyTemp)) +
  geom_histogram(bins = 50) +
  # the scale only fixes the tick positions
  scale_x_continuous(breaks = seq(97, 103, 1)) +
  # Zoom with coord_cartesian() instead of scale limits: limits set on the
  # scale discard histogram bars that extend past them (ggplot2 reports this
  # as "Removed ... rows containing missing values"), whereas
  # coord_cartesian() only changes the visible window.
  # Note: bins are therefore computed over the range of the data.
  coord_cartesian(xlim = c(97, 104)) +
  xlab("Body temperature") +
  ylab("Frequency") +
  ggtitle("Histogram of body temperature") +
  theme_bw()
## Warning: Removed 2 rows containing missing values (geom_bar).
Body temperature ranges from around 97 to 103 degrees, with most observations falling between 98 and 99 degrees. The distribution is right-skewed.
# Bar chart of the number of individuals in each Nausea category
ggplot(finaldata, aes(x = Nausea, fill = Nausea)) +
  geom_bar(width = 0.7) +
  # one fill colour per Nausea level; legend titled "Nausea"
  scale_fill_manual(name = "Nausea", values = c("orange", "brown")) +
  theme_bw() +
  ggtitle("Bar plot of presence of nausea") +
  xlab("Presence of nausea") +
  ylab("Frequency")
From the summary table, we know that 35% of individuals had nausea.
# Frequency of "No"/"Yes" for each predictor of interest, one panel per
# predictor
finaldata %>%
  # select predictors of interest
  select(
    "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea", "Breathless",
    "Wheeze"
  ) %>%
  # Reshape to long format: one row per (individual, predictor) pair.
  # everything() pivots all selected columns, so no helper ID column with a
  # hard-coded number of rows is needed.
  pivot_longer(everything(), names_to = "predictor", values_to = "response") %>%
  # create bar plot for each predictor of interest
  ggplot(aes(x = response, fill = response)) +
  geom_bar() +
  xlab("") +
  ylab("Frequency") +
  scale_fill_manual(
    "Presence of symptom",
    values = c("#8ABB80", "#E66767")
  ) +
  ggtitle("Stacked barplot for each predictor of interest") +
  # wrap multiple plots together
  facet_wrap(~predictor) +
  theme_bw()
It looks like the majority of individuals had fatigue and a runny nose (>50%), while diarrhea was rare. So far, we have looked at the distribution of each outcome variable and predictor of interest separately. Now, I am interested in whether the prevalence of nausea differs between those with and without each predictor of interest. Hence, I will plot a stacked barplot showing the frequency of each symptom grouped by presence of nausea.
# Counts of each predictor's "No"/"Yes" responses, with bars split by Nausea;
# one panel per predictor
finaldata %>%
  # select predictors of interest together with the Nausea outcome
  select(
    "Nausea", "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea",
    "Breathless", "Wheeze"
  ) %>%
  # long format: every column except Nausea becomes a (predictor, response)
  # pair
  pivot_longer(!Nausea, names_to = "predictor", values_to = "response") %>%
  # create bar plot for each predictor of interest
  ggplot(aes(x = response, fill = Nausea)) +
  geom_bar() +
  xlab("Presence of symptom") +
  ylab("Frequency") +
  scale_fill_manual("Nausea", values = c("#8ABB80", "#E66767")) +
  # title spelling corrected ("stratified")
  ggtitle(
    "Stacked barplot for each predictor of interest stratified by nausea"
  ) +
  # wrap multiple plots together
  facet_wrap(~predictor) +
  theme_bw()
The presentation of the plots makes it a bit difficult to compare the percentage of nausea between groups. Let’s try again and this time I will create a percent stacked barplot.
# Share of individuals with and without nausea within each "No"/"Yes"
# response, one panel per predictor
finaldata %>%
  # select predictors of interest together with the Nausea outcome
  select(
    "Nausea", "SwollenLymphNodes", "Fatigue", "RunnyNose", "Diarrhea",
    "Breathless", "Wheeze"
  ) %>%
  # long format: every column except Nausea becomes a (predictor, response)
  # pair
  pivot_longer(!Nausea, names_to = "predictor", values_to = "response") %>%
  # create bar plot for each predictor of interest
  ggplot(aes(x = response, fill = Nausea)) +
  # position = "fill" rescales every bar to height 1, so the y axis shows
  # proportions rather than counts
  geom_bar(position = "fill") +
  xlab("Presence of symptom") +
  ylab("Proportion") +
  scale_fill_manual("Nausea", values = c("#8ABB80", "#E66767")) +
  # title spelling corrected ("stratified")
  ggtitle(
    "Percent stacked barplot for each predictor of interest stratified by nausea"
  ) +
  # wrap multiple plots together
  facet_wrap(~predictor) +
  theme_bw()
The barplot shows that nausea was more common among individuals with breathlessness, diarrhea, or fatigue. However, the proportion with nausea appears to be similar between those with and without a runny nose, swollen lymph nodes, or wheeze. To explore the relationship between the continuous outcome variable, body temperature, and each predictor, I will also create a boxplot for each predictor.
# Boxplots of body temperature by each predictor's "No"/"Yes" response,
# grouped by Nausea; one panel per predictor
finaldata %>%
  # select both outcomes and the predictors of interest
  select(
    "BodyTemp", "Nausea", "SwollenLymphNodes", "Fatigue", "RunnyNose",
    "Diarrhea", "Breathless", "Wheeze"
  ) %>%
  # Pivot every column except the two outcomes. Selecting by name rather than
  # by position (3:8) keeps this correct if the column order changes.
  pivot_longer(
    !c(BodyTemp, Nausea),
    names_to = "predictor",
    values_to = "response"
  ) %>%
  # grouped boxplot
  ggplot(aes(x = response, y = BodyTemp, fill = Nausea)) +
  geom_boxplot() +
  xlab("") +
  # `scales` spelled out in full instead of relying on partial matching of
  # `scale`
  facet_wrap(~predictor, scales = "free")