Finding Consensus in Two Out of Three Columns and Summarizing Them in R
In this article, we will explore how to find consensus among two out of three identical samples in a dataset. We’ll use the dplyr
package in R for data manipulation and summarization tasks.
Background
The problem arises when dealing with technical replicate samples (e.g., MDA_1, MDA_2, MDA_3) analysis needs to be done between three such identical samples at a time. We need to calculate the consensus of two out of three columns for each row and summarize them accordingly.
Step 1: Melting the Data
The first step is to melt the data into a long format using melt()
. This function drops the ID column, which we can later reattach if needed.
# Load necessary libraries
library(dplyr)
# Melt the data
genus_sub_melted <- melt(genus_sub, id = "ID")
Step 2: Dropping Counter in Column Names
To group by variable
instead of its values, we need to remove the counter from column names. We can use substr()
for this purpose.
# Drop counter in column names
genus_sub_melted$variable <- substr(genus_sub_melted$variable, 1, nchar(as.character(genus_sub_melted$variable)) - 2)
Step 3: Grouping by ID and Variable
Next, we group the data by ID
and variable
, then summarize the values.
# Group by ID and variable, then summarize
genus_sub_melted <- genus_sub_melted %>%
group_by(ID, variable) %>%
summarise(value = sum(value))
Step 4: Casting Data
Finally, we cast the data back to a wide format using dcast()
, with ID
on the x-axis and variable
on the y-axis.
# Cast data
genus_sub_melted <- genus_sub_melted %>%
dcast(ID ~ variable, sum)
Step 5: Calculating Consensus
To calculate consensus among two out of three columns for each row, we can replace 2
and 3
with 1
in the output.
# Replace 2 and 3 with 1
genus_sub_melted$GutREF001.1_MDA <- ifelse(genus_sub_melted$GutREF001.1_MDA == 2, 1, 0)
genus_sub_melted$GutREF001.2_MDA <- ifelse(genus_sub_melted$GutREF001.2_MDA == 2, 1, 0)
genus_sub_melted$GutREF001.3_MDA <- ifelse(genus_sub_melted$GutREF001.3_MDA == 2, 1, 0)
# Calculate consensus
genus_sub_melted$consensus_1 <- sum(genus_sub_melted$GutREF001.1_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.2_MDA))
genus_sub_melted$consensus_2 <- sum(genus_sub_melted$GutREF001.2_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.3_MDA))
genus_sub_melted$consensus_3 <- sum(genus_sub_melted$GutREF001.3_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.2_MDA))
# Calculate summary metrics
genus_sub_melted$sample_consensus_detected <- sum(genus_sub_melted$consensus_1 == 1)
genus_sub_melted$sample_consensus_not_detected <- nrow(genus_sub_melted) - sum(genus_sub_melted$consensus_1 == 1)
# Calculate summary metrics
genus_sub_melted$summary_metric_1 <- genus_sub_melted$sample_consensus_detected / (genus_sub_melted$sample_consensus_detected + genus_sub_melted$sample_consensus_not_detected)
genus_sub_melted$summary_metric_2 <- genus_sub_melted$sample_consensus_detected / (genus_sub_melted$sample_consensus_detected + genus_sub_melted$sample_consensus_not_detected)
Step 6: Combining the Code into a Function
Here is the complete code combined into a function:
find_consensus <- function(genus_sub) {
# Load necessary libraries
library(dplyr)
# Melt the data
genus_sub_melted <- melt(genus_sub, id = "ID")
# Drop counter in column names
genus_sub_melted$variable <- substr(genus_sub_melted$variable, 1, nchar(as.character(genus_sub_melted$variable)) - 2)
# Group by ID and variable, then summarize
genus_sub_melted <- genre_sub_melted %>%
group_by(ID, variable) %>%
summarise(value = sum(value))
# Cast data
genus_sub_melted <- genus_sub_melted %>%
dcast(ID ~ variable, sum)
# Replace 2 and 3 with 1
genus_sub_melted$GutREF001.1_MDA <- ifelse(genus_sub_melted$GutREF001.1_MDA == 2, 1, 0)
genus_sub_melted$GutREF001.2_MDA <- ifelse(genus_sub_melted$GutREF001.2_MDA == 2, 1, 0)
genus_sub_melted$GutREF001.3_MDA <- ifelse(genus_sub_melted$GutREF001.3_MDA == 2, 1, 0)
# Calculate consensus
genus_sub_melted$consensus_1 <- sum(genus_sub_melted$GutREF001.1_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.2_MDA))
genus_sub_melted$consensus_2 <- sum(genus_sub_melted$GutREF001.2_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.3_MDA))
genus_sub_melted$consensus_3 <- sum(genus_sub_melted$GutREF001.3_MDA) / (sum(genus_sub_melted$GutREF001.1_MDA) + sum(genus_sub_melted$GutREF001.2_MDA))
# Calculate summary metrics
genus_sub_melted$sample_consensus_detected <- sum(genus_sub_melted$consensus_1 == 1)
genus_sub_melted$sample_consensus_not_detected <- nrow(genus_sub_melted) - sum(genus_sub_melted$consensus_1 == 1)
# Calculate summary metrics
genus_sub_melted$summary_metric_1 <- genus_sub_melted$sample_consensus_detected / (genus_sub_melted$sample_consensus_detected + genus_sub_melted$sample_consensus_not_detected)
genus_sub_melted$summary_metric_2 <- genus_sub_melted$sample_consensus_detected / (genus_sub_melted$sample_consensus_detected + genus_sub_melted$sample_consensus_not_detected)
return(genus_sub_melted)
}
# Example usage
data <- data.frame(GutREF001.1_MDA = c(2,3,4),
GutREF001.2_MDA = c(3,2,4),
GutREF001.3_MDA = c(4,3,2))
result <- find_consensus(data)
# Print the result
print(result)
I hope this helps! Let me know if you have any questions or need further clarification.
Last modified on 2023-12-20