Using Regular Expressions to Filter Data with the Tidyverse for More Accurate Matches

Here’s how you can use the tidyverse and do some matching by regular expressions to filter your data:

library(tidyverse)

# Example inputs, written row by row so each record reads across one line.
# `Data` holds one row per `Pos` entry; every column is stored as text.
Data <- tribble(
  ~Name,      ~Pos, ~Output,
  "100",      "A3", "20.00",
  "100",      "A4", "20.10",
  "200",      "B3", "21.67",
  "250",      "B4", "23.24",
  "1E5",      "C3", "21.97",
  "1E5",      "C4", "22.03",
  "Negative", "D3", "38.99",
  "Negative", "D4", "38.99"
)

# `Replicates` holds one row per replicate group: a comma-separated pair of
# `Pos` values plus the numeric `Mean.Cq` and `STD.Cq` columns for that group.
Replicates <- tribble(
  ~Replicates, ~Mean.Cq, ~STD.Cq,
  "A3, A4",    20.05,    0.05,
  "C3, C4",    22.00,    0.03,
  "D3, D4",    38.99,    0.00
)

# Split `Replicates$Replicates` into two fields, `R1` and `R2`.
# `R1` is the text before the first comma and `R2` is the text after it.
# `trimws()` removes surrounding blanks, so "A3, A4", "A3,A4" and "A3 , A4"
# all yield "A3" and "A4" (the previous `R2` pattern required exactly one
# whitespace character after the comma and otherwise left the value unsplit).
# A value without any comma is copied, trimmed, into both fields.
Replicates$R1 <- trimws(
  sub(pattern = ",.*$", replacement = "", x = Replicates$Replicates)
)
Replicates$R2 <- trimws(
  sub(pattern = "^[^,]*,", replacement = "", x = Replicates$Replicates)
)

# Match `Data` rows against each of the Replicates fields.
# `df_R1` keeps the `Data` rows whose `Pos` equals a replicate's `R1`, and
# `df_R2` those whose `Pos` equals its `R2`. Joining on the split fields
# (rather than hard-coding positions) is what brings `Mean.Cq` and `STD.Cq`
# in; `Data` itself has no such columns, so selecting them directly fails.
df_R1 <- Data %>%
  inner_join(Replicates, by = c("Pos" = "R1")) %>%
  select(Name, Pos, Mean.Cq, STD.Cq)
df_R2 <- Data %>%
  inner_join(Replicates, by = c("Pos" = "R2")) %>%
  select(Name, Pos, Mean.Cq, STD.Cq)

# Left-join the matches back onto `Data` so that rows whose `Pos` is not part
# of any replicate group are kept, with `NA` in `Mean.Cq` and `STD.Cq`.
df <- Data %>%
  left_join(bind_rows(df_R1, df_R2), by = c("Name", "Pos"))

# One row per distinct Name / Mean.Cq / STD.Cq combination.
df %>%
  distinct(Name, Mean.Cq, STD.Cq)
#> # A tibble: 5 x 3
#>   Name     Mean.Cq STD.Cq
#>   <chr>      <dbl>  <dbl>
#> 1 100         20.0   0.05
#> 2 200         NA    NA   
#> 3 250         NA    NA   
#> 4 1E5         22     0.03
#> 5 Negative    39.0   0 

This will give you the same results as your original answer, but with a few extra steps to make sure we’re only matching data that actually exists in Data.


Last modified on 2024-01-07