library(DBI)
library(tidyverse)
library(data.table)
library(ggplot2)
library(ggbeeswarm)
con <- DBI::dbConnect(RPostgreSQL::PostgreSQL(), 
                      dbname="kooplex-ebi",
                      host = "157.181.153.9",
                      port = "6543",
                      user = "kooplex_ebi_read_user",
                      password = "secret"
)

Allele frequency of defining mutations of specific Pangolin variants

voc_data <- tbl(con, "lineage_def") %>%
  collect() %>% as.data.table()
voc_data[,.(num_mutations=.N),.(pango,description,variant_id)]

Analysis of the allele frequency distributions per mutation for each lineage

B.1.1.318

variant <- "VUI202102/04"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  scale_y_continuous(limits = c(0.1,1)) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
  scale_y_continuous(limits = c(0.1,1)) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold

if (nrow(variant_data)>0) {
  variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
  plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
  plotdata[,af_cat:=as.numeric(as.character(af_cat))]
  
  ggplot(plotdata) + 
    geom_point(aes(x=af_cat, y=cum_freq)) +
    scale_x_continuous(limits = c(0.1,1)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle=45,hjust=1)) +
    labs(x="AF threshold",y="N_samples")
}

B.1.351

variant <- "VOC202012/02"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold

if (nrow(variant_data)>0) {
  variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
  plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
  plotdata[,af_cat:=as.numeric(as.character(af_cat))]
  
  ggplot(plotdata) + 
    geom_point(aes(x=af_cat, y=cum_freq)) + 
    theme_bw() +
    theme(axis.text.x = element_text(angle=45,hjust=1)) +
    labs(x="AF threshold",y="N_samples")
}

A.23.1

variant <- "VUI202102/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold

if (nrow(variant_data)>0) {
  variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
  plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
  plotdata[,af_cat:=as.numeric(as.character(af_cat))]
  
  ggplot(plotdata) + 
    geom_point(aes(x=af_cat, y=cum_freq)) + 
    theme_bw() +
    theme(axis.text.x = element_text(angle=45,hjust=1)) +
    labs(x="AF threshold",y="N_samples")
}

B.1.324.1

variant <- "VUI202103/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold

if (nrow(variant_data)>0) {
  variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
  plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
  plotdata[,af_cat:=as.numeric(as.character(af_cat))]
  
  ggplot(plotdata) + 
    geom_point(aes(x=af_cat, y=cum_freq)) + 
    theme_bw() +
    theme(axis.text.x = element_text(angle=45,hjust=1)) +
    labs(x="AF threshold",y="N_samples")
}

B.1.1.7

variant <- "VOC202012/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold

if (nrow(variant_data)>0) {
  variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
  plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
  plotdata[,af_cat:=as.numeric(as.character(af_cat))]
  
  ggplot(plotdata) + 
    geom_point(aes(x=af_cat, y=cum_freq)) + 
    theme_bw() +
    theme(axis.text.x = element_text(angle=45,hjust=1)) +
    labs(x="AF threshold",y="N_samples")
}

B.1.525

variant <- "VUI202102/03"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)

variant_data <- tbl(con, "vcf") %>%
  filter(pos %in% variant_positions) %>%
  filter(ann_num==1) %>%
  filter(af>0.1) %>%
  inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
  collect() %>% as.data.table()

Allele frequency distribution per mutation (AA level)

if (nrow(variant_data)>0) {
  ggplot(variant_data) + 
  geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
  facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,hjust=1)) +
  labs(y="Allele frequency", x="AA variation")
}