Allele frequency of defining mutations of specific Pangolin variants
voc_data <- tbl(con, "lineage_def") %>%
collect() %>% as.data.table()
voc_data[,.(num_mutations=.N),.(pango,description,variant_id)]
Analysis of the allele frequency distributions per mutation for each lineage
B.1.1.318
variant <- "VUI202102/04"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
scale_y_continuous(limits = c(0.1,1)) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}
Allele frequency distribution per mutation (nt level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
scale_y_continuous(limits = c(0.1,1)) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="nt variation")
}
Number of samples with all variant mutations as a function of AF threshold
if (nrow(variant_data)>0) {
variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
plotdata[,af_cat:=as.numeric(as.character(af_cat))]
ggplot(plotdata) +
geom_point(aes(x=af_cat, y=cum_freq)) +
scale_x_continuous(limits = c(0.1,1)) +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(x="AF threshold",y="N_samples")
}
B.1.351
variant <- "VOC202012/02"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold
if (nrow(variant_data)>0) {
variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
plotdata[,af_cat:=as.numeric(as.character(af_cat))]
ggplot(plotdata) +
geom_point(aes(x=af_cat, y=cum_freq)) +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(x="AF threshold",y="N_samples")
}

A.23.1
variant <- "VUI202102/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold
if (nrow(variant_data)>0) {
variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
plotdata[,af_cat:=as.numeric(as.character(af_cat))]
ggplot(plotdata) +
geom_point(aes(x=af_cat, y=cum_freq)) +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(x="AF threshold",y="N_samples")
}

B.1.324.1
variant <- "VUI202103/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}
Allele frequency distribution per mutation (nt level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="nt variation")
}
Number of samples with all variant mutations as a function of AF threshold
if (nrow(variant_data)>0) {
variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
plotdata[,af_cat:=as.numeric(as.character(af_cat))]
ggplot(plotdata) +
geom_point(aes(x=af_cat, y=cum_freq)) +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(x="AF threshold",y="N_samples")
}
B.1.1.7
variant <- "VOC202012/01"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}

Allele frequency distribution per mutation (nt level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',ref,'>',alt), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="nt variation")
}

Number of samples with all variant mutations as a function of AF threshold
if (nrow(variant_data)>0) {
variant_data[,af_cat:=cut(af,seq(0,1,0.01), labels = seq(0.01,1,0.01))]
plotdata <- variant_data[,.N,.(ena_run,af_cat)][order(af_cat,decreasing = T),.(cum_freq=cumsum(N),af_cat),.(ena_run)][cum_freq==n_var,.N,af_cat][order(af_cat, decreasing = T),.(af_cat,cum_freq=cumsum(N))]
plotdata[,af_cat:=as.numeric(as.character(af_cat))]
ggplot(plotdata) +
geom_point(aes(x=af_cat, y=cum_freq)) +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(x="AF threshold",y="N_samples")
}

B.1.525
variant <- "VUI202102/03"
voc_data[variant_id==variant, .(gene, pos, ref_pos_alt,codon_change, amino_acid_change)]
variant_positions <- voc_data[variant_id==variant, pos]
n_var <- length(variant_positions)
variant_data <- tbl(con, "vcf") %>%
filter(pos %in% variant_positions) %>%
filter(ann_num==1) %>%
filter(af>0.1) %>%
inner_join(filter(tbl(con, "lineage10pct"), variant_id==variant), by = "ena_run" )%>%
collect() %>% as.data.table()
Allele frequency distribution per mutation (AA level)
if (nrow(variant_data)>0) {
ggplot(variant_data) +
geom_quasirandom(aes(x=paste0(pos,' ',hgvs_p), y=af), alpha=0.1) +
facet_grid(. ~ gene_name, scales="free_x", space = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle=45,hjust=1)) +
labs(y="Allele frequency", x="AA variation")
}
