Already determined T cell epitopes were downloaded (https://www.iedb.org/) (“/v/projects/geno2pheno-kpapp/small_data/Tcell”)
The COVEO database will be searched for these epitopes
library(tidyverse)
library(openxlsx)
library(DBI)
config <- config::get()
con <- dbConnect(
drv = RPostgreSQL::PostgreSQL(),
dbname = config$dbname,
host = config$host,
port = config$port,
user = config$user,
password = config$password,
option = config$option
)
selected_schema <- str_remove(config$option, pattern = "-c search_path=")
The table below is the original download table:
f <- "/v/projects/geno2pheno-kpapp/small_data/Tcell/CD4_MHC_II_epitope_table_export_1693249691.xlsx"
epitope_raw <- read_excel(f, skip = 1)[,1:17]
New names:
names(epitope_raw) <- c("iedb_iri", "object_type", "name", "modified_residue",
"modifications", "start", "end", "iri", "synonyms",
"source_molecule", "source_molecule_iri", "molecule_parent",
"molecule_parent_iri", "source_organism",
"source_organism_iri", "species", "species_iri")
epitope_raw
There are a few rows, where the start
and end
positions are not filled, these will be removed.
epitope <- epitope_raw %>%
drop_na(name, start, end)
epitope
Based on the table it looks that these sequences derived from reference sequences, but need to check it.
The reference sequence is here: https://www.ncbi.nlm.nih.gov/nuccore/MN908947 Spike protein (QHD43416.1) is coded in the reference sequence position: 21563-25384
Database will be searched for the reference Spike protein sequence:
select * from datahub_0.translated_aa_ref_sequence(21563, 25384)
spike_seq <- str_c(spike_seq$aa_symbol, collapse = "")
spike_seq
[1] "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"
str_length(spike_seq)
[1] 1273
The Spike protein contains 1273 amino acides
Check if the name
column of the epitope table contains really the reference sequence.
for (i in 1:nrow(epitope)) {
ref <- str_sub(spike_seq,
start = epitope[i, 'start'],
end = epitope[i, 'end'])
if (epitope[i, 'name'] == ref) epitope[i, 'match_with_ref'] <- TRUE else epitope[i, 'match_with_ref'] <- FALSE
}
Lets see the result of the comparison
table(epitope$match_with_ref)
FALSE TRUE
22 563
There are 22 epitopes where the name
column does not match with the reference.
These are the epitopes those do not match:
epitope[epitope$match_with_ref==FALSE,]
Later needs to focus on these too, but in the first round I will remove them from the list.
The table below contains the epitopes those derived from reference sequence:
epitope_selected <- epitope %>%
filter(match_with_ref) %>%
separate(col = iedb_iri, into = c(NA, NA, NA, NA, "iedb_id"),
sep = "/", remove = FALSE)
epitope_selected
In the next step those barcodes (and samples) will be selected that contains mutation between the S protein epitope defined start-end position. These samples contain altered T-cell target sequence that maybe not recognized anymore by the given T-cell clone.
d <- tibble(collection_date = Date())
for (i in 1:nrow(epitope_selected)){
#for (i in 1:2){
start <- epitope_selected[i, 'start']
end <- epitope_selected[i, 'end']
id <- str_c("id_", as.character(epitope_selected[i, 'iedb_id']))
sql_query1 <- glue_sql("
SELECT collection_date, count(*)
FROM (SELECT distinct(barcode_id) FROM datahub_0.barcodes_with_mutation_positions({start}, {end})) as f
JOIN datahub_0.aa_mutation_new amn
ON amn.barcode_id = f.barcode_id
GROUP BY collection_date
", .con = con)
x <- dbGetQuery(con, sql_query1) %>%
drop_na()
names(x) <- c("collection_date", id)
d <- d %>%
full_join(x, by = "collection_date")
}
save(d, file = "/v/projects/geno2pheno-kpapp/kpapp/t_cell_epitop/d_CD4_MHC_II_epitope_table_export_1693249691.Rdata")
Summarize the counts for weeks:
iedb_id
column is the id of a given epitope (eg. “id_100428”) that can be used to get further information (eg. https://www.iedb.org/epitope/100428)How many sample was collected on a given week? (This will be the background)
SELECT collection_date, count(*)
FROM aa_mutation_new
GROUP BY collection_date
names(all_samples) <- c("collection_date", "all")
all_samples <- all_samples %>%
mutate(iso_year=isoyear(collection_date),
iso_week= isoweek(collection_date)) %>%
filter(iso_year > 2020,
iso_year < 2024) %>%
mutate(iso_week = ifelse(iso_week < 10, str_c("0", as.character(iso_week)),
as.character(iso_week))) %>%
mutate(date = ISOweek2date(paste(iso_year, "-W", iso_week, "-1", sep = ""))) %>%
select(!c("iso_year", "iso_week", "collection_date")) %>%
mutate(date = as.character(date)) %>%
group_by(date) %>%
summarise_at(c("all"), sum)
all_samples$date <- ymd(all_samples$date)
ggplot(data = all_samples, aes(x = date, y = all)) +
geom_line(show.legend = FALSE) +
labs(title = "All collected samples",
x = "Collection date (weekly)",
y = "Number of sample") +
theme_minimal()
Visualize the number of samples in time those maybe escaped from a given T-cell clone:
ggplot(data = d, aes(x = date, y = count, color = iedb_id)) +
geom_line(show.legend = FALSE) +
labs(title = "Potentially escaped samples",
x = "Collection date (weekly)",
y = "Number of sample") +
theme_minimal()
It is also important to know how many samples were collected on a given week
ggplot(data = d, aes(x = date, y = count, color = iedb_id)) +
geom_line(show.legend = FALSE) +
geom_line(data = all_samples, aes(x = date, y = all), show.legend = FALSE,
color = "black", size = 2) +
labs(title = "Potentially escaped samples",
subtitle = "(Black represent all sample on a givek week)",
x = "Collection date (weekly)",
y = "Number of sample") +
theme_minimal()
Percentage of the potentially escaped samples:
x <- d %>%
pivot_wider(names_from = "iedb_id", values_from = "count") %>%
inner_join(all_samples, by = "date")
for (i in 2:(ncol(x)-1)){
x[,i] <- x[,i]/x[,"all"]*100
}
x <- x %>%
select(-all) %>%
pivot_longer(cols = starts_with("id_"), names_to = "iedb_id",
values_to = "count")
x$date <- ymd(x$date)
ggplot(data = x, aes(x = date, y = count, color = iedb_id)) +
geom_line(show.legend = FALSE) +
labs(title = "Potentially escaped samples",
x = "Collection date (weekly)",
y = "Percentage of samples") +
theme_minimal()
There are 563 epitopes in the epitope table. Which region of the spike protein is covered by these epitopes?
x <- epitope_selected %>%
select(iedb_id, start, end) %>%
mutate(start = as.numeric(start),
end = as.numeric(end)) %>%
arrange(start)
x$y <- 1:nrow(x)
x <- x %>%
pivot_longer(cols = start:end, values_to = "x")
ggplot(data = x, aes(x = x, y = y, fill = iedb_id)) +
geom_line(alpha = 0.4) +
labs(title = "Distribution of T-cell epitopes on spike protein",
x = "Amino acide position",
y = "") +
theme_minimal() +
theme(aspect.ratio=2/6)