from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Other notation of the same mutation: p.Gln613His
This notebook was run on 15th October 2021, when 613,984 samples (from the 1,654,147 of sequenced samples at covid19dataportal) have been processed by the variant calling pipeline. The weekly number of samples is not uniform and we have a lag, so many of the recent samples are not processed yet.
import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
def psql(query):
conn = psycopg2.connect("dbname=coveo user=****** password=******* host=postgres.coveo port=5432")
dummy = sqlio.read_sql_query(query, conn)
conn.close()
return dummy
There are four so called defining mutations. We have thelineage_def
table in the SQL database that contains info for all catalogued lineages.
query = '''
SELECT * FROM lineage_def
WHERE variant_id='Delta'
'''
lineage_delta = psql(query)
lineage_delta
First check how many Delta variant mutations are present in samples at each week. Here we collect information by combining the vcf
, the lineage_def
and the meta
tables. They contain the genomic position and mutation, variant definition and dates and other metadata, repectively. In the figures we mark May 2021 as the date when Delta variants start to dominate.
%%time
query = '''
SELECT m.*, l.amino_acid_change, v.af, v.dp
FROM vcf v JOIN lineage_def l on v.pos=l.pos and v.ref=l.ref and v.alt=l.alt
JOIN meta m on v.ena_run = m.ena_run
WHERE ann_num=1
AND af>3/SQRT(dp)
AND variant_id = 'Delta'
AND date_isoyear IN (2020,2021);
'''
delta_all = psql(query)
print(delta_all.shape)
# (481998, 42)
# CPU times: user 8.34 s, sys: 1.63 s, total: 9.98 s
# Wall time: 25.3 s
Convert year and week number to proper date type.
_years = delta_all.date_isoyear.values.astype(int).astype(str)
_weeks = delta_all.date_isoweek.values.astype(int).astype(str)
_sep = ['-']*len(_years)
_dayofweek = ['-1']*len(_years)
_ = np.core.defchararray.add(np.core.defchararray.add(np.core.defchararray.add(_years,_sep),_weeks),_dayofweek)
weekDate = [datetime.datetime.strptime(_d, "%Y-%W-%w") for _d in _]
delta_all['weekDate'] = weekDate
Count the number of samples that contain the different defining mutations.
delta_all_weekly = delta_all.groupby(['weekDate','amino_acid_change']).size().reset_index().rename(columns={0:'cnt'})
fig, ax = plt.subplots(2,1,figsize=[16,8])
fig.suptitle('Weekly # of samples with various Delta defining mutations on linear and logarithmic scales')
for aa in lineage_delta.amino_acid_change.values:
_ = delta_all_weekly[delta_all_weekly.amino_acid_change==aa]
ax[0].plot(_.weekDate,_.cnt,label=aa)
ax[0].set_xlabel("date")
ax[0].set_ylabel("# of samples")
ax[0].legend()
ax[0].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[0].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
for aa in lineage_delta.amino_acid_change.values:
_ = delta_all_weekly[delta_all_weekly.amino_acid_change==aa]
ax[1].semilogy(_.weekDate,_.cnt,label=aa)
ax[1].set_xlabel("date")
ax[1].set_ylabel("# of samples")
ax[1].legend()
ax[1].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[1].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
In one sample one or several of the mutations can be present. Check the number of defining mutations.
_ = delta_all.groupby(['weekDate','ena_run']).size().reset_index().rename(columns={0:'n'})
number_of_deltadefining = _.groupby(['weekDate','n']).size().reset_index().rename(columns={0:'cnt'})
fig, ax = plt.subplots(2,1,figsize=[16,8])
fig.suptitle('Weekly # of samples with 1,2,3 or all 4 Delta defining mutations on linear and logarithmic scales')
for i in range(1,5):
_ = number_of_deltadefining[number_of_deltadefining.n==i]
ax[0].plot(_.weekDate,_.cnt,label=i)
ax[0].set_xlabel("date")
ax[0].set_ylabel("# of samples")
ax[0].legend()
ax[0].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[0].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
for i in range(1,5):
_ = number_of_deltadefining[number_of_deltadefining.n==i]
ax[1].semilogy(_.weekDate,_.cnt,label=i)
ax[1].set_xlabel("date")
ax[1].set_ylabel("# of samples")
ax[1].legend()
ax[1].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[1].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
For most samples all the defining mutations are present, but there are few samples that have one or less than all four of the mutations earlier. Especially L452R emerged around 2020-09 and appeared in ~1000 samples per week before the "complete" Delta variants with all four defining mutations took over.
Before the complete Delta variant appeared the individual defining mutations appered in different samples. I.e. the different mutations (maybe except T19R) were present for a few weeks, but in separate samples (see the number of coincident mutations on the second set of figures above).
Samples with 1,2 and 3 (less than the all four) defining mutations are still present in ~100 samples per week. These may be lack of detections or true variants.
The vcf
table in the database contains per basis information and the meta
table contians dates, and other metadata. Since the tables are indexed, it is very fast to search the whole database, we can get the information in less than 10 seconds.
%%time
query = '''SELECT *
FROM vcf JOIN meta on vcf.ena_run = meta.ena_run WHERE hgvs_p = 'p.Gln613His'
AND ann_num=1
AND af>3/SQRT(dp)
AND clean_host = 'Homo sapiens'
AND date_isoyear IN (2020,2021)
'''
Q613H = psql(query)
print(Q613H.shape)
# (1172, 74)
# CPU times: user 53.9 ms, sys: 2.53 ms, total: 56.4 ms
# Wall time: 7.18 s
In our dataset have found 1172 samples with Q613H mutation.
Q613H.head()
Convert year and week number to proper date type.
_years = Q613H.date_isoyear.values.astype(int).astype(str)
_weeks = Q613H.date_isoweek.values.astype(int).astype(str)
_sep = ['-']*len(_years)
_dayofweek = ['-1']*len(_years)
_ = np.core.defchararray.add(np.core.defchararray.add(np.core.defchararray.add(_years,_sep),_weeks),_dayofweek)
weekDate = [datetime.datetime.strptime(_d, "%Y-%W-%w") for _d in _]
Q613H['weekDate'] = weekDate
fig, ax = plt.subplots(1,1,figsize=[16,4])
fig.suptitle('Allele frequency of all the samples with Q613H mutations')
ax.plot(Q613H.weekDate,Q613H.af,'o', alpha=0.3)
ax.set_xlabel("date")
ax.set_ylabel("AF")
ax.set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax.axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
Q613H mutations were present in (relatively) many samples before the onset of Delta variant.
delta_all.groupby('clean_country').size()/delta_all.shape[0]*100
Q613H.groupby('clean_country').size()/Q613H.shape[0]*100
Calculate the number of delta defining mutations ...
_definingMutCnt = delta_all.groupby('ena_run').size().reset_index().rename(columns={0:'n'})
... and merge it with the Q613H table
delta_Q613H = pd.merge(Q613H.iloc[:,1:][['ena_run','weekDate','dp','af']],_definingMutCnt,left_on='ena_run',right_on='ena_run',how='left')
delta_Q613H['n'] = delta_Q613H['n'].replace(np.nan, 0)
... and count weekly stats.
weekly_delta_Q613H = delta_Q613H.groupby(['weekDate','n']).size().reset_index().rename(columns={0:'cnt'})
fig, ax = plt.subplots(2,1,figsize=[16,8])
fig.suptitle('Samples with Q613H mutation and 0,1,2,3 or all 4 Delta defining mutations')
pid = 1
for i in range(0,5):
_ = weekly_delta_Q613H[weekly_delta_Q613H.n==i]
ax[pid].plot(_.weekDate,_.cnt,'o-',label=i)
ax[pid].set_xlabel("date")
ax[pid].set_ylabel("# of samples")
ax[pid].legend()
ax[pid].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[pid].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')
pid = 0
_ = weekly_delta_Q613H.groupby('weekDate').sum().reset_index().rename(columns={0:'cnt'})
ax[pid].plot(_.weekDate,_.cnt,'o-',label='All Q613H')
ax[pid].set_xlabel("date")
ax[pid].set_ylabel("# of samples")
ax[pid].legend()
ax[pid].set_xlim(datetime.datetime.strptime("2020-1-1", "%Y-%W-%w"), datetime.datetime.strptime("2021-40-1", "%Y-%W-%w"))
ax[pid].axvline(x=datetime.datetime.strptime("2021-18-1", "%Y-%W-%w"),c='gray')