Find emerging mutations that were not present a few weeks ago but now start to dominate many samples. We do not distinguish the type of mutation (indels, etc.); we only count any type at the position where it is registered.
We use the multiple sequence alignment (MSA) and metadata of all GISAID SARS-CoV-2 sequences. As of 2021.11.03 it is 4.8M samples. For all positions of all samples in the MSA FASTA files the difference from the reference EPI_ISL_402124 is recorded and loaded into an SQL database. For this dataset this provided almost 4 billion rows. Since many of these are early deletions ('-') or non-detections ('n') we focus only on more real mutations, and reduce the data table to 158M mutations.
We looked for mutations that have very low prevalence before August 2021 (week 32 of 2021) and much higher prevalence (at least 40x more affected samples than before) after that. This date is selected because it is beyond the relatively wide spread of the original Delta variant.
We could identify mutations with very recent rapid growth. The two top positions are interestingly not in the S (spike) gene:
As a function of time there is a clear, rapidly growing trend. Some other positions also show growth with a lower overall count.
We gratefully acknowledge both the originating and submitting laboratories for the sequence data in GISAID EpiCoV on which the SARS-CoV-2 variants data are partially based.
# Notebook convenience cell: injects a small jQuery snippet into the rendered
# notebook that hides all code cells by default and adds a button to toggle
# their visibility (useful when sharing the notebook as a report).
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# load python libraries
import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
import datetime
# this connects to database and executes a query
def psql(query):
    """Run *query* against the GISAID Postgres database and return a DataFrame.

    Parameters
    ----------
    query : str
        A SQL statement that returns rows (SELECT ...).

    Returns
    -------
    pandas.DataFrame with the query result.
    """
    # NOTE(review): credentials are hardcoded in the notebook -- move them to
    # environment variables or a config file before sharing this file.
    conn = psycopg2.connect(
        dbname='csabai',
        host='pg.coveo',
        user='csabai',
        password='ohXai7ee'
    )
    try:
        # FIX: the original leaked the connection if read_sql_query raised;
        # try/finally guarantees the connection is always closed.
        return sqlio.read_sql_query(query, conn)
    finally:
        conn.close()
def psql_execute(query):
    """Execute a SQL statement that returns no rows (DDL/DML) with autocommit.

    Parameters
    ----------
    query : str
        One or more SQL statements (e.g. DROP TABLE / SELECT ... INTO ...).

    Returns
    -------
    bool: True on success, False if the connection could not be established.
    """
    conn = None
    try:
        conn = psycopg2.connect(
            dbname='csabai',
            host='pg.coveo',
            user='csabai',
            password='ohXai7ee'
        )
        # autocommit so DDL takes effect without an explicit COMMIT
        conn.autocommit = True
    except Exception:
        # FIX: narrowed from a bare `except:` (which also swallows
        # KeyboardInterrupt/SystemExit), and report failure to the caller
        # instead of falling through and returning True.
        print("Connection failed.")
        return False
    try:
        # FIX: close the cursor and connection even if execute() raises.
        with conn.cursor() as cursor:
            cursor.execute(query)
    finally:
        conn.close()
    return True
# Number of distinct samples in the merged GISAID mutation table.
query='''SELECT COUNT(DISTINCT seqid) AS number_of_samples FROM merged_gisaid_20211103;'''
psql(query)
# Approximate row count (~number of recorded mutations) from table statistics;
# pg_class.reltuples is the planner's estimate -- much cheaper than COUNT(*)
# on a table of this size, but only approximate.
query='''SELECT reltuples AS number_of_mutations FROM pg_class where relname = 'merged_gisaid_20211103';'''
psql(query)
%%time
query = '''
DROP TABLE IF EXISTS after_week32_gisaid_20211103;
SELECT pos, SUM(cnt) AS cnt, AVG(CAST(cnt AS REAL)) AS avg
INTO after_week32_gisaid_20211103
FROM perweek_perpos_gisaid_20211103
WHERE date_isoyear=2021 AND date_isoweek>32 GROUP BY pos;
DROP TABLE IF EXISTS before_week32_gisaid_20211103;
SELECT pos, SUM(cnt) AS cnt, AVG(CAST(cnt AS REAL)) AS avg
INTO before_week32_gisaid_20211103
FROM perweek_perpos_gisaid_20211103
WHERE date_isoyear BETWEEN 2020 AND 2021 AND date_isoweek<=32 GROUP BY pos;
'''
psql_execute(query)
# Load the two per-position summary tables and merge them on position.
query = ''' SELECT * FROM after_week32_gisaid_20211103'''
after32_n = psql(query).sort_values('pos')
query = ''' SELECT * FROM before_week32_gisaid_20211103'''
before32_n = psql(query).sort_values('pos')
# Outer merge keeps positions seen in only one period; missing counts become 0.
# Suffixes: *_x = before the pivot week, *_y = after.
_ = pd.merge(before32_n, after32_n, how='outer', on='pos').fillna(0)
# Normalize by the total mutation count of each period so the two can be compared.
total_before = _.cnt_x.sum()
total_after = _.cnt_y.sum()
_['cnt_x_norm'] = _['cnt_x'] / total_before
_['cnt_y_norm'] = _['cnt_y'] / total_after
# NOTE(review): the avg norms divide the weekly AVERAGE by the total COUNT of
# the period -- confirm this mixed normalization is intended.
_['avg_x_norm'] = _['avg_x'] / total_before
_['avg_y_norm'] = _['avg_y'] / total_after
We look for mutation positions with very few samples before, and many samples after, the pivot day. These points would be in the upper left corner of the plot. We are not interested in positions that already had high prevalence for a long time.
# Scatter of per-position sample counts: before the pivot (x) vs. after (y).
# Emerging mutations sit in the upper-left region (rare before, common after).
plt.figure(figsize=[8,8])
plt.plot(_.cnt_x, _.cnt_y, '.')
plt.xlabel('# of mutated samples before')
# FIX: label typo ("ofmutated" -> "of mutated")
plt.ylabel('# of mutated samples after')
plt.title('Each dot corresponds to a SARS-CoV-2 genome position. \n Axes are the counts of samples that have the given mutation')
# diagonal reference line: equal prevalence before and after
plt.plot(np.array([1,1e6]), np.array([1,1e6]), '-', label='1x')
plt.legend()
# Zoom into positions that were rare before the pivot (cnt_x < 1000).
plt.figure(figsize=[8,8])
rare_before = _[_.cnt_x < 1000]
plt.plot(rare_before.cnt_x, rare_before.cnt_y, '.')
plt.xlabel('# of mutated samples before')
# FIX: label typo ("ofmutated" -> "of mutated")
plt.ylabel('# of mutated samples after')
plt.title('Zoomed version. Each dot corresponds to a SARS-CoV-2 genome position. \n Axes are the counts of samples that have the given mutation')
# reference line marking the 40x growth threshold used for selection below
plt.plot(np.array([1,1e3]), np.array([1,40*1e3]), '-', label='40x')
plt.legend()
# Same comparison on the WEEKLY AVERAGE counts, log-log scale.
plt.figure(figsize=[8,8])
plt.loglog(_.avg_x, _.avg_y, '.')
# FIX: the axes show avg_x/avg_y (weekly averages), not raw counts, and the
# original labels had the "ofmutated" typo.
plt.xlabel('avg weekly # of mutated samples before')
plt.ylabel('avg weekly # of mutated samples after')
plt.title('Normalized, Log scale. Each dot corresponds to a SARS-CoV-2 genome position. \n Axes are the normalized counts of samples that have the given mutation')
# NOTE(review): the x=0 endpoints are dropped on a log axis, so only the
# upper end of these reference lines renders; a small positive start value
# would draw them fully.
plt.loglog(np.array([0,1e5]), np.array([0,40*1e5]), '-', label='40x')
plt.loglog(np.array([0,1e5]), np.array([0,1e5]), '-', label='1x')
plt.legend()
# Candidate emerging mutations by TOTAL counts: fewer than 1000 affected
# samples before the pivot and at least 40x growth after it.
_[(_.cnt_x<1000) & (_.cnt_y>40*_.cnt_x)].sort_values('cnt_y')
# Same selection applied to the WEEKLY AVERAGE counts.
_[(_.avg_x<1000) & (_.avg_y>40*_.avg_x)].sort_values('avg_y')
# Final candidate position list (by weekly averages), strongest growth first.
posList = _[(_.avg_x<1000) & (_.avg_y>40*_.avg_x)].sort_values('avg_y',ascending=False).pos.values
posList
%%time
# Pull every mutation record at the candidate positions, joined with:
#   location : human-readable sampling location
#   wl_cnt   : total samples per week per location (denominator for local ratios)
#   w_cnt    : total samples per week worldwide (denominator for global ratios)
# NOTE(review): the position list in the IN (...) clause is hardcoded -- it
# must stay in sync with posList computed above; regenerate it whenever the
# selection thresholds or the dataset change.
query ='''
SELECT m.*,l.location,wl.cnt as wl_cnt, w.cnt as w_cnt
FROM merged_gisaid_20211103 m JOIN location l ON m.location_code=l.location_code
JOIN samplesperweek_perlocation_gisaid_20211103 wl
ON wl.date_isoyear=m.date_isoyear AND wl.date_isoweek=m.date_isoweek AND wl.location_code=m.location_code
JOIN samplesperweek_gisaid_20211103 w
ON w.date_isoyear=m.date_isoyear AND w.date_isoweek=m.date_isoweek
WHERE pos IN (13797, 10967, 12421, 11195, 23506, 71, 7352, 2307, 14551, 25434, 24984, 8834, 15520)
'''
posAll40x_w32 = psql(query)
# Convert each (ISO year, ISO week) pair to the calendar date of its Monday.
# BUGFIX: the original format "%Y-%W-%w" parses with non-ISO directives (%W is
# the Monday-based week-of-year, %Y the calendar year), which mislabels weeks
# by up to seven days around year boundaries. date_isoyear/date_isoweek come
# from Postgres isoyear/isoweek, so the matching ISO 8601 directives are
# %G (ISO year), %V (ISO week) and %u (ISO weekday, 1 = Monday).
posAll40x_w32['weekdate'] = [
    datetime.datetime.strptime('-'.join([str(_d[0]), str(_d[1]), '1']), "%G-%V-%u")
    for _d in posAll40x_w32[['date_isoyear', 'date_isoweek']].values
]
# Weekly count of affected samples per position (w_cnt kept as the denominator).
_ = posAll40x_w32.groupby(['pos', 'weekdate', 'w_cnt']).size().reset_index().rename(columns={0: 'cnt1'})
_ = _.sort_values('weekdate')
# Time series of the affected-sample ratio for each candidate position
# (linear y scale), restricted to the well-sampled date range.
d1 = datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
for p in posList:
    _1 = _[(_.pos == p) & (_.weekdate > d1) & (_.weekdate < d2)]
    plt.figure(figsize=[16, 8])
    # ratio = samples carrying the mutation / all samples that week
    plt.plot(_1.weekdate, _1.cnt1 / _1.w_cnt, '.-', lw=3)
    plt.xlim(d1, d2)
    # FIX: the x axis is the sampling week, not the genome position
    plt.xlabel('week')
    plt.ylabel('Affected sample ratio')
    plt.ylim(0, 1)
    plt.title('position=' + str(p))
    plt.show()
# Same time series on a logarithmic y scale, which makes the early
# exponential-growth phase visible.
d1 = datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
for p in posList:
    _1 = _[(_.pos == p) & (_.weekdate > d1) & (_.weekdate < d2)]
    plt.figure(figsize=[16, 8])
    plt.semilogy(_1.weekdate, _1.cnt1 / _1.w_cnt, '.-', lw=3)
    plt.xlim(d1, d2)
    # FIX: the x axis is the sampling week, not the genome position
    plt.xlabel('week')
    plt.ylabel('Affected sample ratio')
    plt.ylim(1e-5, 1)
    plt.title('Log y scale. position=' + str(p))
    plt.show()
The non-uniform, non-representative collection and sharing of sequence data makes identifying the location of origin problematic.
# Inspect the available columns before the per-country breakdown.
posAll40x_w32.columns
# Geographic breakdown for the top candidate position (15520) after the pivot.
dPivot = datetime.datetime.strptime("2021-08-01", "%Y-%m-%d")
_ = posAll40x_w32[ (posAll40x_w32.pos==15520) & (posAll40x_w32.weekdate>dPivot)]
# per-week, per-location counts of affected samples
loc = _.groupby(['weekdate','location','wl_cnt']).size().reset_index().rename(columns={0:'cnt_perweek_perlocation'})
# GISAID locations look like "Continent / Country / ..."; take the country part.
# (Assumes every location string has at least two ' / '-separated fields -- TODO confirm.)
loc['country'] = [c[1] for c in loc.location.str.split(' / ').values]
loc_country = loc.groupby(['weekdate','country']).sum().reset_index()
# ratio of affected samples within each country-week
loc_country['normCnt'] = loc_country['cnt_perweek_perlocation']/loc_country['wl_cnt']
loc_country = loc_country.sort_values('normCnt',ascending=False)
# NOTE(review): summing the weekly ratios over weeks yields a rough ranking
# score per country, not a true prevalence -- interpret with care.
locSum = loc_country.groupby('country').sum().reset_index().sort_values('normCnt',ascending=False)
locSum.head(20)
# earliest submissions carrying the mutation at position 15520
posAll40x_w32[ (posAll40x_w32.pos==15520)].sort_values('weekdate',ascending=True)[['location','weekdate']].head(20)