Omicron and nu mutations in GISAID 20211126 version¶

import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import psycopg2

import datetime
import time
import io

import tarfile

from sqlalchemy import create_engine

def psql(query):
    conn = psycopg2.connect(
            dbname='coveo',
            host='pg.coveo',
            user='***',
            password='*****',
            options="-c search_path=gisaid"
        )
    dummy = sqlio.read_sql_query(query, conn)
    conn.close()
    return dummy

def psql_execute(query):
    conn = None

    conn = psycopg2.connect(
        dbname='coveo',
        host='pg.coveo',
        user='****',
        password=*****,
        options="-c search_path=gisaid"
    )
    conn.autocommit = True

    cursor = conn.cursor()
    cursor.execute("SET search_path TO gisaid")
    cursor.execute(query)
        
    conn.close()
    return True

%%time
query = '''SELECT * 
FROM merged_gisaid_20211126 LIMIT 10 '''
merged_def = psql(query)
merged_def

CPU times: user 9.75 ms, sys: 3.45 ms, total: 13.2 ms
Wall time: 18.6 ms

%%time
query = '''SELECT * 
FROM omicron_20211129 o '''
omicron_def = psql(query)

CPU times: user 0 ns, sys: 9.78 ms, total: 9.78 ms
Wall time: 14.9 ms

omicron_def.columns

Index(['index', 'WHO_label', 'pango', 'type_variant', 'description',
       'amino_acid_change', 'protein_codon_position', 'REF_protein',
       'ALT_protein', 'gene', 'effect', 'ref_pos_alt', 'ref', 'alt', 'pos'],
      dtype='object')

Defining positions¶

omicron_def

%%time
query = '''SELECT o.pos, o.amino_acid_change, o.pango, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
GROUP BY o.pos, o.amino_acid_change, o.pango'''
omicro_mu_gisaid_stat = psql(query)

CPU times: user 7.38 ms, sys: 920 µs, total: 8.3 ms
Wall time: 38.3 s

Individual mutation occurences¶

omicro_mu_gisaid_stat

merged_def.columns

Index(['pos', 'ref', 'alt', 'seqid', 'virus_name', 'accession_id',
       'collection_date', 'submission_date', 'patient_age', 'n_content',
       'gc_content', 'sequence_length', 'type_code', 'location_code',
       'additional_location_information_code', 'host_code', 'gender_code',
       'clade_code', 'pango_lineage_code', 'pangolin_version_code',
       'variant_code', 'is_reference_code', 'is_complete_code',
       'is_high_coverage_code', 'is_low_coverage_code', 'weekdate'],
      dtype='object')

%%time
query = '''SELECT * FROM samplesperweek_gisaid_20211126 '''

total_weekly_stat = psql(query)

CPU times: user 39.4 ms, sys: 8.75 ms, total: 48.1 ms
Wall time: 74.4 ms

%%time
query = '''SELECT o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage '''

omicron_weekly_stat = psql(query)

## 3min 35s

CPU times: user 2.52 s, sys: 1.16 s, total: 3.68 s
Wall time: 3min 35s

omicron_weekly_stat.shape

(1340704, 6)

#omicron_weekly_stat.head()

weeklyStat = omicron_weekly_stat.groupby(['weekdate','pos','amino_acid_change']).sum().reset_index().rename(columns={0:'cnt'})

weeklyStat = pd.merge(weeklyStat,total_weekly_stat.rename(columns={'cnt':'cntTotal'}),on='weekdate')
#weeklyStat

d1 = datetime.date(2020,1,1) #datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.date(2021,11,30) #datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")

plt.figure(figsize=[16,8])


_ = total_weekly_stat[(total_weekly_stat.weekdate>=d1) & (total_weekly_stat.weekdate<=d2)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label='all samples')
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('All samples per week',fontsize='xx-large')

Text(0.5, 1.0, 'All samples per week')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.semilogy(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    if (_.cnt.values/_.cntTotal.values).max()>0.1:
        plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations  (only fractions >0.1)',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations  (only fractions >0.1)')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    if (_.cnt.values/_.cntTotal.values).max()<=0.1:
        plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations  (only fractions <=0.1)',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations  (only fractions <=0.1)')

plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.semilogy(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations')

Locations¶

omicron_weekly_stat.head()

omicron_def[omicron_def.WHO_label=='Omicron'].shape

(30, 15)

Countries that have certain number of mutations in any of their samples¶

_ = omicron_weekly_stat.groupby(['location','amino_acid_change']).size().reset_index()
_.groupby('location').size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False).head(30)

The same in weekly¶

_ = omicron_weekly_stat.groupby(['weekdate','location','amino_acid_change']).size().reset_index()
weekly_num_of = _.groupby(['weekdate','location']).size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False)
weekly_num_of

Top countries with most mutations present weekly¶

topCountries = weekly_num_of.groupby('location').max().reset_index().sort_values('cnt',ascending=False)
topCountries.head(30)

plt.figure(figsize=[16,8])


for i,c in enumerate(topCountries.head(10).location.values):
    _ = weekly_num_of[(weekly_num_of.weekdate>=d1) & (weekly_num_of.weekdate<=d2) & (weekly_num_of.location==c)].sort_values('weekdate')
    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    if i<3:
        plt.plot(_.weekdate,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate,_.cnt,label=lab,alpha=0.5)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of Omicron mutations present in country',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')

Text(0.5, 1.0, 'Omicron mutations. Top 10 countries with most mutations present')

Multiple simultaneous mutations¶

# TODO: this should not be a cleartext password
passwd = 'XXXXXXXX'

%%time
query = '''SELECT m.seqID, m.weekdate, l.location, COUNT(*) as cnt 
INTO omicron_persample_20211126
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY m.seqID, m.weekdate, l.location '''

psql_execute(query)
# Wall time: 13min 22s

CPU times: user 34.5 ms, sys: 16.3 ms, total: 50.8 ms
Wall time: 13min 22s

True

query = '''SELECT COUNT(*) FROM omicron_persample_20211126 WHERE cnt>5'''
psql(query)

%%time
query = '''SELECT m.*,o.cnt,l.location,p.pango_lineage FROM merged_gisaid_20211126 m JOIN 
(SELECT * FROM omicron_persample_20211126 WHERE cnt>5) o
ON m.seqID=o.seqID 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE m.weekdate> '2020-1-1'
'''
samplesWithMoreThan5 = psql(query)

CPU times: user 1.32 s, sys: 176 ms, total: 1.49 s
Wall time: 53.9 s

s5s = samplesWithMoreThan5.groupby(['weekdate','seqid']).max().reset_index()
s5s.sort_values('cnt',ascending=False).head(20)

samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)

dummy

plt.figure(figsize=[16,8])

dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)

for i,c in enumerate(dummy.location.values):
    #_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
    #print(c)
    _ = s5s[ (s5s.location==c)].sort_values('weekdate')

    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    lab = lab+str(max(_.cnt.values))
    if i<5:
        plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
    
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')

plt.ylim(0,30)
#plt.xlim(d1,d2)

(0, 30)

plt.figure(figsize=[16,8])

dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(20)

for i,c in enumerate(dummy.location.values):
    #_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
    #print(c)
    _ = s5s[ (s5s.location==c)].sort_values('weekdate')

    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    lab = lab+str(max(_.cnt.values))
    if i<5:
        plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
    
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
plt.ylim(0,30)
#plt.xlim(d1,d2)

(0, 30)

	pos	ref	alt	seqid	virus_name	accession_id	collection_date	submission_date	patient_age	n_content	...	gender_code	clade_code	pango_lineage_code	pangolin_version_code	variant_code	is_reference_code	is_low_coverage_code	weekdate
0	34	a	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
1	35	a	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
2	36	c	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
3	37	c	a	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
4	241	c	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
5	1059	c	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
6	3037	c	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
7	14408	c	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
8	23403	a	g	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23
9	25563	g	t	424246	hCoV-19/USA/WA-UW-1758/2020 ...	EPI_ISL_424246	2020-03-21	2020-04-12	None	0.000368	...	330	1	1044	3	6	1	1	2020-03-23

	index	WHO_label	pango	type_variant	description	amino_acid_change	protein_codon_position	REF_protein	ALT_protein	gene	effect	ref_pos_alt	ref	alt	pos
0	0	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	T95I	95	T	I	S	missense_variant	C21846T	C	T	21846
1	1	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	R346K	346	R	K	S	missense_variant	G22599A	G	A	22599
2	2	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	E484K	484	E	K	S	missense_variant	G23012A	G	A	23012
3	3	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	N501Y	501	N	Y	S	missense_variant	A23063T	A	T	23063
4	4	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	D614G	614	D	G	S	missense_variant	A23403G	A	G	23403
5	5	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	P681H	681	P	H	S	missense_variant	C23604A	C	A	23604
6	6	Mu	B.1.621	VOI	This variant was first detected in Colombia in...	D950N	950	D	N	S	missense_variant	G24410A	G	A	24410
7	7	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	A67V	67	A	V	S	missense_variant	C21762T	C	T	21762
8	8	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	T95I	95	T	I	S	missense_variant	C21846T	C	T	21846
9	9	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	G142D	142	G	D	S	disruptive_inframe_deletion	GGTGTTTATT21986G	GGTGTTTATT	G	21986
10	10	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	G339D	339	G	D	S	missense_variant	G22578A	G	A	22578
11	11	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	L212I	212	L	I	S	missense_variant	T22673C	T	C	22673
12	12	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	S371L	371	S	L	S	missense_variant	C22674T	C	T	22674
13	13	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	S373P	373	S	P	S	missense_variant	T22679C	T	C	22679
14	14	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	S375F	375	S	F	S	missense_variant	C22686T	C	T	22686
15	15	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	K417N	417	K	N	S	missense_variant	G22813T	G	T	22813
16	16	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N440K	440	N	K	S	missense_variant	T22882G	T	G	22882
17	17	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	G446S	446	G	S	S	missense_variant	G22898A	G	A	22898
18	18	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	S477N	477	S	N	S	missense_variant	G22992A	G	A	22992
19	19	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	T478K	478	T	K	S	missense_variant	C22995A	C	A	22995
20	20	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	E484A	484	E	A	S	missense_variant	A23013C	A	C	23013
21	21	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	Q493R	493	Q	R	S	missense_variant	A23040G	A	G	23040
22	22	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	G496S	496	G	S	S	missense_variant	G23048A	G	A	23048
23	23	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	Q498R	498	Q	R	S	missense_variant	A23055G	A	G	23055
24	24	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N501Y	501	N	Y	S	missense_variant	A23063T	A	T	23063
25	25	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	Y505H	505	Y	H	S	missense_variant	T23075C	T	C	23075
26	26	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	T547K	547	T	K	S	missense_variant	C23202A	C	A	23202
27	27	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	D614G	614	D	G	S	missense_variant	A23403G	A	G	23403
28	28	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	H655Y	655	H	Y	S	missense_variant	C23525T	C	T	23525
29	29	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N679K	679	N	K	S	missense_variant	T23599G	T	G	23599
30	30	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	P681H	681	P	H	S	missense_variant	C23604A	C	A	23604
31	31	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N764K	764	N	K	S	missense_variant	C23854A	C	A	23854
32	32	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	D796Y	796	D	Y	S	missense_variant	G23948T	G	T	23948
33	33	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N856K	856	N	K	S	missense_variant	C24130A	C	A	24130
34	34	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	Q954H	954	Q	H	S	missense_variant	A24424T	A	T	24424
35	35	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	N969K	969	N	K	S	missense_variant	T24469A	T	A	24469
36	36	Omicron	B.1.1.529	VOC	This variant was first detected in South Afric...	L981F	981	L	F	S	missense_variant	C24503T	C	T	24503

	pos	amino_acid_change	pango	cnt
0	23604	P681H	B.1.621	4143062
1	23048	G496S	B.1.1.529	565
2	22686	S375F	B.1.1.529	299
3	22674	S371L	B.1.1.529	87
4	23948	D796Y	B.1.1.529	12730
5	23403	D614G	B.1.621	5369906
6	23604	P681H	B.1.1.529	4143062
7	23055	Q498R	B.1.1.529	164
8	22679	S373P	B.1.1.529	478
9	22898	G446S	B.1.1.529	834
10	24410	D950N	B.1.621	2784322
11	24503	L981F	B.1.1.529	122
12	23040	Q493R	B.1.1.529	541
13	23075	Y505H	B.1.1.529	178
14	23013	E484A	B.1.1.529	1484
15	23403	D614G	B.1.1.529	5369906
16	24424	Q954H	B.1.1.529	211
17	22882	N440K	B.1.1.529	10351
18	23599	N679K	B.1.1.529	5482
19	24130	N856K	B.1.1.529	63164
20	21846	T95I	B.1.1.529	1139920
21	22578	G339D	B.1.1.529	584
22	23063	N501Y	B.1.621	1337076
23	21762	A67V	B.1.1.529	20664
24	23525	H655Y	B.1.1.529	123703
25	22995	T478K	B.1.1.529	2815171
26	22992	S477N	B.1.1.529	81223
27	24469	N969K	B.1.1.529	896
28	21986	G142D	B.1.1.529	1938
29	22599	R346K	B.1.621	16372
30	23854	N764K	B.1.1.529	6387
31	21846	T95I	B.1.621	1139920
32	23202	T547K	B.1.1.529	8405
33	23012	E484K	B.1.621	250540
34	22673	L212I	B.1.1.529	299
35	23063	N501Y	B.1.1.529	1337076
36	22813	K417N	B.1.1.529	48474

	pos	amino_acid_change	weekdate	location	pango_lineage	cnt
0	21762	A67V	2010-12-06	Asia / Cambodia / Steung Treng	A	2
1	21762	A67V	2020-01-06	Africa / Nigeria / Edo State	B.1.525	1
2	21762	A67V	2020-01-06	Europe / Gibraltar	B.1.177.53	4
3	21762	A67V	2020-01-06	Europe / Greece	B.1.1.70	3
4	21762	A67V	2020-01-06	Europe / United Kingdom / England	B.1.1	1

	location	cnt
15931	Europe / United Kingdom / England	30
18781	North America / USA / Texas / Houston	29
17068	North America / USA / Florida	29
166	Africa / Botswana / South East / Greater Gabor...	29
957	Africa / South Africa / Gauteng	28
10270	Europe / Italy / Campania	28
7897	Europe / Denmark / Hovedstaden	28
18173	North America / USA / New York	28
15964	North America / Canada / British Columbia	27
17700	North America / USA / Massachusetts	27
15940	Europe / United Kingdom / Scotland	27
16819	North America / USA / California	27
9797	Europe / Germany / North Rhine-Westphalia	27
17728	North America / USA / Michigan	27
19235	Oceania / Australia / Victoria	26
9697	Europe / Germany / Baden-Wurttemberg	26
9696	Europe / Germany	26
17487	North America / USA / Indiana	26
19030	North America / USA / Washington	26
18609	North America / USA / Texas	26
9714	Europe / Germany / Bavaria	25
18220	North America / USA / New York / New York City	25
17740	North America / USA / Minnesota	25
18130	North America / USA / New Jersey	25
16974	North America / USA / Colorado	25
19015	North America / USA / Utah	25
18322	North America / USA / Ohio	25
15943	Europe / United Kingdom / Wales	25
15882	Europe / Turkey	25
15686	Europe / Sweden / Uppsala	24

	weekdate	location	cnt
137955	2021-11-08	Africa / Botswana / South East / Greater Gabor...	29
138565	2021-11-15	Africa / South Africa / Gauteng	27
139398	2022-01-03	Europe / United Kingdom / England	26
138313	2021-11-08	Europe / United Kingdom / England	26
36936	2021-01-25	Europe / United Kingdom / England	26
...	...	...	...
22402	2020-11-02	Europe / Netherlands / Groningen	1
22405	2020-11-02	Europe / Netherlands / Limburg / Brunssum	1
22407	2020-11-02	Europe / Netherlands / Limburg / Heerlen	1
90311	2021-06-28	North America / USA / Washington / Benton County	1
139819	2022-01-03	South America / Venezuela / La Guaira	1

	weekdate	seqid	pos	ref	alt	virus_name	accession_id	collection_date	submission_date	patient_age	...	pango_lineage_code	pangolin_version_code	variant_code	is_reference_code	is_complete_code	is_high_coverage_code	is_low_coverage_code	cnt	location	pango_lineage
285	2021-11-08	6670244	28883	t	t	hCoV-19/Botswana/R40B61_BHP_3321001244/2021 ...	EPI_ISL_6670244	2021-11-11	2021-11-24	64.0	...	388	3	6	1	0	1	1	29	Africa / Botswana / South East / Greater Gabor...	B.1.1.263
283	2021-11-08	6640919	28883	t	t	hCoV-19/Botswana/R40B58_BHP_3321001245/2021 ...	EPI_ISL_6640919	2021-11-11	2021-11-23	33.0	...	388	3	6	1	0	1	1	29	Africa / Botswana / South East / Greater Gabor...	B.1.1.263
282	2021-11-08	6640917	28883	t	t	hCoV-19/Botswana/R40B60_BHP_3321001247/2021 ...	EPI_ISL_6640917	2021-11-11	2021-11-23	64.0	...	388	3	6	1	0	1	1	29	Africa / Botswana / South East / Greater Gabor...	B.1.1.263
281	2021-11-08	6640916	28883	t	t	hCoV-19/Botswana/R40B59_BHP_3321001248/2021 ...	EPI_ISL_6640916	2021-11-11	2021-11-23	57.0	...	388	3	6	1	0	1	1	28	Africa / Botswana / South East / Greater Gabor...	B.1.1.263
290	2021-11-15	6647961	28883	t	t	hCoV-19/South Africa/NICD-N21606-DX64492/2021 ...	EPI_ISL_6647961	2021-11-15	2021-11-23	38.0	...	1471	3	6	1	0	1	1	27	Africa / South Africa / Gauteng	None
291	2021-11-15	6647962	28883	t	t	hCoV-19/South Africa/NICD-N21607-DX64624/2021 ...	EPI_ISL_6647962	2021-11-16	2021-11-23	0.0	...	1471	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	None
289	2021-11-15	6647960	28883	t	t	hCoV-19/South Africa/NICD-N21605-DX64490/2021 ...	EPI_ISL_6647960	2021-11-15	2021-11-23	21.0	...	388	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	B.1.1.263
288	2021-11-15	6647959	28883	t	t	hCoV-19/South Africa/NICD-N21604-DX64219/2021 ...	EPI_ISL_6647959	2021-11-16	2021-11-23	29.0	...	388	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	B.1.1.263
287	2021-11-15	6647958	28883	t	t	hCoV-19/South Africa/NICD-N21603-DX64204/2021 ...	EPI_ISL_6647958	2021-11-16	2021-11-23	35.0	...	388	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	B.1.1.263
286	2021-11-15	6647957	28883	t	t	hCoV-19/South Africa/NICD-N21602-DX040380/2021...	EPI_ISL_6647957	2021-11-15	2021-11-23	28.0	...	1471	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	None
284	2021-11-08	6647956	28883	t	t	hCoV-19/South Africa/NICD-N21600-DX03569/2021 ...	EPI_ISL_6647956	2021-11-14	2021-11-23	27.0	...	1471	3	6	1	0	1	1	20	Africa / South Africa / Gauteng	None
1	2020-06-01	1699445	29769	t	t	hCoV-19/bat/Yunnan/RmYN05/2020 ...	EPI_ISL_1699445	2020-05-25	2021-04-23	NaN	...	1471	3	6	1	0	0	1	13	Asia / China / Yunnan / Xishuangbanna	None
0	2020-04-20	1699444	29769	t	t	hCoV-19/bat/Yunnan/RsYN04/2020 ...	EPI_ISL_1699444	2020-04-18	2021-04-23	NaN	...	1471	3	6	1	0	0	1	13	Asia / China / Yunnan / Xishuangbanna	None
6	2020-07-20	1699448	29769	t	t	hCoV-19/bat/Yunnan/RmYN08/2020 ...	EPI_ISL_1699448	2020-07-14	2021-04-23	NaN	...	1471	3	6	1	0	1	1	13	Asia / China / Yunnan / Xishuangbanna	None
8	2020-09-07	754195	29884	t	t	hCoV-19/Georgia/Tb-72720/2020 ...	EPI_ISL_754195	2020-09-02	2020-12-31	NaN	...	252	3	6	1	1	0	1	12	Asia / Georgia / Tbilisi	B.1.1
3	2020-06-08	1699447	29769	t	t	hCoV-19/bat/Yunnan/RmYN07/2020 ...	EPI_ISL_1699447	2020-06-03	2021-04-23	NaN	...	1471	3	6	1	0	0	1	12	Asia / China / Yunnan / Xishuangbanna	None
15	2021-03-15	1663018	29891	t	t	hCoV-19/USA/IN-ISDHQ54/2021 ...	EPI_ISL_1663018	2021-03-19	2021-04-21	48.0	...	842	3	6	1	1	1	1	11	North America / USA / Indiana	B.1.2
16	2021-03-15	1663023	29891	t	t	hCoV-19/USA/IN-ISDHQ73/2021 ...	EPI_ISL_1663023	2021-03-18	2021-04-21	28.0	...	1471	3	6	1	1	1	1	11	North America / USA / Indiana	None
7	2020-08-03	1699449	29835	t	t	hCoV-19/bat/Yunnan/RsYN09/2020 ...	EPI_ISL_1699449	2020-07-28	2021-04-23	NaN	...	1471	3	6	1	0	0	1	10	Asia / China / Yunnan / Xishuangbanna	None
20	2021-04-26	3267718	29872	t	y	hCoV-19/Sweden/Uppsala-000199821415N2/2021 ...	EPI_ISL_3267718	2021-04-28	2021-08-07	NaN	...	1523	3	6	1	1	0	1	10	Europe / Sweden / Uppsala	nan