Omicron and nu mutations in GISAID 20211126 version

In [3]:
import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import psycopg2

import datetime
import time
import io

import tarfile

from sqlalchemy import create_engine
In [95]:
def psql(query):
    conn = psycopg2.connect(
            dbname='coveo',
            host='pg.coveo',
            user='***',
            password='*****',
            options="-c search_path=gisaid"
        )
    dummy = sqlio.read_sql_query(query, conn)
    conn.close()
    return dummy

def psql_execute(query):
    conn = None

    conn = psycopg2.connect(
        dbname='coveo',
        host='pg.coveo',
        user='****',
        password=*****,
        options="-c search_path=gisaid"
    )
    conn.autocommit = True

    cursor = conn.cursor()
    cursor.execute("SET search_path TO gisaid")
    cursor.execute(query)
        
    conn.close()
    return True
In [17]:
%%time
query = '''SELECT * 
FROM merged_gisaid_20211126 LIMIT 10 '''
merged_def = psql(query)
merged_def
CPU times: user 9.75 ms, sys: 3.45 ms, total: 13.2 ms
Wall time: 18.6 ms
Out[17]:
pos ref alt seqid virus_name accession_id collection_date submission_date patient_age n_content ... gender_code clade_code pango_lineage_code pangolin_version_code variant_code is_reference_code is_complete_code is_high_coverage_code is_low_coverage_code weekdate
0 34 a t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
1 35 a t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
2 36 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
3 37 c a 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
4 241 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
5 1059 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
6 3037 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
7 14408 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
8 23403 a g 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
9 25563 g t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23

10 rows × 26 columns

In [9]:
%%time
query = '''SELECT * 
FROM omicron_20211129 o '''
omicron_def = psql(query)
CPU times: user 0 ns, sys: 9.78 ms, total: 9.78 ms
Wall time: 14.9 ms
In [10]:
omicron_def.columns
Out[10]:
Index(['index', 'WHO_label', 'pango', 'type_variant', 'description',
       'amino_acid_change', 'protein_codon_position', 'REF_protein',
       'ALT_protein', 'gene', 'effect', 'ref_pos_alt', 'ref', 'alt', 'pos'],
      dtype='object')

Defining positions

In [15]:
omicron_def
Out[15]:
index WHO_label pango type_variant description amino_acid_change protein_codon_position REF_protein ALT_protein gene effect ref_pos_alt ref alt pos
0 0 Mu B.1.621 VOI This variant was first detected in Colombia in... T95I 95 T I S missense_variant C21846T C T 21846
1 1 Mu B.1.621 VOI This variant was first detected in Colombia in... R346K 346 R K S missense_variant G22599A G A 22599
2 2 Mu B.1.621 VOI This variant was first detected in Colombia in... E484K 484 E K S missense_variant G23012A G A 23012
3 3 Mu B.1.621 VOI This variant was first detected in Colombia in... N501Y 501 N Y S missense_variant A23063T A T 23063
4 4 Mu B.1.621 VOI This variant was first detected in Colombia in... D614G 614 D G S missense_variant A23403G A G 23403
5 5 Mu B.1.621 VOI This variant was first detected in Colombia in... P681H 681 P H S missense_variant C23604A C A 23604
6 6 Mu B.1.621 VOI This variant was first detected in Colombia in... D950N 950 D N S missense_variant G24410A G A 24410
7 7 Omicron B.1.1.529 VOC This variant was first detected in South Afric... A67V 67 A V S missense_variant C21762T C T 21762
8 8 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T95I 95 T I S missense_variant C21846T C T 21846
9 9 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G142D 142 G D S disruptive_inframe_deletion GGTGTTTATT21986G GGTGTTTATT G 21986
10 10 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G339D 339 G D S missense_variant G22578A G A 22578
11 11 Omicron B.1.1.529 VOC This variant was first detected in South Afric... L212I 212 L I S missense_variant T22673C T C 22673
12 12 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S371L 371 S L S missense_variant C22674T C T 22674
13 13 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S373P 373 S P S missense_variant T22679C T C 22679
14 14 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S375F 375 S F S missense_variant C22686T C T 22686
15 15 Omicron B.1.1.529 VOC This variant was first detected in South Afric... K417N 417 K N S missense_variant G22813T G T 22813
16 16 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N440K 440 N K S missense_variant T22882G T G 22882
17 17 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G446S 446 G S S missense_variant G22898A G A 22898
18 18 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S477N 477 S N S missense_variant G22992A G A 22992
19 19 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T478K 478 T K S missense_variant C22995A C A 22995
20 20 Omicron B.1.1.529 VOC This variant was first detected in South Afric... E484A 484 E A S missense_variant A23013C A C 23013
21 21 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q493R 493 Q R S missense_variant A23040G A G 23040
22 22 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G496S 496 G S S missense_variant G23048A G A 23048
23 23 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q498R 498 Q R S missense_variant A23055G A G 23055
24 24 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N501Y 501 N Y S missense_variant A23063T A T 23063
25 25 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Y505H 505 Y H S missense_variant T23075C T C 23075
26 26 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T547K 547 T K S missense_variant C23202A C A 23202
27 27 Omicron B.1.1.529 VOC This variant was first detected in South Afric... D614G 614 D G S missense_variant A23403G A G 23403
28 28 Omicron B.1.1.529 VOC This variant was first detected in South Afric... H655Y 655 H Y S missense_variant C23525T C T 23525
29 29 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N679K 679 N K S missense_variant T23599G T G 23599
30 30 Omicron B.1.1.529 VOC This variant was first detected in South Afric... P681H 681 P H S missense_variant C23604A C A 23604
31 31 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N764K 764 N K S missense_variant C23854A C A 23854
32 32 Omicron B.1.1.529 VOC This variant was first detected in South Afric... D796Y 796 D Y S missense_variant G23948T G T 23948
33 33 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N856K 856 N K S missense_variant C24130A C A 24130
34 34 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q954H 954 Q H S missense_variant A24424T A T 24424
35 35 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N969K 969 N K S missense_variant T24469A T A 24469
36 36 Omicron B.1.1.529 VOC This variant was first detected in South Afric... L981F 981 L F S missense_variant C24503T C T 24503
In [13]:
%%time
query = '''SELECT o.pos, o.amino_acid_change, o.pango, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
GROUP BY o.pos, o.amino_acid_change, o.pango'''
omicro_mu_gisaid_stat = psql(query)
CPU times: user 7.38 ms, sys: 920 µs, total: 8.3 ms
Wall time: 38.3 s

Individual mutation occurences

In [14]:
omicro_mu_gisaid_stat
Out[14]:
pos amino_acid_change pango cnt
0 23604 P681H B.1.621 4143062
1 23048 G496S B.1.1.529 565
2 22686 S375F B.1.1.529 299
3 22674 S371L B.1.1.529 87
4 23948 D796Y B.1.1.529 12730
5 23403 D614G B.1.621 5369906
6 23604 P681H B.1.1.529 4143062
7 23055 Q498R B.1.1.529 164
8 22679 S373P B.1.1.529 478
9 22898 G446S B.1.1.529 834
10 24410 D950N B.1.621 2784322
11 24503 L981F B.1.1.529 122
12 23040 Q493R B.1.1.529 541
13 23075 Y505H B.1.1.529 178
14 23013 E484A B.1.1.529 1484
15 23403 D614G B.1.1.529 5369906
16 24424 Q954H B.1.1.529 211
17 22882 N440K B.1.1.529 10351
18 23599 N679K B.1.1.529 5482
19 24130 N856K B.1.1.529 63164
20 21846 T95I B.1.1.529 1139920
21 22578 G339D B.1.1.529 584
22 23063 N501Y B.1.621 1337076
23 21762 A67V B.1.1.529 20664
24 23525 H655Y B.1.1.529 123703
25 22995 T478K B.1.1.529 2815171
26 22992 S477N B.1.1.529 81223
27 24469 N969K B.1.1.529 896
28 21986 G142D B.1.1.529 1938
29 22599 R346K B.1.621 16372
30 23854 N764K B.1.1.529 6387
31 21846 T95I B.1.621 1139920
32 23202 T547K B.1.1.529 8405
33 23012 E484K B.1.621 250540
34 22673 L212I B.1.1.529 299
35 23063 N501Y B.1.1.529 1337076
36 22813 K417N B.1.1.529 48474
In [18]:
merged_def.columns
Out[18]:
Index(['pos', 'ref', 'alt', 'seqid', 'virus_name', 'accession_id',
       'collection_date', 'submission_date', 'patient_age', 'n_content',
       'gc_content', 'sequence_length', 'type_code', 'location_code',
       'additional_location_information_code', 'host_code', 'gender_code',
       'clade_code', 'pango_lineage_code', 'pangolin_version_code',
       'variant_code', 'is_reference_code', 'is_complete_code',
       'is_high_coverage_code', 'is_low_coverage_code', 'weekdate'],
      dtype='object')
## TOO SLOW %%time query = '''SELECT m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt FROM merged_gisaid_20211126 m JOIN location_20211126 l ON l.location_code=m.location_code JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code GROUP BY m.weekdate, l.location, p.pango_lineage ''' total_weekly_stat = psql(query)
In [48]:
%%time
query = '''SELECT * FROM samplesperweek_gisaid_20211126 '''

total_weekly_stat = psql(query)
CPU times: user 39.4 ms, sys: 8.75 ms, total: 48.1 ms
Wall time: 74.4 ms
In [21]:
%%time
query = '''SELECT o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage '''

omicron_weekly_stat = psql(query)

## 3min 35s
CPU times: user 2.52 s, sys: 1.16 s, total: 3.68 s
Wall time: 3min 35s
In [22]:
omicron_weekly_stat.shape
Out[22]:
(1340704, 6)
In [94]:
#omicron_weekly_stat.head()
In [51]:
weeklyStat = omicron_weekly_stat.groupby(['weekdate','pos','amino_acid_change']).sum().reset_index().rename(columns={0:'cnt'})
In [ ]:
weeklyStat = pd.merge(weeklyStat,total_weekly_stat.rename(columns={'cnt':'cntTotal'}),on='weekdate')
#weeklyStat
In [46]:
d1 = datetime.date(2020,1,1) #datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.date(2021,11,30) #datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
In [60]:
plt.figure(figsize=[16,8])


_ = total_weekly_stat[(total_weekly_stat.weekdate>=d1) & (total_weekly_stat.weekdate<=d2)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label='all samples')
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('All samples per week',fontsize='xx-large')
Out[60]:
Text(0.5, 1.0, 'All samples per week')
In [35]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[35]:
Text(0.5, 1.0, 'Omicron mutations')
In [36]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.semilogy(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[36]:
Text(0.5, 1.0, 'Omicron mutations')
In [54]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[54]:
Text(0.5, 1.0, 'Omicron mutations')
In [57]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    if (_.cnt.values/_.cntTotal.values).max()>0.1:
        plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations  (only fractions >0.1)',fontsize='xx-large')
Out[57]:
Text(0.5, 1.0, 'Omicron mutations  (only fractions >0.1)')
In [58]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    if (_.cnt.values/_.cntTotal.values).max()<=0.1:
        plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations  (only fractions <=0.1)',fontsize='xx-large')
Out[58]:
Text(0.5, 1.0, 'Omicron mutations  (only fractions <=0.1)')
In [55]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.semilogy(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[55]:
Text(0.5, 1.0, 'Omicron mutations')

Locations

In [61]:
omicron_weekly_stat.head()
Out[61]:
pos amino_acid_change weekdate location pango_lineage cnt
0 21762 A67V 2010-12-06 Asia / Cambodia / Steung Treng A 2
1 21762 A67V 2020-01-06 Africa / Nigeria / Edo State B.1.525 1
2 21762 A67V 2020-01-06 Europe / Gibraltar B.1.177.53 4
3 21762 A67V 2020-01-06 Europe / Greece B.1.1.70 3
4 21762 A67V 2020-01-06 Europe / United Kingdom / England B.1.1 1
In [68]:
omicron_def[omicron_def.WHO_label=='Omicron'].shape
Out[68]:
(30, 15)

Countries that have certain number of mutations in any of their samples

In [69]:
_ = omicron_weekly_stat.groupby(['location','amino_acid_change']).size().reset_index()
_.groupby('location').size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False).head(30)
Out[69]:
location cnt
15931 Europe / United Kingdom / England 30
18781 North America / USA / Texas / Houston 29
17068 North America / USA / Florida 29
166 Africa / Botswana / South East / Greater Gabor... 29
957 Africa / South Africa / Gauteng 28
10270 Europe / Italy / Campania 28
7897 Europe / Denmark / Hovedstaden 28
18173 North America / USA / New York 28
15964 North America / Canada / British Columbia 27
17700 North America / USA / Massachusetts 27
15940 Europe / United Kingdom / Scotland 27
16819 North America / USA / California 27
9797 Europe / Germany / North Rhine-Westphalia 27
17728 North America / USA / Michigan 27
19235 Oceania / Australia / Victoria 26
9697 Europe / Germany / Baden-Wurttemberg 26
9696 Europe / Germany 26
17487 North America / USA / Indiana 26
19030 North America / USA / Washington 26
18609 North America / USA / Texas 26
9714 Europe / Germany / Bavaria 25
18220 North America / USA / New York / New York City 25
17740 North America / USA / Minnesota 25
18130 North America / USA / New Jersey 25
16974 North America / USA / Colorado 25
19015 North America / USA / Utah 25
18322 North America / USA / Ohio 25
15943 Europe / United Kingdom / Wales 25
15882 Europe / Turkey 25
15686 Europe / Sweden / Uppsala 24

The same in weekly

In [74]:
_ = omicron_weekly_stat.groupby(['weekdate','location','amino_acid_change']).size().reset_index()
weekly_num_of = _.groupby(['weekdate','location']).size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False)
weekly_num_of
Out[74]:
weekdate location cnt
137955 2021-11-08 Africa / Botswana / South East / Greater Gabor... 29
138565 2021-11-15 Africa / South Africa / Gauteng 27
139398 2022-01-03 Europe / United Kingdom / England 26
138313 2021-11-08 Europe / United Kingdom / England 26
36936 2021-01-25 Europe / United Kingdom / England 26
... ... ... ...
22402 2020-11-02 Europe / Netherlands / Groningen 1
22405 2020-11-02 Europe / Netherlands / Limburg / Brunssum 1
22407 2020-11-02 Europe / Netherlands / Limburg / Heerlen 1
90311 2021-06-28 North America / USA / Washington / Benton County 1
139819 2022-01-03 South America / Venezuela / La Guaira 1

139820 rows × 3 columns

Top countries with most mutations present weekly

In [79]:
topCountries = weekly_num_of.groupby('location').max().reset_index().sort_values('cnt',ascending=False)
topCountries.head(30)
Out[79]:
location weekdate cnt
166 Africa / Botswana / South East / Greater Gabor... 2021-11-08 29
957 Africa / South Africa / Gauteng 2022-01-03 27
15931 Europe / United Kingdom / England 2022-01-03 26
15686 Europe / Sweden / Uppsala 2022-01-03 21
17068 North America / USA / Florida 2022-01-03 20
16819 North America / USA / California 2022-01-03 20
17487 North America / USA / Indiana 2022-01-03 19
18130 North America / USA / New Jersey 2022-01-03 19
9797 Europe / Germany / North Rhine-Westphalia 2022-01-03 18
18781 North America / USA / Texas / Houston 2022-01-03 18
17728 North America / USA / Michigan 2022-01-03 18
15882 Europe / Turkey 2021-11-08 18
9696 Europe / Germany 2022-01-03 18
18220 North America / USA / New York / New York City 2022-01-03 17
17740 North America / USA / Minnesota 2022-01-03 17
15964 North America / Canada / British Columbia 2022-01-03 17
17700 North America / USA / Massachusetts 2022-01-03 17
18609 North America / USA / Texas 2022-01-03 17
9697 Europe / Germany / Baden-Wurttemberg 2022-01-03 17
18447 North America / USA / Pennsylvania 2022-01-03 16
18173 North America / USA / New York 2022-01-03 16
19030 North America / USA / Washington 2022-01-03 16
18580 North America / USA / Tennessee 2022-01-03 16
17189 North America / USA / Illinois 2022-01-03 16
15940 Europe / United Kingdom / Scotland 2022-01-03 16
15970 North America / Canada / Ontario 2022-01-03 16
15943 Europe / United Kingdom / Wales 2022-01-03 16
17688 North America / USA / Maryland 2022-01-03 16
16914 North America / USA / California / San Diego C... 2021-11-08 15
9826 Europe / Germany / Saxony 2021-11-08 15
In [93]:
plt.figure(figsize=[16,8])


for i,c in enumerate(topCountries.head(10).location.values):
    _ = weekly_num_of[(weekly_num_of.weekdate>=d1) & (weekly_num_of.weekdate<=d2) & (weekly_num_of.location==c)].sort_values('weekdate')
    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    if i<3:
        plt.plot(_.weekdate,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate,_.cnt,label=lab,alpha=0.5)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of Omicron mutations present in country',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
Out[93]:
Text(0.5, 1.0, 'Omicron mutations. Top 10 countries with most mutations present')

Multiple simultaneous mutations

In [96]:
# TODO: this should not be a cleartext password
passwd = 'XXXXXXXX'
In [97]:
%%time
query = '''SELECT m.seqID, m.weekdate, l.location, COUNT(*) as cnt 
INTO omicron_persample_20211126
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY m.seqID, m.weekdate, l.location '''

psql_execute(query)
# Wall time: 13min 22s
CPU times: user 34.5 ms, sys: 16.3 ms, total: 50.8 ms
Wall time: 13min 22s
Out[97]:
True
In [102]:
query = '''SELECT COUNT(*) FROM omicron_persample_20211126 WHERE cnt>5'''
psql(query)
Out[102]:
count
0 299
In [148]:
%%time
query = '''SELECT m.*,o.cnt,l.location,p.pango_lineage FROM merged_gisaid_20211126 m JOIN 
(SELECT * FROM omicron_persample_20211126 WHERE cnt>5) o
ON m.seqID=o.seqID 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE m.weekdate> '2020-1-1'
'''
samplesWithMoreThan5 = psql(query)
CPU times: user 1.32 s, sys: 176 ms, total: 1.49 s
Wall time: 53.9 s
In [149]:
s5s = samplesWithMoreThan5.groupby(['weekdate','seqid']).max().reset_index()
s5s.sort_values('cnt',ascending=False).head(20)
Out[149]:
weekdate seqid pos ref alt virus_name accession_id collection_date submission_date patient_age ... pango_lineage_code pangolin_version_code variant_code is_reference_code is_complete_code is_high_coverage_code is_low_coverage_code cnt location pango_lineage
285 2021-11-08 6670244 28883 t t hCoV-19/Botswana/R40B61_BHP_3321001244/2021 ... EPI_ISL_6670244 2021-11-11 2021-11-24 64.0 ... 388 3 6 1 0 1 1 29 Africa / Botswana / South East / Greater Gabor... B.1.1.263
283 2021-11-08 6640919 28883 t t hCoV-19/Botswana/R40B58_BHP_3321001245/2021 ... EPI_ISL_6640919 2021-11-11 2021-11-23 33.0 ... 388 3 6 1 0 1 1 29 Africa / Botswana / South East / Greater Gabor... B.1.1.263
282 2021-11-08 6640917 28883 t t hCoV-19/Botswana/R40B60_BHP_3321001247/2021 ... EPI_ISL_6640917 2021-11-11 2021-11-23 64.0 ... 388 3 6 1 0 1 1 29 Africa / Botswana / South East / Greater Gabor... B.1.1.263
281 2021-11-08 6640916 28883 t t hCoV-19/Botswana/R40B59_BHP_3321001248/2021 ... EPI_ISL_6640916 2021-11-11 2021-11-23 57.0 ... 388 3 6 1 0 1 1 28 Africa / Botswana / South East / Greater Gabor... B.1.1.263
290 2021-11-15 6647961 28883 t t hCoV-19/South Africa/NICD-N21606-DX64492/2021 ... EPI_ISL_6647961 2021-11-15 2021-11-23 38.0 ... 1471 3 6 1 0 1 1 27 Africa / South Africa / Gauteng None
291 2021-11-15 6647962 28883 t t hCoV-19/South Africa/NICD-N21607-DX64624/2021 ... EPI_ISL_6647962 2021-11-16 2021-11-23 0.0 ... 1471 3 6 1 0 1 1 20 Africa / South Africa / Gauteng None
289 2021-11-15 6647960 28883 t t hCoV-19/South Africa/NICD-N21605-DX64490/2021 ... EPI_ISL_6647960 2021-11-15 2021-11-23 21.0 ... 388 3 6 1 0 1 1 20 Africa / South Africa / Gauteng B.1.1.263
288 2021-11-15 6647959 28883 t t hCoV-19/South Africa/NICD-N21604-DX64219/2021 ... EPI_ISL_6647959 2021-11-16 2021-11-23 29.0 ... 388 3 6 1 0 1 1 20 Africa / South Africa / Gauteng B.1.1.263
287 2021-11-15 6647958 28883 t t hCoV-19/South Africa/NICD-N21603-DX64204/2021 ... EPI_ISL_6647958 2021-11-16 2021-11-23 35.0 ... 388 3 6 1 0 1 1 20 Africa / South Africa / Gauteng B.1.1.263
286 2021-11-15 6647957 28883 t t hCoV-19/South Africa/NICD-N21602-DX040380/2021... EPI_ISL_6647957 2021-11-15 2021-11-23 28.0 ... 1471 3 6 1 0 1 1 20 Africa / South Africa / Gauteng None
284 2021-11-08 6647956 28883 t t hCoV-19/South Africa/NICD-N21600-DX03569/2021 ... EPI_ISL_6647956 2021-11-14 2021-11-23 27.0 ... 1471 3 6 1 0 1 1 20 Africa / South Africa / Gauteng None
1 2020-06-01 1699445 29769 t t hCoV-19/bat/Yunnan/RmYN05/2020 ... EPI_ISL_1699445 2020-05-25 2021-04-23 NaN ... 1471 3 6 1 0 0 1 13 Asia / China / Yunnan / Xishuangbanna None
0 2020-04-20 1699444 29769 t t hCoV-19/bat/Yunnan/RsYN04/2020 ... EPI_ISL_1699444 2020-04-18 2021-04-23 NaN ... 1471 3 6 1 0 0 1 13 Asia / China / Yunnan / Xishuangbanna None
6 2020-07-20 1699448 29769 t t hCoV-19/bat/Yunnan/RmYN08/2020 ... EPI_ISL_1699448 2020-07-14 2021-04-23 NaN ... 1471 3 6 1 0 1 1 13 Asia / China / Yunnan / Xishuangbanna None
8 2020-09-07 754195 29884 t t hCoV-19/Georgia/Tb-72720/2020 ... EPI_ISL_754195 2020-09-02 2020-12-31 NaN ... 252 3 6 1 1 0 1 12 Asia / Georgia / Tbilisi B.1.1
3 2020-06-08 1699447 29769 t t hCoV-19/bat/Yunnan/RmYN07/2020 ... EPI_ISL_1699447 2020-06-03 2021-04-23 NaN ... 1471 3 6 1 0 0 1 12 Asia / China / Yunnan / Xishuangbanna None
15 2021-03-15 1663018 29891 t t hCoV-19/USA/IN-ISDHQ54/2021 ... EPI_ISL_1663018 2021-03-19 2021-04-21 48.0 ... 842 3 6 1 1 1 1 11 North America / USA / Indiana B.1.2
16 2021-03-15 1663023 29891 t t hCoV-19/USA/IN-ISDHQ73/2021 ... EPI_ISL_1663023 2021-03-18 2021-04-21 28.0 ... 1471 3 6 1 1 1 1 11 North America / USA / Indiana None
7 2020-08-03 1699449 29835 t t hCoV-19/bat/Yunnan/RsYN09/2020 ... EPI_ISL_1699449 2020-07-28 2021-04-23 NaN ... 1471 3 6 1 0 0 1 10 Asia / China / Yunnan / Xishuangbanna None
20 2021-04-26 3267718 29872 t y hCoV-19/Sweden/Uppsala-000199821415N2/2021 ... EPI_ISL_3267718 2021-04-28 2021-08-07 NaN ... 1523 3 6 1 1 0 1 10 Europe / Sweden / Uppsala nan

20 rows × 29 columns

In [150]:
samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)
Out[150]:
location cnt
0 Africa / Botswana / South East / Greater Gabor... 29
2 Africa / South Africa / Gauteng 27
7 Asia / China / Yunnan / Xishuangbanna 13
8 Asia / Georgia / Tbilisi 12
66 North America / USA / Indiana 11
32 Europe / Sweden / Uppsala 10
12 Asia / Laos 9
4 Africa / South Africa / Western Cape 7
5 Asia / Bangladesh / Dhaka / Cantonment 7
34 Europe / Turkey 7
In [165]:
dummy
Out[165]:
location cnt
0 Africa / Botswana / South East / Greater Gabor... 29
2 Africa / South Africa / Gauteng 27
7 Asia / China / Yunnan / Xishuangbanna 13
8 Asia / Georgia / Tbilisi 12
66 North America / USA / Indiana 11
32 Europe / Sweden / Uppsala 10
12 Asia / Laos 9
4 Africa / South Africa / Western Cape 7
5 Asia / Bangladesh / Dhaka / Cantonment 7
34 Europe / Turkey 7
In [174]:
plt.figure(figsize=[16,8])

dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)

for i,c in enumerate(dummy.location.values):
    #_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
    #print(c)
    _ = s5s[ (s5s.location==c)].sort_values('weekdate')

    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    lab = lab+str(max(_.cnt.values))
    if i<5:
        plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
    
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')

plt.ylim(0,30)
#plt.xlim(d1,d2)
Out[174]:
(0, 30)
In [175]:
plt.figure(figsize=[16,8])

dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(20)

for i,c in enumerate(dummy.location.values):
    #_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
    #print(c)
    _ = s5s[ (s5s.location==c)].sort_values('weekdate')

    lab = c.split('/')[1]
    if lab==' USA ':
        lab = c.split('/')[-1]
    lab = lab+str(max(_.cnt.values))
    if i<5:
        plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
    else:
        plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
    
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
plt.ylim(0,30)
#plt.xlim(d1,d2)
Out[175]:
(0, 30)
In [ ]: