Omicron and nu mutations in GISAID 20211126 version

In [3]:
import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import psycopg2

import datetime
import time
import io

import tarfile

from sqlalchemy import create_engine
In [95]:
def psql(query):
    conn = psycopg2.connect(
            dbname='coveo',
            host='pg.coveo',
            user='***',
            password='*****',
            options="-c search_path=gisaid"
        )
    dummy = sqlio.read_sql_query(query, conn)
    conn.close()
    return dummy

def psql_execute(query):
    conn = None

    conn = psycopg2.connect(
        dbname='coveo',
        host='pg.coveo',
        user='****',
        password=*****,
        options="-c search_path=gisaid"
    )
    conn.autocommit = True

    cursor = conn.cursor()
    cursor.execute("SET search_path TO gisaid")
    cursor.execute(query)
        
    conn.close()
    return True
In [17]:
%%time
query = '''SELECT * 
FROM merged_gisaid_20211126 LIMIT 10 '''
merged_def = psql(query)
merged_def
CPU times: user 9.75 ms, sys: 3.45 ms, total: 13.2 ms
Wall time: 18.6 ms
Out[17]:
pos ref alt seqid virus_name accession_id collection_date submission_date patient_age n_content ... gender_code clade_code pango_lineage_code pangolin_version_code variant_code is_reference_code is_complete_code is_high_coverage_code is_low_coverage_code weekdate
0 34 a t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
1 35 a t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
2 36 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
3 37 c a 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
4 241 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
5 1059 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
6 3037 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
7 14408 c t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
8 23403 a g 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23
9 25563 g t 424246 hCoV-19/USA/WA-UW-1758/2020 ... EPI_ISL_424246 2020-03-21 2020-04-12 None 0.000368 ... 330 1 1044 3 6 1 0 0 1 2020-03-23

10 rows × 26 columns

In [9]:
%%time
query = '''SELECT * 
FROM omicron_20211129 o '''
omicron_def = psql(query)
CPU times: user 0 ns, sys: 9.78 ms, total: 9.78 ms
Wall time: 14.9 ms
In [10]:
omicron_def.columns
Out[10]:
Index(['index', 'WHO_label', 'pango', 'type_variant', 'description',
       'amino_acid_change', 'protein_codon_position', 'REF_protein',
       'ALT_protein', 'gene', 'effect', 'ref_pos_alt', 'ref', 'alt', 'pos'],
      dtype='object')

Defining positions

In [15]:
omicron_def
Out[15]:
index WHO_label pango type_variant description amino_acid_change protein_codon_position REF_protein ALT_protein gene effect ref_pos_alt ref alt pos
0 0 Mu B.1.621 VOI This variant was first detected in Colombia in... T95I 95 T I S missense_variant C21846T C T 21846
1 1 Mu B.1.621 VOI This variant was first detected in Colombia in... R346K 346 R K S missense_variant G22599A G A 22599
2 2 Mu B.1.621 VOI This variant was first detected in Colombia in... E484K 484 E K S missense_variant G23012A G A 23012
3 3 Mu B.1.621 VOI This variant was first detected in Colombia in... N501Y 501 N Y S missense_variant A23063T A T 23063
4 4 Mu B.1.621 VOI This variant was first detected in Colombia in... D614G 614 D G S missense_variant A23403G A G 23403
5 5 Mu B.1.621 VOI This variant was first detected in Colombia in... P681H 681 P H S missense_variant C23604A C A 23604
6 6 Mu B.1.621 VOI This variant was first detected in Colombia in... D950N 950 D N S missense_variant G24410A G A 24410
7 7 Omicron B.1.1.529 VOC This variant was first detected in South Afric... A67V 67 A V S missense_variant C21762T C T 21762
8 8 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T95I 95 T I S missense_variant C21846T C T 21846
9 9 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G142D 142 G D S disruptive_inframe_deletion GGTGTTTATT21986G GGTGTTTATT G 21986
10 10 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G339D 339 G D S missense_variant G22578A G A 22578
11 11 Omicron B.1.1.529 VOC This variant was first detected in South Afric... L212I 212 L I S missense_variant T22673C T C 22673
12 12 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S371L 371 S L S missense_variant C22674T C T 22674
13 13 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S373P 373 S P S missense_variant T22679C T C 22679
14 14 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S375F 375 S F S missense_variant C22686T C T 22686
15 15 Omicron B.1.1.529 VOC This variant was first detected in South Afric... K417N 417 K N S missense_variant G22813T G T 22813
16 16 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N440K 440 N K S missense_variant T22882G T G 22882
17 17 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G446S 446 G S S missense_variant G22898A G A 22898
18 18 Omicron B.1.1.529 VOC This variant was first detected in South Afric... S477N 477 S N S missense_variant G22992A G A 22992
19 19 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T478K 478 T K S missense_variant C22995A C A 22995
20 20 Omicron B.1.1.529 VOC This variant was first detected in South Afric... E484A 484 E A S missense_variant A23013C A C 23013
21 21 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q493R 493 Q R S missense_variant A23040G A G 23040
22 22 Omicron B.1.1.529 VOC This variant was first detected in South Afric... G496S 496 G S S missense_variant G23048A G A 23048
23 23 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q498R 498 Q R S missense_variant A23055G A G 23055
24 24 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N501Y 501 N Y S missense_variant A23063T A T 23063
25 25 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Y505H 505 Y H S missense_variant T23075C T C 23075
26 26 Omicron B.1.1.529 VOC This variant was first detected in South Afric... T547K 547 T K S missense_variant C23202A C A 23202
27 27 Omicron B.1.1.529 VOC This variant was first detected in South Afric... D614G 614 D G S missense_variant A23403G A G 23403
28 28 Omicron B.1.1.529 VOC This variant was first detected in South Afric... H655Y 655 H Y S missense_variant C23525T C T 23525
29 29 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N679K 679 N K S missense_variant T23599G T G 23599
30 30 Omicron B.1.1.529 VOC This variant was first detected in South Afric... P681H 681 P H S missense_variant C23604A C A 23604
31 31 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N764K 764 N K S missense_variant C23854A C A 23854
32 32 Omicron B.1.1.529 VOC This variant was first detected in South Afric... D796Y 796 D Y S missense_variant G23948T G T 23948
33 33 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N856K 856 N K S missense_variant C24130A C A 24130
34 34 Omicron B.1.1.529 VOC This variant was first detected in South Afric... Q954H 954 Q H S missense_variant A24424T A T 24424
35 35 Omicron B.1.1.529 VOC This variant was first detected in South Afric... N969K 969 N K S missense_variant T24469A T A 24469
36 36 Omicron B.1.1.529 VOC This variant was first detected in South Afric... L981F 981 L F S missense_variant C24503T C T 24503
In [13]:
%%time
query = '''SELECT o.pos, o.amino_acid_change, o.pango, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
GROUP BY o.pos, o.amino_acid_change, o.pango'''
omicro_mu_gisaid_stat = psql(query)
CPU times: user 7.38 ms, sys: 920 µs, total: 8.3 ms
Wall time: 38.3 s

Individual mutation occurences

In [14]:
omicro_mu_gisaid_stat
Out[14]:
pos amino_acid_change pango cnt
0 23604 P681H B.1.621 4143062
1 23048 G496S B.1.1.529 565
2 22686 S375F B.1.1.529 299
3 22674 S371L B.1.1.529 87
4 23948 D796Y B.1.1.529 12730
5 23403 D614G B.1.621 5369906
6 23604 P681H B.1.1.529 4143062
7 23055 Q498R B.1.1.529 164
8 22679 S373P B.1.1.529 478
9 22898 G446S B.1.1.529 834
10 24410 D950N B.1.621 2784322
11 24503 L981F B.1.1.529 122
12 23040 Q493R B.1.1.529 541
13 23075 Y505H B.1.1.529 178
14 23013 E484A B.1.1.529 1484
15 23403 D614G B.1.1.529 5369906
16 24424 Q954H B.1.1.529 211
17 22882 N440K B.1.1.529 10351
18 23599 N679K B.1.1.529 5482
19 24130 N856K B.1.1.529 63164
20 21846 T95I B.1.1.529 1139920
21 22578 G339D B.1.1.529 584
22 23063 N501Y B.1.621 1337076
23 21762 A67V B.1.1.529 20664
24 23525 H655Y B.1.1.529 123703
25 22995 T478K B.1.1.529 2815171
26 22992 S477N B.1.1.529 81223
27 24469 N969K B.1.1.529 896
28 21986 G142D B.1.1.529 1938
29 22599 R346K B.1.621 16372
30 23854 N764K B.1.1.529 6387
31 21846 T95I B.1.621 1139920
32 23202 T547K B.1.1.529 8405
33 23012 E484K B.1.621 250540
34 22673 L212I B.1.1.529 299
35 23063 N501Y B.1.1.529 1337076
36 22813 K417N B.1.1.529 48474
In [18]:
merged_def.columns
Out[18]:
Index(['pos', 'ref', 'alt', 'seqid', 'virus_name', 'accession_id',
       'collection_date', 'submission_date', 'patient_age', 'n_content',
       'gc_content', 'sequence_length', 'type_code', 'location_code',
       'additional_location_information_code', 'host_code', 'gender_code',
       'clade_code', 'pango_lineage_code', 'pangolin_version_code',
       'variant_code', 'is_reference_code', 'is_complete_code',
       'is_high_coverage_code', 'is_low_coverage_code', 'weekdate'],
      dtype='object')
## TOO SLOW %%time query = '''SELECT m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt FROM merged_gisaid_20211126 m JOIN location_20211126 l ON l.location_code=m.location_code JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code GROUP BY m.weekdate, l.location, p.pango_lineage ''' total_weekly_stat = psql(query)
In [48]:
%%time
query = '''SELECT * FROM samplesperweek_gisaid_20211126 '''

total_weekly_stat = psql(query)
CPU times: user 39.4 ms, sys: 8.75 ms, total: 48.1 ms
Wall time: 74.4 ms
In [21]:
%%time
query = '''SELECT o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt 
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos 
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage '''

omicron_weekly_stat = psql(query)

## 3min 35s
CPU times: user 2.52 s, sys: 1.16 s, total: 3.68 s
Wall time: 3min 35s
In [22]:
omicron_weekly_stat.shape
Out[22]:
(1340704, 6)
In [94]:
#omicron_weekly_stat.head()
In [51]:
weeklyStat = omicron_weekly_stat.groupby(['weekdate','pos','amino_acid_change']).sum().reset_index().rename(columns={0:'cnt'})
In [ ]:
weeklyStat = pd.merge(weeklyStat,total_weekly_stat.rename(columns={'cnt':'cntTotal'}),on='weekdate')
#weeklyStat
In [46]:
d1 = datetime.date(2020,1,1) #datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.date(2021,11,30) #datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
In [60]:
plt.figure(figsize=[16,8])


_ = total_weekly_stat[(total_weekly_stat.weekdate>=d1) & (total_weekly_stat.weekdate<=d2)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label='all samples')
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('All samples per week',fontsize='xx-large')
Out[60]:
Text(0.5, 1.0, 'All samples per week')
In [35]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[35]:
Text(0.5, 1.0, 'Omicron mutations')
In [36]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.semilogy(_.weekdate,_.cnt,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[36]:
Text(0.5, 1.0, 'Omicron mutations')
In [54]:
plt.figure(figsize=[16,8])


for a in pd.unique(weeklyStat.amino_acid_change):
    _ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
    plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
    
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
Out[54]:
Text(0.5, 1.0, 'Omicron mutations')