import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
import datetime
import time
import io
import tarfile
from sqlalchemy import create_engine
def psql(query):
conn = psycopg2.connect(
dbname='coveo',
host='pg.coveo',
user='***',
password='*****',
options="-c search_path=gisaid"
)
dummy = sqlio.read_sql_query(query, conn)
conn.close()
return dummy
def psql_execute(query):
conn = None
conn = psycopg2.connect(
dbname='coveo',
host='pg.coveo',
user='****',
password=*****,
options="-c search_path=gisaid"
)
conn.autocommit = True
cursor = conn.cursor()
cursor.execute("SET search_path TO gisaid")
cursor.execute(query)
conn.close()
return True
%%time
query = '''SELECT *
FROM merged_gisaid_20211126 LIMIT 10 '''
merged_def = psql(query)
merged_def
%%time
query = '''SELECT *
FROM omicron_20211129 o '''
omicron_def = psql(query)
omicron_def.columns
omicron_def
%%time
query = '''SELECT o.pos, o.amino_acid_change, o.pango, COUNT(*) as cnt
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos
GROUP BY o.pos, o.amino_acid_change, o.pango'''
omicro_mu_gisaid_stat = psql(query)
omicro_mu_gisaid_stat
merged_def.columns
%%time
query = '''SELECT * FROM samplesperweek_gisaid_20211126 '''
total_weekly_stat = psql(query)
%%time
query = '''SELECT o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage '''
omicron_weekly_stat = psql(query)
## 3min 35s
omicron_weekly_stat.shape
#omicron_weekly_stat.head()
weeklyStat = omicron_weekly_stat.groupby(['weekdate','pos','amino_acid_change']).sum().reset_index().rename(columns={0:'cnt'})
weeklyStat = pd.merge(weeklyStat,total_weekly_stat.rename(columns={'cnt':'cntTotal'}),on='weekdate')
#weeklyStat
d1 = datetime.date(2020,1,1) #datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.date(2021,11,30) #datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
plt.figure(figsize=[16,8])
_ = total_weekly_stat[(total_weekly_stat.weekdate>=d1) & (total_weekly_stat.weekdate<=d2)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label='all samples')
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('All samples per week',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.semilogy(_.weekdate,_.cnt,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
if (_.cnt.values/_.cntTotal.values).max()>0.1:
plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations (only fractions >0.1)',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
if (_.cnt.values/_.cntTotal.values).max()<=0.1:
plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations (only fractions <=0.1)',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.semilogy(_.weekdate,_.cnt/_.cntTotal,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
omicron_weekly_stat.head()
omicron_def[omicron_def.WHO_label=='Omicron'].shape
_ = omicron_weekly_stat.groupby(['location','amino_acid_change']).size().reset_index()
_.groupby('location').size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False).head(30)
_ = omicron_weekly_stat.groupby(['weekdate','location','amino_acid_change']).size().reset_index()
weekly_num_of = _.groupby(['weekdate','location']).size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt',ascending=False)
weekly_num_of
topCountries = weekly_num_of.groupby('location').max().reset_index().sort_values('cnt',ascending=False)
topCountries.head(30)
plt.figure(figsize=[16,8])
for i,c in enumerate(topCountries.head(10).location.values):
_ = weekly_num_of[(weekly_num_of.weekdate>=d1) & (weekly_num_of.weekdate<=d2) & (weekly_num_of.location==c)].sort_values('weekdate')
lab = c.split('/')[1]
if lab==' USA ':
lab = c.split('/')[-1]
if i<3:
plt.plot(_.weekdate,_.cnt,'o-',label=lab)
else:
plt.plot(_.weekdate,_.cnt,label=lab,alpha=0.5)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of Omicron mutations present in country',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
# TODO: this should not be a cleartext password
passwd = 'XXXXXXXX'
%%time
query = '''SELECT m.seqID, m.weekdate, l.location, COUNT(*) as cnt
INTO omicron_persample_20211126
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY m.seqID, m.weekdate, l.location '''
psql_execute(query)
# Wall time: 13min 22s
query = '''SELECT COUNT(*) FROM omicron_persample_20211126 WHERE cnt>5'''
psql(query)
%%time
query = '''SELECT m.*,o.cnt,l.location,p.pango_lineage FROM merged_gisaid_20211126 m JOIN
(SELECT * FROM omicron_persample_20211126 WHERE cnt>5) o
ON m.seqID=o.seqID
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE m.weekdate> '2020-1-1'
'''
samplesWithMoreThan5 = psql(query)
s5s = samplesWithMoreThan5.groupby(['weekdate','seqid']).max().reset_index()
s5s.sort_values('cnt',ascending=False).head(20)
samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)
dummy
plt.figure(figsize=[16,8])
dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(10)
for i,c in enumerate(dummy.location.values):
#_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
#print(c)
_ = s5s[ (s5s.location==c)].sort_values('weekdate')
lab = c.split('/')[1]
if lab==' USA ':
lab = c.split('/')[-1]
lab = lab+str(max(_.cnt.values))
if i<5:
plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
else:
plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
plt.ylim(0,30)
#plt.xlim(d1,d2)
plt.figure(figsize=[16,8])
dummy = samplesWithMoreThan5.groupby('location').max().reset_index()[['location','cnt']].sort_values('cnt',ascending=False).head(20)
for i,c in enumerate(dummy.location.values):
#_ = s5s[(s5s.weekdate>=d1) & (s5s.weekdate<=d2) & (s5s.location==c)].sort_values('weekdate')
#print(c)
_ = s5s[ (s5s.location==c)].sort_values('weekdate')
lab = c.split('/')[1]
if lab==' USA ':
lab = c.split('/')[-1]
lab = lab+str(max(_.cnt.values))
if i<5:
plt.plot(_.weekdate.values,_.cnt,'o-',label=lab)
else:
plt.plot(_.weekdate.values,_.cnt,'x-',label=lab,alpha=0.5)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('# of Omicron mutations present in single samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations. Top 10 countries with most mutations present',fontsize='xx-large')
plt.ylim(0,30)
#plt.xlim(d1,d2)