import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
import datetime
import time
import io
import tarfile
from sqlalchemy import create_engine
def psql(query):
conn = psycopg2.connect(
dbname='coveo',
host='pg.coveo',
user='***',
password='*****',
options="-c search_path=gisaid"
)
dummy = sqlio.read_sql_query(query, conn)
conn.close()
return dummy
def psql_execute(query):
conn = None
conn = psycopg2.connect(
dbname='coveo',
host='pg.coveo',
user='****',
password=*****,
options="-c search_path=gisaid"
)
conn.autocommit = True
cursor = conn.cursor()
cursor.execute("SET search_path TO gisaid")
cursor.execute(query)
conn.close()
return True
%%time
query = '''SELECT *
FROM merged_gisaid_20211126 LIMIT 10 '''
merged_def = psql(query)
merged_def
%%time
query = '''SELECT *
FROM omicron_20211129 o '''
omicron_def = psql(query)
omicron_def.columns
omicron_def
%%time
query = '''SELECT o.pos, o.amino_acid_change, o.pango, COUNT(*) as cnt
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos
GROUP BY o.pos, o.amino_acid_change, o.pango'''
omicro_mu_gisaid_stat = psql(query)
omicro_mu_gisaid_stat
merged_def.columns
%%time
query = '''SELECT * FROM samplesperweek_gisaid_20211126 '''
total_weekly_stat = psql(query)
%%time
query = '''SELECT o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage, COUNT(*) as cnt
FROM omicron_20211129 o JOIN merged_gisaid_20211126 m on o.pos=m.pos
JOIN location_20211126 l ON l.location_code=m.location_code
JOIN pango_lineage_20211126 p ON p.pango_lineage_code = m.pango_lineage_code
WHERE o.pango = 'B.1.1.529'
GROUP BY o.pos, o.amino_acid_change, m.weekdate, l.location, p.pango_lineage '''
omicron_weekly_stat = psql(query)
## 3min 35s
omicron_weekly_stat.shape
#omicron_weekly_stat.head()
weeklyStat = omicron_weekly_stat.groupby(['weekdate','pos','amino_acid_change']).sum().reset_index().rename(columns={0:'cnt'})
weeklyStat = pd.merge(weeklyStat,total_weekly_stat.rename(columns={'cnt':'cntTotal'}),on='weekdate')
#weeklyStat
d1 = datetime.date(2020,1,1) #datetime.datetime.strptime("2020-05-01", "%Y-%m-%d")
d2 = datetime.date(2021,11,30) #datetime.datetime.strptime("2021-11-10", "%Y-%m-%d")
plt.figure(figsize=[16,8])
_ = total_weekly_stat[(total_weekly_stat.weekdate>=d1) & (total_weekly_stat.weekdate<=d2)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label='all samples')
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('All samples per week',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.semilogy(_.weekdate,_.cnt,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly # of samples',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')
plt.figure(figsize=[16,8])
for a in pd.unique(weeklyStat.amino_acid_change):
_ = weeklyStat[(weeklyStat.weekdate>=d1) & (weeklyStat.weekdate<=d2) & (weeklyStat.amino_acid_change==a)].sort_values('weekdate')
plt.plot(_.weekdate,_.cnt/_.cntTotal,label=a)
plt.legend(fontsize='x-large', ncol=3,handleheight=2.4, labelspacing=0.05)
plt.ylabel('Weekly ratio of samples with mutation',fontsize='xx-large')
plt.xlabel('date',fontsize='xx-large')
plt.xticks(fontsize='xx-large')
plt.yticks(fontsize='xx-large')
plt.title('Omicron mutations',fontsize='xx-large')