Find mutations that had low average allele frequency (AF) a few weeks ago, but now have 0.5 larger AF. We do not distinguish type of mutations (indels, etc.), only count any type at the position where it is registered in VCF.
# load python libraries
import pandas as pd
import pandas.io.sql as sqlio
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
# this connects to database and executes a query
def psql(query):
conn = psycopg2.connect("dbname=***** user=*** password=**** host=postgres.coveo port=5432")
dummy = sqlio.read_sql_query(query, conn)
conn.close()
return dummy
We build the query by incrementally filtering the whole data set of mutations. To do so wr run sub-queries and create temporary data tables.
Select reliable mutations based on Poisson statistics, allele frequency > 3 times (random) standard deviation. Calculate the number of occurances and average AF on weekly basis
query0 = '''
CREATE TEMP TABLE IF NOT EXISTS weekly_AF AS
SELECT pos,date_isoweek, date_isoyear, AVG(af) as avgAF, STDDEV(af) as stdAF, COUNT(*) as cnt
FROM vcf_all JOIN meta on vcf_all.ena_run = meta.ena_run
WHERE ann_num=1
AND af>3/SQRT(dp)
AND clean_host = 'Homo sapiens'
GROUP BY pos,date_isoyear,date_isoweek;
'''
Subset for the genomic positions that have mutation at least in 200 of the collected samples
query1 = '''
CREATE TEMP TABLE IF NOT EXISTS frequent_pos AS
SELECT pos FROM weekly_AF
GROUP BY pos
HAVING SUM(cnt)>200;
'''
query2 = '''
CREATE TEMP TABLE IF NOT EXISTS frequent_AF AS
SELECT w.* FROM weekly_AF w JOIN frequent_pos f
ON w.pos = f.pos;
'''
Get the average AF for each of the selected genomic positions for the last 10 weeks and for the previous weeks, respectively
query3 = '''
CREATE TEMP TABLE IF NOT EXISTS last_weeks_AF AS
SELECT pos, AVG(avgAF) AS avg_AF FROM frequent_AF
WHERE date_isoyear=2021 AND date_isoweek>24
GROUP BY pos
HAVING SUM(cnt) > 50;
'''
query4 = '''
CREATE TEMP TABLE IF NOT EXISTS previous_weeks_AF AS
SELECT pos, AVG(avgAF) AS avg_AF FROM frequent_AF
WHERE NOT (date_isoyear=2021 AND date_isoweek>24)
GROUP BY pos
HAVING SUM(cnt) > 50;
'''
Find genomic positions with significant recent jump (>0.5) in AF
query5 = '''
CREATE TEMP TABLE IF NOT EXISTS jump_AF AS
SELECT p.pos
FROM last_weeks_AF l JOIN previous_weeks_AF p
ON l.pos = p.pos
WHERE l.avg_AF - p.avg_AF > 0.5;
'''
For the selected genomic positions with "jumping AF" collect detailed data, with individual AF values, country of origin, sequencing platform
query6 = '''
SELECT vcf_all.pos,meta.clean_collection_date, vcf_all.af, clean_country,instrument_platform
FROM vcf_all JOIN meta ON vcf_all.ena_run = meta.ena_run
JOIN jump_AF ON vcf_all.pos = jump_AF.pos
WHERE ann_num=1
AND af>3/SQRT(dp)
AND clean_host = 'Homo sapiens'
'''
It takes around 5 minutes
%%time
query = query0 + query1 + query2 + query3 + query4 + query5 + query6
growing_AF = psql(query)
np.sort(pd.unique(growing_AF.pos))
Plot symbol colors mark the sequencing platform. We use the same time and AF scale on all figures.
# date type conversion needed for plotting
growing_AF['datetime'] = growing_AF['clean_collection_date'].apply(pd.to_datetime)
for i in np.sort(pd.unique(growing_AF.pos)):
_ = growing_AF[growing_AF.pos==i]
plt.figure(figsize=(16,3))
_ = _[_.instrument_platform=='ILLUMINA']
plt.plot(_.datetime, _.af,'.',label='ILLUMINA pos='+str(i), alpha=0.5)
_ = growing_AF[growing_AF.pos==i]
_ = _[_.instrument_platform!='ILLUMINA']
plt.plot(_.datetime, _.af,'.',label='NANOPORE pos='+str(i), alpha=0.5)
plt.xlabel("date")
plt.ylabel("allele freq")
plt.xlim(pd.to_datetime('2020-01-01'),pd.to_datetime('2021-10-01'))
plt.ylim(0,1.1)
plt.legend()
plt.show()
Plot symbol colors mark the country where the sequencing was done and (most probably) the samples were collected. We use the same time and AF scale on all figures.
for i in np.sort(pd.unique(growing_AF.pos)):
_ = growing_AF[growing_AF.pos==i]
plt.figure(figsize=(16,3))
_ = _[_.clean_country=='United Kingdom']
plt.plot(_.datetime, _.af,'.',label='UK pos='+str(i), alpha=0.5)
_ = growing_AF[growing_AF.pos==i]
_ = _[_.clean_country!='United Kingdom']
plt.plot(_.datetime, _.af,'.',label='non-UK pos='+str(i), alpha=0.5)
plt.xlabel("date")
plt.ylabel("allele freq")
plt.xlim(pd.to_datetime('2020-01-01'),pd.to_datetime('2021-10-01'))
plt.ylim(0,1.1)
plt.legend()
plt.show()