Bioinformatics_Project / functions.py
Rules99's picture
Bioinformatics project
9f8cc36
raw
history blame
9.12 kB
import numpy as np
import pickle
import pandas as pd
import requests
from selenium import webdriver
import matplotlib.pyplot as plt
#Simple assignment
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import requests
import os
import seaborn as sns
from collections import Counter
import plotly.express as px
import streamlit as st
### Scrape the COSMIC id information
# ### FRAMEWORKS NEEDED
def scrap(cosmic_ids=None, executable_path=None):
    """Scrape tissue, histology, zygosity and score data from the COSMIC site.

    A headless Firefox session searches the COSMIC website for every id and
    reads the first results table of each result page.

    Args:
        cosmic_ids: Iterable of COSMIC id strings. When an entry holds several
            comma-separated ids only the first one is searched. Defaults to
            ``cosmicinfo.reset_index()["COSMIC_ID"].iloc[20:]``, where
            ``cosmicinfo`` must exist at module level (it is not defined in
            the visible part of this file).
        executable_path: Path to the geckodriver binary. Defaults to the
            path that was originally hard-coded here.

    Returns:
        A list with one entry per searched id. Each entry is a list of rows;
        a row holds the cell texts of the 'Primary Tissue',
        'Primary Histology' and 'Zygosity' columns (in table column order)
        followed by the score. The entry is ``[]`` when the result page has
        no section list or no results table.
    """
    import time

    from selenium.webdriver.common.by import By

    cosmic_url = "https://cancer.sanger.ac.uk/cosmic"
    wanted_headers = ('Primary Tissue', 'Primary Histology', 'Zygosity')

    if executable_path is None:
        executable_path = r"C:\Users\Pablo\OneDrive\Documents\Documentos\Escuela Politécnica Superior Leganés\4 AÑO\ASIGNATURAS\1 CUATRI\WEB ANALYTICS\PART 2\Milestone3\geckodriver.exe"
    if cosmic_ids is None:
        # Same selection as before: skip the first 20 ids of the global table.
        cosmic_ids = cosmicinfo.reset_index()["COSMIC_ID"].iloc[20:]

    #### Setting options to the driver
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    browser = Firefox(options=options, executable_path=executable_path)

    def getinfofromtable(table_rows: list, score: float, headertable) -> list:
        """Return ``[<wanted cell texts...>, score]`` for every table row."""
        if not table_rows:
            return []
        # Resolve the wanted column positions once instead of once per cell.
        wanted = {headertable.index(name) for name in wanted_headers}
        rows = []
        for row in table_rows:
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            cols = [cell.text for i, cell in enumerate(cells) if i in wanted]
            cols.append(score)
            rows.append(cols)
        return rows

    def getinfocosmic(mutationid):
        """Search one id and return the rows of its first results table."""
        search = browser.find_element(By.ID, 'search-field')
        search = search.find_element(By.CLASS_NAME, "text_def")
        search.send_keys(mutationid)
        search.send_keys(Keys.RETURN)
        # Fixed wait for the result page to load.
        time.sleep(5)
        try:
            container = browser.find_element(By.ID, "section-list")
        except NoSuchElementException:
            return []
        text = container.text
        try:
            # The score is the text between the word "score" and the next ")".
            after_score = text[text.find("score") + len("score"):]
            score = float(after_score[:after_score.find(")")].strip())
        except ValueError:
            score = 0
        try:
            section = browser.find_element(By.ID, "DataTables_Table_0")
        except NoSuchElementException:
            # No results table: report "nothing found" instead of aborting
            # the whole scrape.
            return []
        header_cells = section.find_element(By.TAG_NAME, "thead").find_elements(By.TAG_NAME, "th")
        headertable = [header.text for header in header_cells]
        oddrows = section.find_elements(By.CLASS_NAME, "odd")
        evenrows = section.find_elements(By.CLASS_NAME, "even")
        found = getinfofromtable(oddrows, score, headertable)
        found.extend(getinfofromtable(evenrows, score, headertable))
        return found

    ## Looking for cosmic id info
    cosl = []
    try:
        browser.get(cosmic_url)
        for cos in cosmic_ids:
            # Keep only the first id of a comma-separated list.
            cos = cos.split(",")[0]
            cosl.append(getinfocosmic(cos))
            # Go back to the start page so the search field is available again.
            browser.get(cosmic_url)
    finally:
        # Always shut the driver down so no headless Firefox is left running.
        browser.quit()
    return cosl
### Pieplots
def pieplot(merging, id=0):
    """Draw a pie chart of how many samples carry a mutation in each gene.

    Args:
        merging: DataFrame with at least the columns ``gene_name``,
            ``UV_exposure_tissue`` and ``sampleID``.
        id: 0 keeps only 'Intermittently-photoexposed' rows, 1 keeps only
            'Chronically-photoexposed' rows, any other value keeps all rows.

    Returns:
        None. The chart is drawn with matplotlib and shown via ``plt.show()``.
    """
    genecount = (
        merging.groupby(by=["gene_name", "UV_exposure_tissue", "sampleID"])
        .count()
        .reset_index()
    )
    # ``elif`` matters here: with two independent ``if`` statements the
    # trailing ``else`` overwrote the id == 0 selection with the full table.
    if id == 0:
        gtype = genecount[genecount.UV_exposure_tissue == "Intermittently-photoexposed"]
    elif id == 1:
        gtype = genecount[genecount.UV_exposure_tissue == "Chronically-photoexposed"]
    else:
        gtype = genecount
    # One row per (gene, exposure, sample), so counting rows per gene gives
    # the number of gene/sample combinations.
    gtype = gtype.groupby("gene_name").count()["sampleID"].reset_index()
    gtype = gtype.sort_values(by="sampleID", ascending=False)
    # define Seaborn color palette to use
    colors = sns.color_palette('pastel')[0:len(gtype)]
    # create pie chart
    plt.pie(gtype.sampleID, labels=gtype.gene_name, colors=colors, autopct='%.0f%%', radius=2, textprops={"fontsize": 9})
    plt.show()
### Depending on what result you want you return one or another
def filterp4(dfgenes, id=0):
    """Build a plotly bar chart of ``mean_mut`` per gene.

    Args:
        dfgenes: DataFrame with the columns ``gene_name``, ``mean_mut``,
            ``std`` and ``UV_exposure_tissue``.
        id: 0 plots the 'Intermittently-photoexposed' rows and 1 the
            'Chronically-photoexposed' rows, both sorted by descending
            ``mean_mut``; 2 plots every row grouped by exposure.

    Returns:
        The plotly figure, or None for any other ``id``.
    """
    if id == 2:
        return px.bar(dfgenes, x="gene_name", y="mean_mut", color="UV_exposure_tissue", barmode='group', error_y="std")

    if id == 0:
        exposure = "Intermittently-photoexposed"
    elif id == 1:
        exposure = "Chronically-photoexposed"
    else:
        return None

    subset = dfgenes[dfgenes.UV_exposure_tissue == exposure]
    ranked = subset.sort_values(by=["mean_mut"], ascending=False)
    return px.bar(ranked, x="gene_name", y="mean_mut", error_y="std")
### Read the stored scraping results for the COSMIC ids
def read_scrap(path='my_pickle_file.pickle') -> list:
    """Load previously scraped COSMIC results from a pickle file.

    Args:
        path: Location of the pickle file. Defaults to
            'my_pickle_file.pickle' in the current working directory.

    Returns:
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)
### GendfClean
def gendfclean(cosbase, cid) -> pd.DataFrame:
    """Turn raw scraping results into a DataFrame with one row per COSMIC id.

    Args:
        cosbase: One entry per id. Each entry is a list of rows of the form
            ``[tissue, histology, zygosity, score]``, or an empty list when
            nothing was found for that id.
        cid: The COSMIC ids matching ``cosbase`` position by position; must
            provide ``.tolist()``.

    Returns:
        A DataFrame with the list-valued columns ``tissue``, ``histology``
        and ``zygosity``, a float ``score`` column (taken from the first row
        of each entry) and ``cosmic_id``. Ids without any scraped row are
        dropped; the remaining rows keep their original index labels.
    """
    fields = ("tissue", "histology", "zygosity", "score")

    def column(position):
        # An empty entry naturally yields an empty list.
        return [[row[position] for row in entry] for entry in cosbase]

    dfd = {field: column(position) for position, field in enumerate(fields)}
    dfd["cosmic_id"] = cid.tolist()
    cosmicdb = pd.DataFrame(dfd)
    # Every list column of an entry has one item per scraped row, so a single
    # length check identifies the ids without data.
    has_rows = cosmicdb["tissue"].map(len) > 0
    # Copy so the score assignment below does not write into a filtered view.
    cosmicdb = cosmicdb[has_rows].copy()
    cosmicdb["score"] = cosmicdb["score"].map(lambda scores: float(scores[0]))
    return cosmicdb
### Look for stats of a gene
def inputgene(lookforgene, merging, id=0) -> dict:
    """Describe the per-sample mutation counts of one gene.

    Args:
        lookforgene: Gene name to look up in ``merging.gene_name``.
        merging: DataFrame with the columns ``gene_name``,
            ``UV_exposure_tissue``, ``sampleID`` and ``chr``.
        id: 0 selects 'Intermittently-photoexposed' rows, any other value
            selects 'Chronically-photoexposed' rows.

    Returns:
        The ``describe()`` statistics of the per-sample ``chr`` counts as a
        dict, extended with ``gene_name`` and ``UV_exposure_tissue``.
    """
    exposure = 'Intermittently-photoexposed' if id == 0 else 'Chronically-photoexposed'

    # One row per (gene, exposure, sample); the other columns hold counts.
    per_sample = (
        merging.groupby(by=["gene_name", "UV_exposure_tissue", "sampleID"])
        .count()
        .reset_index()
    )
    selected = per_sample[per_sample.gene_name == lookforgene]
    selected = selected[selected.UV_exposure_tissue == exposure]

    ### Statistics about gene|samples
    summary = dict(selected.chr.describe())
    summary["gene_name"] = lookforgene
    summary["UV_exposure_tissue"] = exposure
    return summary
### Look for stats of all genes
def gene_exposed(merging, id=0):
    """Collect the ``inputgene`` statistics of every gene into one DataFrame.

    Args:
        merging: DataFrame passed through to ``inputgene``; its
            ``gene_name`` column supplies the genes.
        id: Exposure selector forwarded to ``inputgene``.

    Returns:
        A DataFrame with one row per unique gene name.
    """
    records = [inputgene(gene, merging, id) for gene in merging.gene_name.unique()]
    return pd.DataFrame(records)
### Merge stats for continuous and intermittently exposed
def mergecontintinfo(merging):
    """Stack the per-gene statistics of both exposure groups.

    Args:
        merging: DataFrame forwarded to ``gene_exposed``.

    Returns:
        The rows for id 1 (continuously exposed) followed by the rows for
        id 0 (intermittently exposed), concatenated along the row axis.
    """
    # Continuously exposed (1) first, then intermittently exposed (0).
    frames = [gene_exposed(merging, exposure_id) for exposure_id in (1, 0)]
    return pd.concat(frames, axis=0)
#### Common tissues, zygosities and histologies
def explodecommon(bd, N, col):
    """Return the ``N`` keys that occur in the most rows of ``bd[col]``.

    Args:
        bd: DataFrame whose ``col`` column holds dict-like objects
            (anything with ``.keys()``).
        N: Maximum number of ``(key, rows)`` pairs to return.
        col: Name of the column to inspect.

    Returns:
        A list of ``(key, number_of_rows_containing_key)`` tuples, most
        frequent first.
    """
    tally = Counter()
    for entry in bd[col]:
        # Each row contributes one count per distinct key. Counting directly
        # avoids Series.explode, which turns an empty key list into NaN and
        # would then be counted as if it were a real value.
        tally.update(entry.keys())
    return tally.most_common(N)
def pdcommon(db, col, uv: str) -> pd.DataFrame:
    """Wrap ``(value, count)`` pairs in a DataFrame labelled with an exposure.

    Args:
        db: Data accepted by ``pd.DataFrame``; positional columns 0 and 1
            are renamed to ``col`` and ``Times_<col>``.
        col: Name given to the value column.
        uv: Value written to every row of ``UV_exposure_tissue``.

    Returns:
        The renamed DataFrame with the extra ``UV_exposure_tissue`` column.
    """
    new_names = {0: col, 1: f"Times_{col}"}
    table = pd.DataFrame(db).rename(columns=new_names)
    table["UV_exposure_tissue"] = uv
    return table
def get_N_common(df, col, N=10) -> pd.DataFrame:
    """List the ``N`` most common values of ``col`` overall and per exposure.

    Args:
        df: DataFrame with ``UV_exposure_tissue`` and a ``col`` column whose
            cells are iterables of hashable values. It is not modified.
        col: Column to analyse.
        N: How many values to keep per group.

    Returns:
        The 'Total', 'Intermittently-photoexposed' and
        'Chronically-photoexposed' tables from ``pdcommon``, concatenated in
        that order.
    """
    cosm = df.copy(True)
    cosm[col] = cosm[col].apply(Counter)

    groups = (
        ("Total", cosm),
        ("Intermittently-photoexposed", cosm[cosm.UV_exposure_tissue == "Intermittently-photoexposed"]),
        ("Chronically-photoexposed", cosm[cosm.UV_exposure_tissue == "Chronically-photoexposed"]),
    )
    tables = [
        pdcommon(explodecommon(subset, N, col), col, label)
        for label, subset in groups
    ]
    return pd.concat(tables, axis=0)
### Detailed information of mutation type
def mut_type(x):
    """Return a detailed mutation label for one mutation record.

    Args:
        x: Object exposing ``mut_type``, ``ref`` and ``mut`` attributes
            (for example a DataFrame row).

    Returns:
        "Del" for an indel whose reference is longer than the mutated
        sequence, "In" for an indel whose mutated sequence is longer, and
        ``"<ref>><mut>"`` in every other case.
    """
    # NOTE(review): the indentation of this function was lost in the source
    # view. This follows the reading in which the "ref>mut" return sits at
    # function level, which made the old trailing ``return x.mut_type``
    # unreachable -- confirm against the original file.
    if x.mut_type == "Indel":
        ref_len, mut_len = len(x.ref), len(x.mut)
        if ref_len > mut_len:
            return "Del"
        if mut_len > ref_len:
            return "In"
    return x.ref + ">" + x.mut
def distribution_gene(df,hue):
plot4 = df.groupby([hue,"mut_type_cus"]).count().reset_index().iloc[:,:3]
plot4 = plot4.rename(columns={"sampleID":"n_mut"})
plot4 = plot4.sort_values(by="mut_type_cus",ascending=True)
fig = px.bar(plot4,x="mut_type_cus",y="n_mut",color=hue,barmode="group")
return fig