Spaces:

Rules99
/

Bioinformatics_Project

Runtime error

App Files Files Community

Bioinformatics_Project / functions.py

Rules99

Bioinformatics project

9f8cc36 over 3 years ago

raw

history blame

9.12 kB

	import numpy as np
	import pickle
	import pandas as pd
	import requests
	from selenium import webdriver
	import matplotlib.pyplot as plt
	#Simple assignment
	from selenium.webdriver import Firefox
	from selenium.webdriver.common.keys import Keys
	from selenium.common.exceptions import NoSuchElementException
	import requests
	import os
	import seaborn as sns
	from collections import Counter
	import plotly.express as px
	import streamlit as st



	### Scrape the COSMIC id information
	# ### FRAMEWORKS NEEDED

	def scrap():
	    """Scrape the COSMIC website for every mutation id in ``cosmicinfo``.

	    A headless Firefox session is started, the COSMIC search box is used for
	    each value of the ``COSMIC_ID`` column (from position 20 onwards; when a
	    value holds several comma-separated ids only the first is searched) and
	    the rows of the result table are collected.

	    ``cosmicinfo`` is not defined in this function: it is looked up as a
	    free variable and must exist in the surrounding scope when this runs.

	    Returns:
	        list: one entry per searched id. Each entry is a list of rows, a row
	        being the texts of the 'Primary Tissue', 'Primary Histology' and
	        'Zygosity' cells (in table order) followed by the score; the entry
	        is ``[]`` when the result page has no "section-list" element.
	    """
	    import time

	    #### Setting options to the driver
	    options = webdriver.FirefoxOptions()
	    options.add_argument('--headless')
	    options.add_argument('--no-sandbox')
	    options.add_argument('--disable-dev-shm-usage')
	    # The geckodriver location is machine specific.
	    browser = Firefox(options=options,executable_path=r"C:\Users\Pablo\OneDrive\Documents\Documentos\Escuela Politécnica Superior Leganés\4 AÑO\ASIGNATURAS\1 CUATRI\WEB ANALYTICS\PART 2\Milestone3\geckodriver.exe")

	    def getinfofromtable(oddrows:list,score:float,headertable)->list:
	        """Return, for each table row, the wanted cell texts followed by *score*."""
	        rows = []
	        for row in oddrows:
	            cols = []
	            for (i,col) in enumerate(row.find_elements_by_css_selector("td")):
	                if i==headertable.index( 'Primary Tissue') or i==headertable.index('Primary Histology') or i==headertable.index('Zygosity'):
	                    cols.append(col.text)
	            cols.append(score)
	            rows.append(cols)
	        return rows

	    def getinfocosmic(mutationid):
	        """Search *mutationid* on the page currently loaded and parse the result."""
	        search = browser.find_element_by_id('search-field')
	        search = search.find_element_by_class_name("text_def")
	        search.send_keys(mutationid)
	        search.send_keys(Keys.RETURN)
	        # Fixed pause to let the result page load before it is inspected.
	        time.sleep(5)
	        try:
	            container = browser.find_element_by_id("section-list")
	        except NoSuchElementException:
	            return []

	        # The score is the text between the word "score" and the next ")".
	        try:
	            text = container.text
	            subq1 = text[text.find("score")+len("score"):]
	            score = float(subq1[:subq1.find(")")].strip())
	        except ValueError:
	            score = 0

	        section = browser.find_element_by_id("DataTables_Table_0")
	        headertable = [header.text for header in section.find_element_by_tag_name("thead").find_elements_by_tag_name("th")]

	        oddrows = section.find_elements_by_class_name("odd")
	        evenrows = section.find_elements_by_class_name("even")

	        l1 = getinfofromtable(oddrows,score,headertable)
	        l1.extend(getinfofromtable(evenrows,score,headertable))
	        return l1

	    ## Looking for cosmic id info
	    cosl = []
	    try:
	        browser.get("https://cancer.sanger.ac.uk/cosmic")
	        for cos in cosmicinfo.reset_index()["COSMIC_ID"].iloc[20:]:
	            # Keep only the first id of a comma-separated list.
	            cos = cos.split(",")[0]
	            cosl.append(getinfocosmic(cos))
	            # Go back to the landing page so the search box is there again.
	            browser.get("https://cancer.sanger.ac.uk/cosmic")
	    finally:
	        # Always shut the browser down, even when a lookup fails.
	        browser.quit()
	    return cosl
	### Pieplots
	def pieplot(merging,id=0):
	    """Draw a pie chart of how many samples carry each gene.

	    Args:
	        merging: DataFrame with at least the columns ``gene_name``,
	            ``UV_exposure_tissue`` and ``sampleID``.
	        id: 0 keeps only 'Intermittently-photoexposed' rows, 1 keeps only
	            'Chronically-photoexposed' rows, any other value keeps all rows.

	    The chart is shown with ``plt.show()``; nothing is returned.
	    """
	    genecount = merging.groupby(by=["gene_name","UV_exposure_tissue","sampleID"]).count().reset_index()
	    # One if/elif/else chain: with two separate ``if`` statements the final
	    # ``else`` also ran for id == 0 and discarded that selection.
	    if id == 0:
	        gtype = genecount[genecount.UV_exposure_tissue=="Intermittently-photoexposed"]
	    elif id == 1:
	        gtype = genecount[genecount.UV_exposure_tissue=="Chronically-photoexposed"]
	    else:
	        gtype = genecount

	    # Number of (exposure, sample) rows per gene, largest first.
	    gtype = gtype.groupby("gene_name").count()["sampleID"].reset_index()
	    gtype = gtype.sort_values(by="sampleID",ascending=False)
	    # Seaborn colour palette, one colour per slice while the palette lasts
	    colors = sns.color_palette('pastel')[0:len(gtype)]
	    plt.pie(gtype.sampleID, labels =gtype.gene_name, colors = colors, autopct='%.0f%%',radius=2,textprops={"fontsize":9})
	    plt.show()

	### Depending on what result you want you return one or another
	def filterp4(dfgenes,id=0):
	    """Return a plotly bar chart of the mean mutations per gene.

	    Args:
	        dfgenes: DataFrame with the columns ``gene_name``, ``mean_mut``,
	            ``std`` and ``UV_exposure_tissue``.
	        id: 0 plots the 'Intermittently-photoexposed' rows, 1 plots the
	            'Chronically-photoexposed' rows (both sorted by descending
	            ``mean_mut``), 2 plots every row grouped by exposure.

	    Returns:
	        The ``px.bar`` figure, or ``None`` for any other *id*.
	    """
	    if id==2:
	        return px.bar(dfgenes,x="gene_name",y="mean_mut",color="UV_exposure_tissue",barmode='group',error_y="std")
	    if not (id==0 or id==1):
	        return None

	    exposure = "Intermittently-photoexposed" if id==0 else "Chronically-photoexposed"
	    selected = dfgenes[dfgenes.UV_exposure_tissue==exposure]
	    ranked = selected.sort_values(by=["mean_mut"],ascending=False)
	    return px.bar(ranked,x="gene_name",y="mean_mut",error_y="std")

	### Read the scraping results stored for the COSMIC ids
	def read_scrap(path='my_pickle_file.pickle')->list:
	    """Load the pickled scraping results.

	    Args:
	        path: location of the pickle file; defaults to
	            'my_pickle_file.pickle' in the current working directory.

	    Returns:
	        The object stored in the file.
	    """
	    # NOTE(security): unpickling can execute arbitrary code, so only point
	    # this at files produced by a trusted run of the scraper.
	    with open(path, 'rb') as f:
	        return pickle.load(f)
	### GendfClean
	def gendfclean(cosbase,cid)->pd.DataFrame:
	    """Turn the scraped rows into one DataFrame row per COSMIC id.

	    Args:
	        cosbase: one entry per COSMIC id; each entry is a list of rows whose
	            items 0-3 are read as tissue, histology, zygosity and score
	            (``[]`` when nothing was scraped for that id).
	        cid: object with a ``tolist()`` method giving one id per entry of
	            *cosbase*.

	    Returns:
	        DataFrame with list-valued ``tissue``, ``histology`` and
	        ``zygosity`` columns, a float ``score`` column (taken from the first
	        row of each entry) and ``cosmic_id``. Ids without scraped rows are
	        dropped.
	    """
	    fields = ("tissue", "histology", "zygosity", "score")
	    # Column ``pos`` of every row, collected per id; an empty entry gives [].
	    dfd = {
	        field: [[row[pos] for row in rows] for rows in cosbase]
	        for pos, field in enumerate(fields)
	    }
	    dfd["cosmic_id"] = cid.tolist()
	    cosmicdb = pd.DataFrame(dfd)

	    # All four lists of an id have one item per scraped row, so a single
	    # length check decides whether the id has any data.
	    has_rows = np.array([len(rows) > 0 for rows in cosbase], dtype=bool)
	    # Explicit copy: the score column is overwritten just below.
	    cosmicdb = cosmicdb.loc[has_rows].copy()

	    cosmicdb["score"] = cosmicdb["score"].map(lambda scores: float(scores[0]))
	    return cosmicdb

	### Look for stats of a gene
	def inputgene(lookforgene,merging,id =0)->dict:
	    """Summarise the per-sample row counts of one gene for one exposure.

	    Args:
	        lookforgene: value of ``gene_name`` to look for.
	        merging: DataFrame with the columns ``gene_name``,
	            ``UV_exposure_tissue``, ``sampleID`` and ``chr``.
	        id: 0 selects 'Intermittently-photoexposed', any other value
	            selects 'Chronically-photoexposed'.

	    Returns:
	        dict with the ``describe()`` statistics of the per-sample ``chr``
	        counts plus the keys ``gene_name`` and ``UV_exposure_tissue``.
	    """
	    exposure = 'Intermittently-photoexposed' if id==0 else 'Chronically-photoexposed'

	    # Rows per (gene, exposure, sample); every other column holds a count.
	    per_sample = merging.groupby(by=["gene_name","UV_exposure_tissue","sampleID"]).count().reset_index()
	    of_gene = per_sample[per_sample.gene_name==lookforgene]
	    of_exposure = of_gene[of_gene.UV_exposure_tissue==exposure]

	    summary = dict(of_exposure.chr.describe())
	    summary["gene_name"] = lookforgene
	    summary["UV_exposure_tissue"] = exposure
	    return summary
	### Look for stats of all genes
	def gene_exposed(merging,id=0):
	    """Build a DataFrame with one ``inputgene`` summary per distinct gene.

	    Args:
	        merging: DataFrame passed through to ``inputgene``; its
	            ``gene_name`` column supplies the genes.
	        id: exposure selector passed through to ``inputgene``.
	    """
	    summaries = [inputgene(gene,merging,id) for gene in merging.gene_name.unique()]
	    return pd.DataFrame(summaries)
	### Merge stats for continuous and intermittently exposed
	def mergecontintinfo(merging):
	    """Stack the per-gene summaries of both exposure groups.

	    The rows produced by ``gene_exposed(merging, 1)`` come first, followed
	    by the rows of ``gene_exposed(merging, 0)``.
	    """
	    # 1 -> chronically exposed, 0 -> intermittently exposed
	    frames = [gene_exposed(merging,exposure_id) for exposure_id in (1, 0)]
	    return pd.concat(frames,axis=0)

	#### Common tissues, zygosities and histologies
	def explodecommon(bd,N,col):
	    """Return the *N* keys that occur in the most rows of ``bd[col]``.

	    Args:
	        bd: DataFrame whose column *col* holds mapping objects.
	        N: number of entries to return (passed to ``Counter.most_common``).
	        col: name of the column to inspect.

	    Returns:
	        list of ``(key, number_of_rows_containing_key)`` tuples, most
	        frequent first. Rows holding an empty mapping contribute nothing.
	    """
	    # Each mapping contributes each of its keys once. Counting the keys
	    # directly, instead of exploding a column of key lists, avoids the NaN
	    # placeholder that explode() emits for an empty list.
	    occurrences = Counter(key for mapping in bd[col] for key in mapping.keys())
	    return occurrences.most_common(N)
	def pdcommon(db,col,uv:str)->pd.DataFrame:
	    """Wrap *db* in a DataFrame labelled with an exposure group.

	    Args:
	        db: data accepted by ``pd.DataFrame``; columns 0 and 1 are renamed.
	        col: new name for column 0; column 1 becomes ``Times_<col>``.
	        uv: value written to every row of ``UV_exposure_tissue``.
	    """
	    new_names = {0: col, 1: "Times_{}".format(col)}
	    table = pd.DataFrame(db).rename(columns=new_names)
	    return table.assign(UV_exposure_tissue=uv)
	def get_N_common(df,col,N=10)->pd.DataFrame:
	    """Most common values of *col* overall and per exposure group.

	    Args:
	        df: DataFrame with a ``UV_exposure_tissue`` column and a column
	            *col* whose cells can be passed to ``Counter``.
	        col: column to analyse.
	        N: how many entries to keep per group.

	    Returns:
	        The ``pdcommon`` tables for "Total", "Intermittently-photoexposed"
	        and "Chronically-photoexposed", concatenated in that order.
	    """
	    # Work on a deep copy so the caller's frame keeps its original cells.
	    counted = df.copy(True)
	    counted[col] = counted[col].apply(lambda values: Counter(values))

	    groups = (
	        ("Total", counted),
	        ("Intermittently-photoexposed", counted[counted.UV_exposure_tissue=="Intermittently-photoexposed"]),
	        ("Chronically-photoexposed", counted[counted.UV_exposure_tissue=="Chronically-photoexposed"]),
	    )
	    tables = [pdcommon(explodecommon(subset,N,col),col,label) for label, subset in groups]
	    return pd.concat(tables,axis=0)

	### Detailed information of mutation type
	def mut_type(x):
	    """Return a finer-grained mutation label for one record.

	    Args:
	        x: object with the attributes ``mut_type``, ``ref`` and ``mut``.

	    Returns:
	        For an "Indel": "Del" when ``ref`` is longer than ``mut``, "In"
	        when ``mut`` is longer than ``ref``, otherwise ``"<ref>><mut>"``.
	        For anything else, ``x.mut_type`` unchanged.
	    """
	    if x.mut_type!="Indel":
	        return x.mut_type

	    ref_len, mut_len = len(x.ref), len(x.mut)
	    if ref_len>mut_len:
	        return "Del"
	    if mut_len>ref_len:
	        return "In"
	    # Same length on both sides: spell the change out.
	    return x.ref+">"+x.mut


	def distribution_gene(df,hue):
	    """Grouped bar chart of the number of mutations per custom mutation type.

	    Args:
	        df: DataFrame with the columns ``mut_type_cus``, ``sampleID`` and
	            the column named by *hue*.
	        hue: column used to split and colour the bars.

	    Returns:
	        The ``px.bar`` figure with ``mut_type_cus`` on the x axis and the
	        count of ``sampleID`` values (``n_mut``) on the y axis.
	    """
	    # Count sampleID by name rather than by position, so the result does
	    # not depend on sampleID being the first non-key column of df.
	    plot4 = (
	        df.groupby([hue,"mut_type_cus"])["sampleID"]
	        .count()
	        .reset_index(name="n_mut")
	        .sort_values(by="mut_type_cus",ascending=True)
	    )
	    return px.bar(plot4,x="mut_type_cus",y="n_mut",color=hue,barmode="group")