"""Scrape COSMIC mutation pages and build statistics/plots from the results.

The module has two halves:

* ``scrap`` drives a headless Firefox through https://cancer.sanger.ac.uk/cosmic
  and collects, per COSMIC id, the tissue / histology / zygosity cells of the
  result table together with the score shown on the page.
* The remaining functions turn those scraped rows and a mutation table into
  pandas frames and plotly / matplotlib figures.
"""
import os
import pickle
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
import streamlit as st
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys

_COSMIC_URL = "https://cancer.sanger.ac.uk/cosmic"
_GECKODRIVER_PATH = r"C:\Users\Pablo\OneDrive\Documents\Documentos\Escuela Politécnica Superior Leganés\4 AÑO\ASIGNATURAS\1 CUATRI\WEB ANALYTICS\PART 2\Milestone3\geckodriver.exe"

# Values of the ``UV_exposure_tissue`` column used throughout the module.
_INTERMITTENT = "Intermittently-photoexposed"
_CHRONIC = "Chronically-photoexposed"

# Result-table headers whose cells are kept by the scraper.
_TABLE_COLUMNS = ("Primary Tissue", "Primary Histology", "Zygosity")

# Column names given to the scraped values by ``gendfclean``.
_COSMIC_FIELDS = ("tissue", "histology", "zygosity", "score")

_GROUP_KEYS = ["gene_name", "UV_exposure_tissue", "sampleID"]


def _parse_score(text):
    """Return the number that follows the first "score" in *text*, up to ")".

    Returns 0 when the word "score" is absent or what follows is not a number.
    """
    _, marker, tail = text.partition("score")
    if not marker:
        return 0
    try:
        return float(tail.partition(")")[0].strip())
    except ValueError:
        return 0


### Scrape the COSMIC id information
def scrap(cosmic_ids=None, driver_path=_GECKODRIVER_PATH):
    """Scrape the COSMIC site for every id in *cosmic_ids*.

    Args:
        cosmic_ids: Iterable of COSMIC id strings. When an entry contains
            commas only the part before the first comma is searched. When
            omitted, the ids are read from a module-level ``cosmicinfo``
            frame (``COSMIC_ID`` column, skipping the first 20 rows), exactly
            as the original code did.
            NOTE(review): ``cosmicinfo`` is not defined in the visible part
            of this file - confirm where it comes from or always pass
            ``cosmic_ids``.
        driver_path: Location of the geckodriver executable.

    Returns:
        A list with one entry per id; each entry is a list of rows
        ``[<kept cells in table order>..., score]`` or ``[]`` when the page
        has no "section-list" element.
    """
    if cosmic_ids is None:
        # Resolved before the browser starts so a missing global fails fast.
        cosmic_ids = cosmicinfo.reset_index()["COSMIC_ID"].iloc[20:]  # noqa: F821

    #### Setting options of the driver
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # The file uses the Selenium 3 API (``executable_path``, ``find_element_by_*``).
    browser = Firefox(options=options, executable_path=driver_path)

    def getinfofromtable(oddrows: list, score: float, headertable) -> list:
        """Extract the wanted cells of every row and append *score* to each.

        Raises:
            ValueError: if one of the expected headers is missing from
                *headertable* (only checked when there are rows to read).
        """
        if not oddrows:
            return []
        # Looked up once instead of once per cell.
        wanted = {headertable.index(name) for name in _TABLE_COLUMNS}
        rows = []
        for row in oddrows:
            cells = row.find_elements_by_css_selector("td")
            cols = [cell.text for i, cell in enumerate(cells) if i in wanted]
            cols.append(score)
            rows.append(cols)
        return rows

    def getinfocosmic(mutationid):
        """Search *mutationid* on the current page and return its table rows."""
        search = browser.find_element_by_id("search-field")
        search = search.find_element_by_class_name("text_def")
        search.send_keys(mutationid)
        search.send_keys(Keys.RETURN)
        time.sleep(5)  # fixed wait for the result page to render
        try:
            container = browser.find_element_by_id("section-list")
        except NoSuchElementException:
            return []
        score = _parse_score(container.text)
        section = browser.find_element_by_id("DataTables_Table_0")
        thead = section.find_element_by_tag_name("thead")
        headertable = [
            header.text for header in thead.find_elements_by_tag_name("th")
        ]
        oddrows = section.find_elements_by_class_name("odd")
        evenrows = section.find_elements_by_class_name("even")
        found = getinfofromtable(oddrows, score, headertable)
        found.extend(getinfofromtable(evenrows, score, headertable))
        return found

    ## Looking for COSMIC id info
    cosl = []
    try:
        browser.get(_COSMIC_URL)
        for cos in cosmic_ids:
            cosl.append(getinfocosmic(cos.split(",", 1)[0]))
            # Go back to the landing page so the search field is available.
            browser.get(_COSMIC_URL)
    finally:
        # Always release the browser process, even when scraping fails.
        browser.quit()
    return cosl


### Pie plots
def pieplot(merging, id=0):
    """Show a pie chart of how many (tissue, sample) groups each gene has.

    Args:
        merging: Frame with at least ``gene_name``, ``UV_exposure_tissue``
            and ``sampleID`` columns.
        id: 0 keeps intermittently photoexposed rows, 1 keeps chronically
            photoexposed rows, any other value keeps everything.
    """
    genecount = merging.groupby(by=_GROUP_KEYS).count().reset_index()
    # ``elif`` matters: with two independent ``if`` statements the trailing
    # ``else`` used to overwrite the id == 0 selection with the full frame.
    if id == 0:
        gtype = genecount[genecount.UV_exposure_tissue == _INTERMITTENT]
    elif id == 1:
        gtype = genecount[genecount.UV_exposure_tissue == _CHRONIC]
    else:
        gtype = genecount
    gtype = gtype.groupby("gene_name").count()["sampleID"].reset_index()
    gtype = gtype.sort_values(by="sampleID", ascending=False)
    # Seaborn colour palette, truncated to the number of slices.
    colors = sns.color_palette("pastel")[0:len(gtype)]
    plt.pie(
        gtype.sampleID,
        labels=gtype.gene_name,
        colors=colors,
        autopct="%.0f%%",
        radius=2,
        textprops={"fontsize": 9},
    )
    plt.show()


### Depending on what result you want you return one figure or another
def filterp4(dfgenes, id=0):
    """Return a bar chart of ``mean_mut`` per gene with ``std`` error bars.

    Args:
        dfgenes: Frame with ``gene_name``, ``mean_mut``, ``std`` and
            ``UV_exposure_tissue`` columns.
        id: 0 -> intermittently photoexposed only, 1 -> chronically
            photoexposed only (both sorted by ``mean_mut`` descending),
            2 -> both tissues as grouped bars.

    Returns:
        The plotly figure, or ``None`` for any other *id*.
    """
    if id == 2:
        return px.bar(
            dfgenes,
            x="gene_name",
            y="mean_mut",
            color="UV_exposure_tissue",
            barmode="group",
            error_y="std",
        )
    if id == 0:
        exposure = _INTERMITTENT
    elif id == 1:
        exposure = _CHRONIC
    else:
        return None
    chexposed = dfgenes[dfgenes.UV_exposure_tissue == exposure].sort_values(
        by=["mean_mut"], ascending=False
    )
    return px.bar(chexposed, x="gene_name", y="mean_mut", error_y="std")


### Read the scraping previously done with COSMIC ids
def read_scrap() -> list:
    """Load the pickled scraping result from ``my_pickle_file.pickle``."""
    # NOTE: pickle executes arbitrary code on load; only read files that
    # were produced by this project.
    with open("my_pickle_file.pickle", "rb") as f:
        cosbase = pickle.load(f)
    return cosbase


### gendfclean
def gendfclean(cosbase, cid) -> pd.DataFrame:
    """Turn the scraped rows into one frame row per COSMIC id.

    Args:
        cosbase: List with one entry per id; each entry is a list of
            ``[tissue, histology, zygosity, score]`` rows, or ``[]``.
        cid: Object with ``tolist()`` giving the id of every entry.

    Returns:
        Frame with list-valued ``tissue``, ``histology`` and ``zygosity``
        columns, a float ``score`` (first score of the entry) and
        ``cosmic_id``. Entries without scraped rows are dropped.
    """
    # Convert every entry once instead of once per output column.
    tables = [np.array(rows) if rows != [] else None for rows in cosbase]
    dfd = {
        name: [[] if table is None else table[:, i].tolist() for table in tables]
        for i, name in enumerate(_COSMIC_FIELDS)
    }
    dfd["cosmic_id"] = cid.tolist()
    cosmicdb = pd.DataFrame(dfd)

    keep = pd.Series(True, index=cosmicdb.index)
    for name in _COSMIC_FIELDS:
        keep &= cosmicdb[name].map(len) > 0
    # ``copy`` so the assignment below does not write into a view.
    cosmicdb = cosmicdb[keep].copy()
    cosmicdb["score"] = cosmicdb["score"].apply(lambda x: float(x[0]))
    return cosmicdb


def _count_per_sample(merging):
    """Count rows per (gene_name, UV_exposure_tissue, sampleID) group."""
    return merging.groupby(by=_GROUP_KEYS).count().reset_index()


def _gene_stats(genecount, lookforgene, id):
    """Describe the ``chr`` counts of one gene in an already grouped frame."""
    exposure = _INTERMITTENT if id == 0 else _CHRONIC
    selected = genecount[
        (genecount.gene_name == lookforgene)
        & (genecount.UV_exposure_tissue == exposure)
    ]
    ### Statistics about gene|samples
    dc = dict(selected["chr"].describe())
    dc["gene_name"] = lookforgene
    dc["UV_exposure_tissue"] = exposure
    return dc


### Look for stats of a gene
def inputgene(lookforgene, merging, id=0) -> dict:
    """Return ``describe()`` statistics of the per-sample counts of a gene.

    Args:
        lookforgene: Value of ``gene_name`` to select.
        merging: Frame with ``gene_name``, ``UV_exposure_tissue``,
            ``sampleID`` and ``chr`` columns.
        id: 0 -> intermittently photoexposed, anything else -> chronically
            photoexposed.

    Returns:
        Dict with the ``describe()`` entries plus ``gene_name`` and
        ``UV_exposure_tissue``.
    """
    return _gene_stats(_count_per_sample(merging), lookforgene, id)


### Look for stats of all genes
def gene_exposed(merging, id=0):
    """Return a frame with the ``inputgene`` statistics of every gene."""
    # Group once; calling ``inputgene`` per gene would regroup every time.
    genecount = _count_per_sample(merging)
    return pd.DataFrame(
        [_gene_stats(genecount, gene, id) for gene in merging.gene_name.unique()]
    )


### Merge stats for chronically and intermittently exposed tissue
def mergecontintinfo(merging):
    """Stack the chronic statistics on top of the intermittent ones."""
    cont_exposed_info = gene_exposed(merging, 1)
    int_exposed_info = gene_exposed(merging, 0)
    return pd.concat([cont_exposed_info, int_exposed_info], axis=0)


#### Common tissues, zygosities and histologies
def explodecommon(bd, N, col):
    """Return the *N* most common keys of the mappings stored in ``bd[col]``.

    Every key is counted once per row, however large its value in that row.
    """
    keys_per_row = bd[col].apply(lambda x: list(x.keys()))
    return Counter(keys_per_row.explode()).most_common(N)


def pdcommon(db, col, uv: str) -> pd.DataFrame:
    """Build a ``[col, Times_<col>, UV_exposure_tissue]`` frame from pairs.

    Args:
        db: Sequence of ``(value, count)`` pairs.
        col: Name for the value column.
        uv: Constant written to the ``UV_exposure_tissue`` column.
    """
    df = pd.DataFrame(db).rename(columns={0: col, 1: f"Times_{col}"})
    df["UV_exposure_tissue"] = uv
    return df


def get_N_common(df, col, N=10) -> pd.DataFrame:
    """Return the *N* most common values of *col* overall and per tissue.

    Args:
        df: Frame whose *col* holds an iterable per row and which has a
            ``UV_exposure_tissue`` column. It is not modified.
        col: Column to analyse.
        N: How many values to keep for each of the three groups.

    Returns:
        The "Total", intermittent and chronic results stacked in that order.
    """
    cosm = df.copy(True)
    cosm[col] = cosm[col].apply(Counter)
    subsets = (
        ("Total", cosm),
        (_INTERMITTENT, cosm[cosm.UV_exposure_tissue == _INTERMITTENT]),
        (_CHRONIC, cosm[cosm.UV_exposure_tissue == _CHRONIC]),
    )
    frames = [
        pdcommon(explodecommon(subset, N, col), col, label)
        for label, subset in subsets
    ]
    return pd.concat(frames, axis=0)


### Detailed information of mutation type
def mut_type(x):
    """Refine an "Indel" row into "Del" or "In" by comparing sequence lengths.

    Args:
        x: Row-like object with ``mut_type``, ``ref`` and ``mut`` attributes.

    Returns:
        "Del" when ``ref`` is longer than ``mut``, "In" when it is shorter,
        otherwise the unchanged ``x.mut_type``.
    """
    if x.mut_type == "Indel":
        if len(x.ref) > len(x.mut):
            return "Del"
        if len(x.mut) > len(x.ref):
            return "In"
    return x.mut_type


def distribution_gene(df, hue):
    """Return a grouped bar chart of mutation counts per ``mut_type_cus``.

    Args:
        df: Frame with ``mut_type_cus``, ``sampleID`` and *hue* columns.
        hue: Column used to colour and group the bars.
    """
    # Count ``sampleID`` explicitly rather than relying on it being the
    # first non-key column of the grouped frame.
    plot4 = (
        df.groupby([hue, "mut_type_cus"])["sampleID"]
        .count()
        .reset_index(name="n_mut")
        .sort_values(by="mut_type_cus", ascending=True)
    )
    return px.bar(plot4, x="mut_type_cus", y="n_mut", color=hue, barmode="group")