Spaces:
Runtime error
Runtime error
import numpy as np | |
import pickle | |
import pandas as pd | |
import requests | |
from selenium import webdriver | |
import matplotlib.pyplot as plt | |
#Simple assignment | |
from selenium.webdriver import Firefox | |
from selenium.webdriver.common.keys import Keys | |
from selenium.common.exceptions import NoSuchElementException | |
import requests | |
import os | |
import seaborn as sns | |
from collections import Counter | |
import plotly.express as px | |
import streamlit as st | |
### Scrape the COSMIC id information
# ### FRAMEWORKS NEEDED | |
def scrap():
    """Scrape the COSMIC website for every mutation id and return the rows found.

    A headless Firefox session opens the COSMIC home page, searches one
    mutation id at a time and reads the first results table. For every table
    row only the 'Primary Tissue', 'Primary Histology' and 'Zygosity' cells are
    kept (in table order) and the score parsed from the results page is
    appended to the row.

    NOTE(review): this function reads a ``cosmicinfo`` DataFrame with a
    ``COSMIC_ID`` column that is not defined in the visible part of the file;
    confirm it exists at module level before calling.

    Returns:
        list: one entry per scraped id. Each entry is a list of rows
        (``[cell_text, ..., score]``), or an empty list when the results page
        has no ``section-list`` element.
    """
    import time

    # The ``find_element_by_*`` helpers were removed in Selenium 4.3; the
    # ``find_element(By..., ...)`` form works on both Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    cosmic_url = "https://cancer.sanger.ac.uk/cosmic"
    wanted_headers = ('Primary Tissue', 'Primary Histology', 'Zygosity')

    # Run Firefox without a visible window.
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # NOTE(review): hard-coded, machine-specific geckodriver path; the
    # ``executable_path`` keyword is deprecated in Selenium 4 -- confirm the
    # Selenium version this file is meant to run against.
    browser = Firefox(options=options,executable_path=r"C:\Users\Pablo\OneDrive\Documents\Documentos\Escuela Politécnica Superior Leganés\4 AÑO\ASIGNATURAS\1 CUATRI\WEB ANALYTICS\PART 2\Milestone3\geckodriver.exe")

    def getinfofromtable(oddrows: list, score: float, headertable) -> list:
        """Keep the wanted cells of each table row and append the score."""
        if not oddrows:
            return []
        # Resolve the column positions once instead of once per cell.
        wanted_positions = {headertable.index(name) for name in wanted_headers}
        rows = []
        for row in oddrows:
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            cols = [cell.text for i, cell in enumerate(cells) if i in wanted_positions]
            cols.append(score)
            rows.append(cols)
        return rows

    def getinfocosmic(mutationid):
        """Search one mutation id and return the rows of its results table."""
        search = browser.find_element(By.ID, 'search-field')
        search = search.find_element(By.CLASS_NAME, "text_def")
        search.send_keys(mutationid)
        search.send_keys(Keys.RETURN)
        # Fixed pause to let the results page load.
        time.sleep(5)
        try:
            container = browser.find_element(By.ID, "section-list")
        except NoSuchElementException:
            return []
        page_text = container.text
        try:
            # The score is the text between the word "score" and the next ")".
            after_score = page_text[page_text.find("score") + len("score"):]
            score = float(after_score[:after_score.find(")")].strip())
        except ValueError:
            score = 0
        section = browser.find_element(By.ID, "DataTables_Table_0")
        thead = section.find_element(By.TAG_NAME, "thead")
        headertable = [header.text for header in thead.find_elements(By.TAG_NAME, "th")]
        oddrows = section.find_elements(By.CLASS_NAME, "odd")
        evenrows = section.find_elements(By.CLASS_NAME, "even")
        found = getinfofromtable(oddrows, score, headertable)
        found.extend(getinfofromtable(evenrows, score, headertable))
        return found

    cosl = []
    try:
        browser.get(cosmic_url)
        # NOTE(review): the first 20 ids are skipped; the reason is not
        # visible here (presumably resuming an earlier run) -- confirm.
        for cos in cosmicinfo.reset_index()["COSMIC_ID"].iloc[20:]:
            # When several ids are comma-separated only the first is searched.
            cos = cos.split(",")[0]
            cosl.append(getinfocosmic(cos))
            # Go back to the home page so the search field is available again.
            browser.get(cosmic_url)
    finally:
        # Always shut the driver down so no Firefox process is left behind.
        browser.quit()
    return cosl
### Pieplots | |
def pieplot(merging, id=0):
    """Show a pie chart of how often each gene appears across samples.

    The input is first collapsed to one row per
    (gene_name, UV_exposure_tissue, sampleID) combination; the pie slices are
    the number of such rows per gene.

    Args:
        merging: DataFrame with at least the columns ``gene_name``,
            ``UV_exposure_tissue`` and ``sampleID``.
        id: 0 keeps only 'Intermittently-photoexposed' rows, 1 keeps only
            'Chronically-photoexposed' rows, any other value keeps everything.
    """
    genecount = merging.groupby(by=["gene_name", "UV_exposure_tissue", "sampleID"]).count().reset_index()
    # ``elif`` is required here: with two independent ``if`` statements the
    # trailing ``else`` overwrote the id == 0 selection with the full table.
    if id == 0:
        gtype = genecount[genecount.UV_exposure_tissue == "Intermittently-photoexposed"]
    elif id == 1:
        gtype = genecount[genecount.UV_exposure_tissue == "Chronically-photoexposed"]
    else:
        gtype = genecount
    gtype = gtype.groupby("gene_name").count()["sampleID"].reset_index()
    gtype = gtype.sort_values(by="sampleID", ascending=False)
    # Seaborn pastel palette, truncated to the number of slices.
    colors = sns.color_palette('pastel')[0:len(gtype)]
    plt.pie(gtype.sampleID, labels=gtype.gene_name, colors=colors, autopct='%.0f%%', radius=2, textprops={"fontsize": 9})
    plt.show()
### Depending on what result you want you return one or another | |
def filterp4(dfgenes, id=0):
    """Build the bar chart of mean mutations per gene for the requested view.

    Args:
        dfgenes: DataFrame with the columns ``gene_name``, ``mean_mut``,
            ``std`` and ``UV_exposure_tissue``.
        id: 0 -> intermittently photo-exposed genes only, 1 -> chronically
            photo-exposed genes only (both sorted by descending ``mean_mut``),
            2 -> every gene, grouped and coloured by exposure type.

    Returns:
        The plotly figure, or ``None`` when ``id`` is none of 0, 1 or 2.
    """
    single_exposure = None
    if id == 0:
        single_exposure = "Intermittently-photoexposed"
    elif id == 1:
        single_exposure = "Chronically-photoexposed"

    if single_exposure is not None:
        only_this = dfgenes[dfgenes.UV_exposure_tissue == single_exposure]
        ranked = only_this.sort_values(by=["mean_mut"], ascending=False)
        return px.bar(ranked, x="gene_name", y="mean_mut", error_y="std")

    if id == 2:
        return px.bar(
            dfgenes,
            x="gene_name",
            y="mean_mut",
            color="UV_exposure_tissue",
            barmode='group',
            error_y="std",
        )
    return None
### Read the scraping results stored for the COSMIC ids
def read_scrap() -> list:
    """Load and return the object pickled in ``my_pickle_file.pickle``.

    The file is looked up relative to the current working directory.
    NOTE(review): unpickling executes arbitrary code; only load a file that
    was produced by a trusted run.
    """
    with open('my_pickle_file.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)
### GendfClean | |
def gendfclean(cosbase, cid) -> pd.DataFrame:
    """Turn the raw scraping result into one DataFrame row per COSMIC id.

    Args:
        cosbase: list with one entry per id; each entry is a (possibly empty)
            list of rows whose positions 0..3 are read as tissue, histology,
            zygosity and score.
        cid: the COSMIC ids, in the same order as ``cosbase``; must provide
            ``.tolist()``.

    Returns:
        DataFrame with the list-valued columns ``tissue``, ``histology`` and
        ``zygosity``, a float ``score`` (the score of the first row of the
        entry) and ``cosmic_id``. Ids without any scraped row are dropped.
    """
    columns = ("tissue", "histology", "zygosity", "score")
    # Pull column ``position`` out of every row directly; an entry without
    # rows naturally yields an empty list.
    dfd = {
        name: [[row[position] for row in rows] for rows in cosbase]
        for position, name in enumerate(columns)
    }
    dfd["cosmic_id"] = cid.tolist()
    cosmicdb = pd.DataFrame(dfd)

    keep = pd.Series(True, index=cosmicdb.index)
    for name in columns:
        keep &= cosmicdb[name].map(len) > 0
    # Copy so the assignment below works on an independent frame and does not
    # trigger pandas' SettingWithCopyWarning.
    cosmicdb = cosmicdb[keep].copy()
    cosmicdb["score"] = cosmicdb["score"].apply(lambda values: float(values[0]))
    return cosmicdb
### Look for stats of a gene | |
def inputgene(lookforgene, merging, id=0) -> dict:
    """Describe the per-sample row counts of one gene for one exposure type.

    ``merging`` is collapsed to one row per
    (gene_name, UV_exposure_tissue, sampleID) holding the non-null count of
    every other column; the ``chr`` counts of the requested gene and exposure
    type are then summarised with ``describe()``.

    Args:
        lookforgene: gene name to look up.
        merging: DataFrame with the columns ``gene_name``,
            ``UV_exposure_tissue``, ``sampleID`` and ``chr``.
        id: 0 -> 'Intermittently-photoexposed'; any other value ->
            'Chronically-photoexposed'.

    Returns:
        dict with the ``describe()`` statistics plus ``gene_name`` and
        ``UV_exposure_tissue``.
    """
    exposure = 'Intermittently-photoexposed' if id == 0 else 'Chronically-photoexposed'

    per_sample = merging.groupby(by=["gene_name", "UV_exposure_tissue", "sampleID"]).count().reset_index()
    is_gene = per_sample.gene_name == lookforgene
    is_exposure = per_sample.UV_exposure_tissue == exposure
    selected = per_sample[is_gene & is_exposure]

    # Statistics about the gene across samples.
    summary = dict(selected.chr.describe())
    summary["gene_name"] = lookforgene
    summary["UV_exposure_tissue"] = exposure
    return summary
### Look for stats of all genes | |
def gene_exposed(merging, id=0):
    """Return a DataFrame with the ``inputgene`` statistics of every gene.

    One row is produced per distinct value of ``merging.gene_name``, in order
    of first appearance; ``id`` is forwarded unchanged to ``inputgene``.
    """
    per_gene_stats = [inputgene(gene, merging, id) for gene in merging.gene_name.unique()]
    return pd.DataFrame(per_gene_stats)
### Merge stats for continuous and intermittently exposed | |
def mergecontintinfo(merging):
    """Stack the per-gene statistics of both exposure types in one DataFrame.

    The chronically exposed statistics (id 1) come first, followed by the
    intermittently exposed ones (id 0); the original row indexes are kept.
    """
    frames = [gene_exposed(merging, exposure_id) for exposure_id in (1, 0)]
    return pd.concat(frames, axis=0)
#### Common tissues, zygosities and histologies | |
def explodecommon(bd, N, col):
    """Return the ``N`` keys that occur in the most rows of ``bd[col]``.

    Every cell of ``bd[col]`` must be a mapping (anything with ``.keys()``).
    A key is counted once per row that contains it, regardless of the value
    stored under it.

    Rows holding an empty mapping contribute nothing. (Exploding them, as the
    previous implementation did, produced a NaN that was counted as a key.)

    Returns:
        list of ``(key, number_of_rows)`` tuples, most frequent first.
    """
    tally = Counter()
    for mapping in bd[col]:
        tally.update(mapping.keys())
    return tally.most_common(N)
def pdcommon(db, col, uv: str) -> pd.DataFrame:
    """Wrap a list of ``(value, count)`` pairs in a labelled DataFrame.

    The first positional column is renamed to ``col``, the second to
    ``Times_<col>``, and a constant ``UV_exposure_tissue`` column holding
    ``uv`` is added.
    """
    new_names = {0: col, 1: "Times_{}".format(col)}
    frame = pd.DataFrame(db)
    frame = frame.rename(columns=new_names)
    frame["UV_exposure_tissue"] = uv
    return frame
def get_N_common(df, col, N=10) -> pd.DataFrame:
    """Rank the ``N`` most common values of ``col`` overall and per exposure.

    Works on a deep copy of ``df``. Every cell of ``col`` is converted to a
    ``Counter`` and the ranking is computed three times: over all rows
    (labelled "Total"), over the 'Intermittently-photoexposed' rows and over
    the 'Chronically-photoexposed' rows.

    Returns:
        The three rankings stacked in that order, each in the layout produced
        by ``pdcommon``.
    """
    counted = df.copy(deep=True)
    counted[col] = counted[col].apply(lambda cell: Counter(cell))

    exposure = counted.UV_exposure_tissue
    subsets = (
        ("Total", counted),
        ("Intermittently-photoexposed", counted[exposure == "Intermittently-photoexposed"]),
        ("Chronically-photoexposed", counted[exposure == "Chronically-photoexposed"]),
    )
    ranked_frames = [
        pdcommon(explodecommon(subset, N, col), col, label)
        for label, subset in subsets
    ]
    return pd.concat(ranked_frames, axis=0)
### Detailed information of mutation type
def mut_type(x):
    """Return a finer-grained mutation label for one row.

    ``x`` must expose ``mut_type``, ``ref`` and ``mut``. Rows whose
    ``mut_type`` is not "Indel" keep their label. For indels the label is
    "Del" when ``ref`` is longer than ``mut``, "In" when ``mut`` is longer,
    and ``"<ref>><mut>"`` when both have the same length.

    NOTE(review): the indentation of the source was lost; this assumes the
    ``ref + ">" + mut`` fallback belongs to the indel branch (the reading in
    which every statement is reachable) -- confirm against the original file.
    """
    if x.mut_type != "Indel":
        return x.mut_type

    ref_len = len(x.ref)
    mut_len = len(x.mut)
    if ref_len > mut_len:
        return "Del"
    if mut_len > ref_len:
        return "In"
    return x.ref + ">" + x.mut
def distribution_gene(df, hue):
    """Build a grouped bar chart of mutation counts per custom mutation type.

    Args:
        df: DataFrame with the columns ``mut_type_cus``, ``sampleID`` and the
            column named by ``hue``.
        hue: column used to split and colour the bars.

    Returns:
        The plotly figure; bar height ``n_mut`` is the number of non-null
        ``sampleID`` values per (``hue``, ``mut_type_cus``) pair.
    """
    # Count ``sampleID`` explicitly. Slicing the first three columns and then
    # renaming only worked when ``sampleID`` happened to be the first
    # non-grouping column of ``df``.
    counts = (
        df.groupby([hue, "mut_type_cus"])["sampleID"]
        .count()
        .reset_index(name="n_mut")
    )
    counts = counts.sort_values(by="mut_type_cus", ascending=True)
    return px.bar(counts, x="mut_type_cus", y="n_mut", color=hue, barmode="group")