Spaces:

nyax
/

PhyloLM

Running on Zero

App Files Files Community

Daetheys committed 6 days ago

Commit

3d6ba31

1 Parent(s): 77fc2c1

First version gradio

Browse files

Files changed (11) hide show

app.py +396 -0
constants.py +8 -0
family_table.json +1 -0
inputs/math.json +1 -0
llm_run.py +48 -0
loading.py +132 -0
packages.txt +2 -0
phylogeny.py +114 -0
plotting.py +522 -0
requirements.txt +15 -0
tools.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,396 @@

+import gradio as gr
+import os
+import numpy as np
+import ujson as json
+from loading import load_data, save_git
+from tools import compute_ordered_matrix
+from plotting import plot_sim_matrix_fig, plot_umap_fig, plot_tree, update_sim_matrix_fig, update_umap_fig, update_tree_fig
+from llm_run import download_llm_to_cache, load_model, llm_run
+def reload_figures():
+    global MODEL_SEARCHED_X, MODEL_SEARCHED_Y, ALPHA_EDGES, ALPHA_NAMES, ALPHA_MARKERS, FIGS, ORDERED_MODEL_NAMES
+    fig1 = update_sim_matrix_fig(FIGS['fig1'], ORDERED_MODEL_NAMES, model_search_x=MODEL_SEARCHED_X, model_search_y=MODEL_SEARCHED_Y)
+    fig2 = update_umap_fig(FIGS['fig2'], DIST_MATRIX, MODEL_NAMES, FAMILIES, COLORS, model_search_x=MODEL_SEARCHED_X, alpha_edges=ALPHA_EDGES['fig2'], alpha_names=ALPHA_NAMES['fig2'], alpha_markers=ALPHA_MARKERS['fig2'])
+    fig4 = update_tree_fig(FIGS['fig4'], MODEL_NAMES, model_search=MODEL_SEARCHED_X, alpha_edges=ALPHA_EDGES['fig4'], alpha_names=ALPHA_NAMES['fig4'], alpha_markers=ALPHA_MARKERS['fig4'])
+    return [fig1,fig2,fig4]
+def search_bar_changeX(value):
+    global MODEL_SEARCHED_X
+    MODEL_SEARCHED_X = value
+    return reload_figures()
+def search_bar_changeY(value):
+    global MODEL_SEARCHED_Y
+    MODEL_SEARCHED_Y = value
+    return reload_figures()
+def slider_changeAlphaMarkers(value,key):
+    global ALPHA_MARKERS
+    ALPHA_MARKERS[key] = value
+    return reload_figures()
+def slider_changeAlphaNames(value,key):
+    global ALPHA_NAMES
+    ALPHA_NAMES[key] = value
+    return reload_figures()
+def slider_changeAlphaEdges(value,key):
+    global ALPHA_EDGES
+    ALPHA_EDGES[key] = value
+    return reload_figures()
+def search_bar_gr(model_names,slider=True,double_search=False,key=None):
+    global MODEL_SEARCHED_X,MODEL_SEARCHED_Y,ALPHA_EDGES,ALPHA_NAMES, ALPHA_MARKERS
+    #col1,col2 = gr.Row([0.2,0.8])
+    ret = []
+    with gr.Column(scale=1) as col1:
+        with gr.Group():
+            if MODEL_SEARCHED_X is None:
+                index = 0
+            else:
+                index = model_names.index(MODEL_SEARCHED_X)
+            ms_x = gr.Dropdown(label='Search'+(' X' if double_search else ''),choices=model_names,value=model_names[index],key='model_search_x_'+key,interactive=True)
+            #set MODEL_SEARCH_X
+            ret.append(ms_x)
+            if double_search:
+                if MODEL_SEARCHED_Y is None:
+                    index = 0
+                else:
+                    index = model_names.index(MODEL_SEARCHED_Y)
+                ms_y = gr.Dropdown(label='Search Y',choices=model_names,value=model_names[index],key='model_search_y_'+key,interactive=True)
+                ret.append(ms_y)
+        if slider:
+            with gr.Group():
+                values = np.arange(0, 1.05,0.05)
+                #truncate values to the 100th
+                values = np.round(values,2)
+                alpha_edges = gr.Slider(label='Alpha Edges',
+                                minimum=0,
+                                maximum=1,
+                                step=0.05,
+                                value=ALPHA_EDGES[key],
+                                    key='alpha_edges_'+key,
+                                    interactive=True)
+                values = np.arange(0, 1.05,0.05)
+                #truncate values to the 100th
+                values = np.round(values,2)
+                alpha_names = gr.Slider(label='Alpha Names',
+                                minimum=0,
+                                maximum=1,
+                                step=0.05,
+                                value=ALPHA_NAMES[key],
+                                    key='alpha_names_'+key,
+                                    interactive=True)
+                values = np.arange(0, 1.05,0.05)
+                #truncate values to the 100th
+                values = np.round(values,2)
+                alpha_markers = gr.Slider(label='Alpha Markers',
+                                minimum=0,
+                                maximum=1,
+                                step=0.05,
+                                value=ALPHA_MARKERS[key],
+                                    key='alpha_markers_'+key,
+                                    interactive=True)
+                ret.append(alpha_edges)
+                ret.append(alpha_names)
+                ret.append(alpha_markers)
+    col2 = gr.Column(scale=5)
+    ret.insert(0,col2)
+    return ret
+import spaces
+@spaces.GPU(duration=300)
+def _run(path,genes,N,progress_bar):
+    #Load the model
+    progress_bar(0.20, desc="Loading Model...",total=100)
+    try:
+        model,tokenizer = load_model(path)
+    except ValueError as e:
+            print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model couldn't load. This space currently only works with AutoModelForCausalLM models. Please check the model architecture and try again.")
+            return None
+    except OSError as e:
+            print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model doesn't seem to exist on the HuggingFace Hub. Please check the model name and try again.")
+            return None
+    except RuntimeError as e:
+            if 'out of memory' in str(e):
+                print(f"Error loading model '{path}': {e}")
+                gr.Warning("Loading the model triggered an out of memory error. It may be too big for the GPU (80Go RAM). Please try again with a smaller model.")
+                return None
+            else:
+                print(f"Error loading model '{path}': {e}")
+                gr.Warning("Model couldn't be loaded. Please check the logs or report an issue.")
+                return None
+    except Exception as e:
+            print(f"Error loading model '{path}': {e}")
+            gr.Warning("Model couldn't be loaded. Please check logs or report an issue.")
+            return None
+    progress_bar(0.25, desc="Generating data...",total=100)
+    for i,output in enumerate(llm_run(model,tokenizer,genes,N)):
+        progress_bar(0.25 + i*(70/len(genes))/100, desc=f"Generating data... {i+1}/{len(genes)}",total=100)
+    return output
+def run(path,progress_bar):
+    global DEFAULT_FAMILY_NAME, PHYLOLM_N
+    family = DEFAULT_FAMILY_NAME
+    N = PHYLOLM_N
+    #Loading bar
+    progress_bar(0, desc="Downloading model...",total=100)
+    try:
+        # Download the model to cache
+        if download_llm_to_cache(path) is None:
+            gr.Warning("Model not found on Hugging Face Hub. Please check the model name and try again.")
+            return None
+    except OSError as e:
+        print(f"Error downloading model: {e}")
+        gr.Warning("Model not found on Hugging Face Hub. Please check the model name and try again.")
+        return None
+    # Load the model
+    progress_bar(0.10, desc="Loading contexts...",total=100)
+    with open('inputs/math.json', 'r') as f:
+        genes = json.load(f)
+    # Load the model and run
+    progress_bar(0.15, desc="Waiting for GPU...",total=100)
+    try:
+        output = _run(path,genes,N,progress_bar)
+        if output is None:
+            return None
+    except Exception as e:
+        print(f"Error running model: {e}")
+        gr.Warning("Something unexpected happened during the run or the loading of the model. Please check the logs or report an issue.")
+        return None
+    progress_bar(0.95, desc="Saving data ...",total=100)
+    alleles = [[compl[j]['generated_text'][len(gene):][:4] for j in range(len(compl))] for gene,compl in zip(genes,output)]
+    save_git(alleles,genes,path,family)
+    progress_bar(1, desc="Done!",total=100)
+def prepare_run(model_name,progress_bar=gr.Progress()):
+    global MODEL_SEARCHED_X,MODEL_NAMES
+    if model_name in MODEL_NAMES:
+        gr.Warning('Model already exists in the database.')
+        MODEL_SEARCHED_X = model_name
+        reload_figures()
+        return
+    run(model_name,progress_bar)
+def reload_env():
+    global SIM_MAT_SEARCH_X, SIM_MAT_SEARCH_Y, VIZ_SEARCH, TREE_SEARCH
+    global MODEL_NAMES, FAMILIES, COLORS, SIM_MATRIX, DIST_MATRIX
+    global FIGS, FIGS_OBJECTS
+    # Load models for the dropdown
+    data, model_names, families, sim_matrix, colors = load_data()
+    sim_matrix_safe = np.where(sim_matrix == 0, np.finfo(np.float64).eps, sim_matrix)
+    dist_matrix = -np.log(sim_matrix_safe)
+    #Set globals
+    MODEL_NAMES = model_names
+    FAMILIES = families
+    COLORS = colors
+    SIM_MATRIX = sim_matrix
+    DIST_MATRIX = dist_matrix
+    #Update Figs
+    ordered_sim_matrix, ordered_model_names = compute_ordered_matrix(sim_matrix,dist_matrix, model_names)
+    ORDERED_MODEL_NAMES = ordered_model_names
+    FIGS['fig1'] = plot_sim_matrix_fig(ordered_sim_matrix, ordered_model_names, families, colors)
+    FIGS['fig2'] = plot_umap_fig(dist_matrix, sim_matrix, model_names, families, colors,
+                                        alpha_edges=ALPHA_EDGES['fig2'],alpha_names=ALPHA_NAMES['fig2'],alpha_markers=ALPHA_MARKERS['fig2'])
+    FIGS['fig4'] = plot_tree(sim_matrix, model_names, families, colors,alpha_edges=ALPHA_EDGES['fig4'],alpha_names=ALPHA_NAMES['fig4'],alpha_markers=ALPHA_MARKERS['fig4'])
+    #Update search bars
+    sim_mat_search_x = gr.Dropdown(label='Search X',choices=model_names,value=model_names[0],key='model_search_x_fig1',interactive=True)
+    sim_mat_search_y = gr.Dropdown(label='Search Y',choices=model_names,value=model_names[0],key='model_search_y_fig1',interactive=True)
+    viz_search = gr.Dropdown(label='Search',choices=model_names,value=model_names[0],key='model_search_fig2',interactive=True)
+    tree_search = gr.Dropdown(label='Search',choices=model_names,value=model_names[0],key='model_search_fig4',interactive=True)
+    return FIGS['fig1'], FIGS['fig2'], FIGS['fig4'], sim_mat_search_x, sim_mat_search_y, viz_search, tree_search
+# Load environment variables
+USERNAME = os.environ['GITHUB_USERNAME']
+TOKEN = os.environ['GITHUB_TOKEN']
+MAIL = os.environ['GITHUB_MAIL']
+MODEL_SEARCHED_X = None
+MODEL_SEARCHED_Y = None
+ALPHA_EDGES = {'fig2':0.05, 'fig3':0.05,'fig4':1.0}
+ALPHA_NAMES = {'fig2':0.0, 'fig3':0.0,'fig4':0.0}
+ALPHA_MARKERS = {'fig2':0.8, 'fig3':0.8,'fig4':1.0}
+FIGS = {'fig1':None,'fig2':None,'fig3':None,'fig4':None}
+FIGS_OBJECTS = [None,None,None]
+MODEL_NAMES = None
+FAMILIES = None
+COLORS = None
+ORDERED_MODEL_NAMES = None
+SIM_MATRIX = None
+DIST_MATRIX = None
+DEFAULT_FAMILY_NAME = '?'
+PHYLOLM_N = 32
+SIM_MAT_SEARCH_X = None
+SIM_MAT_SEARCH_Y = None
+VIZ_SEARCH = None
+TREE_SEARCH = None
+# Build the Gradio interface
+# Layout, top to bottom: introduction, three visualization tabs, the model
+# submission form, disclaimer and citation. Event handlers are wired at the end,
+# once every component exists.
+with gr.Blocks(title="PhyloLM", theme=gr.themes.Default()) as demo:
+    gr.Markdown("# PhyloLM: Phylogenetic Mapping of Language Models")
+    gr.Markdown(
+        "Welcome to PhyloLM ([paper](https://arxiv.org/abs/2404.04671) - [code](https://github.com/Nicolas-Yax/PhyloLM)) — a tool for comparing language models based on their **behavioral similarity**, inspired by methods from comparative genomics. "
+        "Instead of architecture or weights, we use output behavior on diagnostic prompts as a behavioral fingerprint to compute a distance metric, akin to how biologists compare species using genetic data. This makes it possible to draw a unique map of all LLMs (various architectures, gated and non gated, ...)."
+        "The goal of this space is to create a collaborative space where everyone can visualize these maps and extend them with models of their choice. "
+    )
+    gr.Markdown("## Explore Maps of Models")
+    # NOTE(review): the text below announces four visualizations including a 3D
+    # scatter plot, while only three tabs are created further down — confirm.
+    gr.Markdown(
+        "This interactive space allows users to explore model similarities through four types of visualizations:\n"
+        "- A similarity matrix (values range from 0 = dissimilar to 1 = highly similar). \n"
+        "- 2D and 3D scatter plots representing how close or far from each other LLMs are (plotted using UMAP). \n"
+        "- A tree to visualize distances between models (distance from leaf A to leaf B in the tree is similar to the distance between the two models)\n\n"
+    )
+    # Load models for the dropdown
+    data, model_names, families, sim_matrix, colors = load_data()
+    # Exact zeros are replaced by machine epsilon so that the logarithm stays finite.
+    sim_matrix_safe = np.where(sim_matrix == 0, np.finfo(np.float64).eps, sim_matrix)
+    dist_matrix = -np.log(sim_matrix_safe)
+    #Set globals
+    MODEL_NAMES = model_names
+    FAMILIES = families
+    COLORS = colors
+    SIM_MATRIX = sim_matrix
+    DIST_MATRIX = dist_matrix
+    # Create the tabs
+    # NOTE(review): tab_state is not referenced again in the visible code.
+    tab_state = gr.State(value="Similarity Matrix")  # Default tab
+    # NOTE(review): gr.Tabs receives the tab titles as a positional list — TODO
+    # confirm the installed Gradio version accepts this argument.
+    tabs = gr.Tabs(["Similarity Matrix", "2D Visualization","Tree Visualization"])
+    with tabs:
+        with gr.TabItem("Similarity Matrix"):
+            # Similarity matrix visualization
+            with gr.Row():
+                # search_bar_gr returns the plot column first, then the controls it created.
+                col2,sim_mat_search_x,sim_mat_search_y = search_bar_gr(model_names,slider=False,double_search=True,key='fig1')
+                with col2:
+                    ordered_sim_matrix, ordered_model_names = compute_ordered_matrix(sim_matrix,dist_matrix, model_names)
+                    fig = plot_sim_matrix_fig(ordered_sim_matrix, ordered_model_names, families, colors)
+                    sim_matrix_output = gr.Plot(fig,label="Similarity Matrix")
+                    # Cache the figure and its component for the event handlers.
+                    FIGS['fig1'] = fig
+                    ORDERED_MODEL_NAMES = ordered_model_names
+                    FIGS_OBJECTS[0] = sim_matrix_output
+        with gr.TabItem("2D Visualization"):
+            # 2D visualization
+            with gr.Row():
+                col2,viz_search,viz_alpha_edge,viz_alpha_name,viz_alpha_marker = search_bar_gr(model_names,slider=True,double_search=False,key='fig2')
+                with col2:
+                    fig = plot_umap_fig(dist_matrix, sim_matrix, model_names, families, colors,
+                                        alpha_edges=ALPHA_EDGES['fig2'],alpha_names=ALPHA_NAMES['fig2'],alpha_markers=ALPHA_MARKERS['fig2'])
+                    plot_output = gr.Plot(fig,label="2D Visualization")
+                    FIGS['fig2'] = fig
+                    FIGS_OBJECTS[1] = plot_output
+        with gr.TabItem("Tree Visualization"):
+            # Tree visualization
+            with gr.Row():
+                col2,tree_search,tree_alpha_edge,tree_alpha_name,tree_alpha_marker = search_bar_gr(model_names,slider=True,double_search=False,key='fig4')
+                with col2:
+                    fig = plot_tree(sim_matrix, model_names, families, colors,alpha_edges=ALPHA_EDGES['fig4'],alpha_names=ALPHA_NAMES['fig4'],alpha_markers=ALPHA_MARKERS['fig4'])
+                    tree_output = gr.Plot(fig,label="Tree Visualization")
+                    FIGS['fig4'] = fig
+                    FIGS_OBJECTS[2] = tree_output
+    # Submit model section
+    gr.Markdown("## Submitting a Model")
+    gr.Markdown(
+        "You may contribute new models to this collaborative space using compute resources. "
+        "Once processed, the model will be compared to existing ones, and its results added to a shared public database. "
+        "Model families (e.g., LLaMA, OPT, Mistral) are extracted from Hugging Face model cards and used only for visualization (e.g., coloring plots); they are **not** involved in the computation of similarity."
+    )
+    gr.Markdown(
+        "**To add a new model:**\n"
+        "1. Enter the name of a model hosted on Hugging Face (e.g., `'mistralai/Mistral-7B-Instruct-v0.3'`).\n"
+        "2. Click on the **Run PhyloLM** button.\n"
+        "- If the model has already been processed, you'll be notified and no new run will start.\n"
+        "- If it hasn't been processed, it will be downloaded and be evaluated.\n\n"
+        "⚠️ Be careful when submitting large LLMs (typically >15B parameters) as they may exceed the GPU RAM or the time limit, leading to failed runs."
+    )
+    with gr.Group():
+        model_input = gr.Textbox(label="Model", interactive=True)
+        submit_btn = gr.Button("Run PhyloLM", variant="primary")
+    # Disclaimer and citation
+    gr.Markdown("## Disclaimer")
+    gr.Markdown(
+        "This is a research prototype and may contain bugs or limitations. "
+        "All computed data are public and hosted on [GitHub](https://github.com/PhyloLM/Data). "
+        "If you'd like to contribute additional models — especially for gated or large models that cannot be processed via the web interface — "
+        "you are welcome to submit a pull request to the repository cited above. "
+        "All results are computed on the 'Math' set of genes used in the original paper."
+    )
+    gr.Markdown("## Citation")
+    gr.Markdown("If you find this project useful for your research, please consider citing the following paper:")
+    #bibtex
+    gr.Code('''@inproceedings{
+yax2025phylolm,
+title={Phylo{LM}: Inferring the Phylogeny of Large Language Models and Predicting their Performances in Benchmarks},
+author={Nicolas Yax and Pierre-Yves Oudeyer and Stefano Palminteri},
+booktitle={The Thirteenth International Conference on Learning Representations},
+year={2025},
+url={https://openreview.net/forum?id=rTQNGQxm4K}
+}''',language=None)
+    # Change actions from search bars
+    # Every handler returns three figures, written to the three plots in FIGS_OBJECTS.
+    sim_mat_search_x.change(fn=search_bar_changeX, inputs=sim_mat_search_x, outputs=FIGS_OBJECTS)
+    sim_mat_search_y.change(fn=search_bar_changeY, inputs=sim_mat_search_y, outputs=FIGS_OBJECTS)
+    viz_search.change(fn=search_bar_changeX, inputs=viz_search, outputs=FIGS_OBJECTS)
+    tree_search.change(fn=search_bar_changeX, inputs=tree_search, outputs=FIGS_OBJECTS)
+    # Change actions from sliders
+    # The lambdas bind the figure key, since a slider only passes its value.
+    viz_alpha_edge.change(fn=lambda x : slider_changeAlphaEdges(x,'fig2'), inputs=viz_alpha_edge, outputs=FIGS_OBJECTS)
+    viz_alpha_name.change(fn=lambda x : slider_changeAlphaNames(x,'fig2'), inputs=viz_alpha_name, outputs=FIGS_OBJECTS)
+    viz_alpha_marker.change(fn=lambda x : slider_changeAlphaMarkers(x,'fig2'), inputs=viz_alpha_marker, outputs=FIGS_OBJECTS)
+    tree_alpha_edge.change(fn=lambda x : slider_changeAlphaEdges(x,'fig4'), inputs=tree_alpha_edge, outputs=FIGS_OBJECTS)
+    tree_alpha_name.change(fn=lambda x : slider_changeAlphaNames(x,'fig4'), inputs=tree_alpha_name, outputs=FIGS_OBJECTS)
+    tree_alpha_marker.change(fn=lambda x : slider_changeAlphaMarkers(x,'fig4'), inputs=tree_alpha_marker, outputs=FIGS_OBJECTS)
+    # Run PhyloLM button
+    # prepare_run runs first; reload_env then rebuilds the figures and the four search dropdowns.
+    submit_btn.click(fn=prepare_run, inputs=[model_input], outputs=[model_input]).then(fn=reload_env, inputs=[], outputs=FIGS_OBJECTS+ [sim_mat_search_x, sim_mat_search_y, viz_search, tree_search])
+    #Set more globals
+    SIM_MAT_SEARCH_X = sim_mat_search_x
+    SIM_MAT_SEARCH_Y = sim_mat_search_y
+    VIZ_SEARCH = viz_search
+    TREE_SEARCH = tree_search
+# Start the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

constants.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from matplotlib import colors as mcolors
+UNKNOWN_COLOR = 'gray'
+UNKNOWN_COLOR_RGB = mcolors.to_rgb(UNKNOWN_COLOR)
+UNKNOWN_COLOR_RGB = tuple([int(255 * c) for c in UNKNOWN_COLOR_RGB])
+DEFAULT_COLOR = 'black'
+DEFAULT_COLOR_RGB = mcolors.to_rgb(DEFAULT_COLOR)
+DEFAULT_COLOR_RGB = tuple([int(255 * c) for c in DEFAULT_COLOR_RGB])

family_table.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"JosephusCheung\/Guanaco":"Llama","Intel\/neural-chat-7b-v3":"Mistral","Intel\/neural-chat-7b-v3-1":"Mistral","teknium\/OpenHermes-2-Mistral-7B":"Mistral","teknium\/OpenHermes-2.5-Mistral-7B":"Mistral","teknium\/OpenHermes-13B":"Llama","teknium\/OpenHermes-7B":"Llama","mistralai\/Mistral-7B-Instruct-v0.2":"Mistral","mistralai\/Mixtral-8x7B-Instruct-v0.1":"Mistral","mistralai\/Mistral-7B-v0.1":"Mistral","mistralai\/Mixtral-8x7B-v0.1":"Mistral","mistralai\/Mistral-7B-Instruct-v0.1":"Mistral","chavinlo\/alpaca-native":"Llama","CausalLM\/14B":"Qwen","CausalLM\/7B":"Qwen","bigscience\/bloom-3b":"Bloom","bigscience\/bloom-7b1":"Bloom","bigscience\/bloomz-3b":"Bloom","bigscience\/bloom":"Bloom","bigscience\/bloomz-7b1":"Bloom","berkeley-nest\/Starling-LM-7B-alpha":"Mistral","EleutherAI\/pythia-6.9b":"Pythia","EleutherAI\/pythia-1.4b":"Pythia","EleutherAI\/pythia-12b":"Pythia","EleutherAI\/pythia-2.8b":"Pythia","EleutherAI\/pythia-410m":"Pythia","EleutherAI\/pythia-70m":"Pythia","EleutherAI\/pythia-160m":"Pythia","roneneldan\/TinyStories-1M":"TinyStories","lmsys\/vicuna-13b-v1.5":"Llama","lmsys\/vicuna-7b-v1.1":"Llama","lmsys\/vicuna-13b-v1.3":"Llama","lmsys\/vicuna-7b-v1.5":"Llama","lmsys\/vicuna-13b-v1.1":"Llama","lmsys\/vicuna-7b-v1.3":"Llama","google\/gemma-7b":"Gemma","google\/codegemma-7b":"Gemma","google\/gemma-2b-it":"Gemma","google\/codegemma-2b":"Gemma","google\/codegemma-7b-it":"Gemma","google\/gemma-1.1-7b-it":"Gemma","google\/gemma-2b":"Gemma","google\/gemma-1.1-2b-it":"Gemma","google\/gemma-7b-it":"Gemma","microsoft\/Orca-2-13b":"Llama","microsoft\/Orca-2-7b":"Llama","Imran1\/MedChat3.5":"Mistral","tenyx\/TenyxChat-7B-v1":"Mistral","databricks\/dolly-v2-7b":"Pythia","databricks\/dolly-v2-3b":"Pythia","databricks\/dolly-v2-12b":"Pythia","Qwen\/Qwen-1_8B":"Qwen","Qwen\/Qwen1.5-0.5B":"Qwen","Qwen\/Qwen1.5-72B-Chat":"Qwen","Qwen\/Qwen1.5-7B-Chat":"Qwen","Qwen\/Qwen1.5-2B-Chat":"Qwen","Qwen\/Qwen1.5-7B":"Qwen","Qwen\/Qwen1.5-72B":"Qwen","Qwen\/Qwen1.5-32B-Chat":
"Qwen","Qwen\/Qwen1.5-4B-Chat":"Qwen","Qwen\/Qwen1.5-1.8B":"Qwen","Qwen\/Qwen1.5-14B-Chat":"Qwen","Qwen\/Qwen1.5-0.5B-Chat":"Qwen","Qwen\/Qwen-72B":"Qwen","Qwen\/Qwen-14B":"Qwen","Qwen\/Qwen1.5-4B":"Qwen","Qwen\/Qwen1.5-14B":"Qwen","Qwen\/Qwen-7B":"Qwen","Qwen\/Qwen1.5-32B":"Qwen","OpenAssistant\/oasst-sft-4-pythia-12b-epoch-3.5":"Pythia","mlabonne\/NeuralHermes-2.5-Mistral-7B":"Mistral","facebook\/opt-6.7b":"OPT","facebook\/opt-125m":"OPT","facebook\/opt-66b":"OPT","facebook\/opt-13b":"OPT","facebook\/opt-1.3b":"OPT","facebook\/opt-30b":"OPT","facebook\/opt-350m":"OPT","facebook\/opt-2.7b":"OPT","HuggingFaceH4\/zephyr-7b-beta":"Mistral","HuggingFaceH4\/zephyr-7b-alpha":"Mistral","openchat\/openchat_v2":"Llama","openchat\/openchat_v2_w":"Llama","openchat\/openchat_v3.2":"Llama","openchat\/openchat_v3.1":"Llama","openchat\/openchat_3.5":"Mistral","openchat\/openchat_v3.2_super":"Llama","TigerResearch\/tigerbot-13b-base-v2":"Llama","TigerResearch\/tigerbot-7b-base-v2":"Bloom","TigerResearch\/tigerbot-7b-chat":"Llama","TigerResearch\/tigerbot-13b-chat-v1":"Llama","TigerResearch\/tigerbot-7b-sft-v1":"Bloom","TigerResearch\/tigerbot-7b-sft-v2":"Bloom","TigerResearch\/tigerbot-13b-chat-v2":"Llama","TigerResearch\/tigerbot-13b-chat-v3":"Llama","TigerResearch\/tigerbot-13b-chat-v4":"Llama","TigerResearch\/tigerbot-7b-base-v1":"Bloom","TigerResearch\/tigerbot-13b-base-v1":"Llama","TigerResearch\/tigerbot-7b-base":"Llama","fxmarty\/tiny-llama-fast-tokenizer":"fxmarty","project-baize\/baize-v2-7b":"Llama","huggyllama\/llama-7b":"Llama","huggyllama\/llama-13b":"Llama","Arc53\/docsgpt-7b-mistral":"Mistral","meta-llama\/Llama-2-7b-hf":"Llama","meta-llama\/Llama-2-13b-hf":"Llama","meta-llama\/Llama-2-7b":"Llama"}

inputs/math.json ADDED Viewed

	@@ -0,0 +1 @@

+ ["# In observing a Tetrahedron...", "# Strong and Weak Form Solution \u2013 FEA\n\nPartial Differential Equations \u2013 PDE is called \u201cstro", "# Loci Browse Articles\n\nDisplaying 41 - 50 of 323\n\nThis article describes methods for cr", "Paul's Online Notes\nHome / Calculus I / Review / Trig Functions\nShow Mobile Notice Show All Not", "# Recursive formula for joint moments in free probability\n\nSuppose $\\mathfrak{A}$ is an algebra ", "# Is the oxygen molecule $O_2$ fermion or boson?\n\nI ask something makes me confused.\n", "# Python Indices of numbers greater than K\n\nIn this tutorial, we are going to ", "To find total cost, to the nearest cent, to cool the house for this 24-hour p", "# Propositional Logic", "Modeling the train reservation kata -", "[texhax] environment ", "# stability \u2026 boring old and simple stability\n\n[xxx@yyy~]$uptime 15:07:51 up 505 days, 47 min", "By accessing our 180 Days of Math for Sixth Grade Answers Key Day 72 regularly, students can get b", "E. Square Root of Permutation\ntime limit per test\n2 seconds\nmemory l", "# Skills\n\nThe Discworld skill model is large and complex. It is broken up into eight branches.\n", "# Just Some Division\n\nNumber Theory Level 1\n\n$$N$$ is a positive integer such that $$10", "MathSciNet bibliographic data MR343259 55B25 (55G35 57E15) Matumoto, Takao Eq", "InTech uses cookies to offer you the best online experience. By continuing to use our sit", "## Train Tracks\n\nConsider a segment of", "## Stream: general\n\n### Topic: detecti", "This vignette discusses data.table\u2019s reference semant", "A cell made up of two hydrogen electrodes. 
The positive electrode is in cont", "# Better way to calculate coordinates in Tikz?\n\nI am great fan of pgf and tikz in general to pro", "# Physics (Version 8.4", "# Abc conjecture\n\n\ufeff\nAbc conjecture\n\nThe abc conjecture is a conjecture", "# No offense intended, but\u2026\n\nI have", "Select Board & Class\n\nAreas Related to Circles\n\nTo und", "Determine whether $\\sum_{n=1}^\\infty \\frac{\\sin^2 n}{n^2}$ conv", "# Calculating Derivative \u2013 Third root \u2013 Exercise 1106\n\nExercise\n\nFind the derivati", "# BE Thesis\n\nSenast inlagda poster:\n2018-09-06\n", "# All Questions\n\n1,360 questions\nFilter by\nSorted by\nTagged with\n1answer\n113 views\n\n###", "# Ignatius and the P", "## anonymous one year ago What is the fifth term of the sequence who", "size - Maple Help\n\nMTM\n\n size\n", "http://en.wikipedia.org/wiki/Taylor_series\n\n## Taylor series in several var", "# I Using determinant to find constraints on equation\n\nTags:\n1. Jan 15, 2017\n\n### TheDemx27\n", "# Decimal Numbers\n\nDecimal numbers are similar to fractions in basic principle.\n\nThey are often u", "Sales Toll Free No: 1-800-481-2338\n\n# How to Divide Monomial by a Non Zero Constant?\n\nTopPolynomial", "# Transformation of continuous uniform distribution\n", "CTF Team at the University of British Columbia\n\n# [corCTF 2021] smogofwar\n\n25 Aug 2021 ", "0\nResearch Papers\n\n# Analytic and Geometric ", "# Difference between revisions of \"1984 AIME Problems/Problem 11\"\n\n## Problem\n\nA ", "# Image Mosaicking\u00b6\n\n#", "# When two smaller atoms combine into a larger atom what has occurred?", "# Bessel functions\n\n(diff) \u2190 Older revision | La", "## WeBWorK Main Forum\n\n### Why Giga newton is not ", "### Homes\n\nThere are ", "## Return to Question\n\n2 deleted 256 characters in body\n\nA complex manifold $X$ is said ", "# Algebra Examples\n\nFind Pivot Positions and Pivot Co", "# The Physics Behind the American Death Tr", "Data\n\n1. 
Title: Webs and $q$-Howe dualities in types $\\mathbf{B}\\mathbf{C}\\math", "# All Questions\n\n1,524 questions\n", "# Pydon'ts\n\n## Improve your Python programming skills\n\n### Start here.\n\n294\n###", "# Math Help - 2-norm of a ma", "[texhax] \\mid Description\n\n", "# Q : 3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0Find the area of the region bou", "as.epidata {EpiILM} R Documentation\n\nDiscrete Tim", "# Basic Matrix Row Operations Tips\n\nThere are four basic op", "# Nets within nets from the Grothendieck\u00a0const", "Free Version\nEasy\n\n# Same Base Exponential Equations\n\n", "# Preserving Plant Diversity\n\n## Location:", "# Crash on HWNDComponentPeer::destroyWindowCallback\n\nIt just feels like you\u2019r", "# SNR\u00b6\n\nclass gammapy.astro.source.SNR(e_sn='1e51 erg', theta=<Quantity 0.1>, n_ISM=<Qua", "# Forecasting Pseudo Random Numbers Using Deep Learning\n\nPublisher: IEEE\n\n", "# Including both transformed and original data (untransformed) in a multivariable linear regressio", "# Technical Fridays\n\nFriday, September 1, 2017\n\nIn 1973, the Un", "# Representation of simple groups\n\nLet $G$ be a finite simple group, prove th", "# The Impact of Meteorological Facto", "Advertisement Remove all ads\n\n# If\u00a0y=log[x+sqrt(x^2+a^2)]\u00a0show that\u00a0(x^2+a^2)(d^2y)/(dx^2)+xdy", "2020 | Book\n\n# Principles of Data M", "# Thread: Math problem Parenthesis & PEMDAS\n\n1. ## Math problem Parenthesis & PEMDAS\n\nhi i n", "# Math Help - Physics problem; Find expression for veloci", "# Base class for finite field element", "# 0.1 Review exercises (ch 3-13) \u00a0(Page 11/12)\n\n Page 11 / 12\n\n130. Out", "All Rights Reserved. 
However, a compass needle will not be steady in the magneti", "# How is rest mass $m_0$ in $E=m_0c^2$ related to mass $m$ in $F=ma$?\n\nA p", "# All Questions\n\n7 views\n\n### Curve fitting of a list\n\nI have list obtained using a", "1 $\\begingroup$ Close", "# Annual income of A a", "## Seminars and Colloquia by Series\n\n### Geometric Equations for Matroid Varieties\n\nSeries\nSIAM S", "# How to prove that $C=\\{x: Ax\\le ", "# Phase locked Loop in Demodulation\n\nCan someone please clarify how a PLL works and how it can th", "# What is forward difference interpolation?\n\n## What is forward difference interpolati", "# Math Help - matlab code hel", "Question\n\n# What is the speed of the sound in a perfectly rigid rod?\n\nOpen in App\nSoluti", "# If $\\Sigma \\models \\phi$, then for some finite $\\Delta \\subset\\Sigma$, $\\Delta \\models", "# Need some help on a proof!\n\n1. Dec 12, 2004\n\n### MathematicalMatt\n\nHowdy, I just stumbled o", "## Calculus (3rd Edition)\n\n$f(x)=[x]$ has a jump discontinuity at $x=n$.\nThe function ", "Orthogonal functions\n\nOrthogonality\n\nTwo fu", "## MacKenzie's fundamental principle of greenkeeping\n\n##### 05 May 2017\n\nI taught two semi", "2 added 246 characters in body\n\n\"If you are walking between two policemen goin", "# \u3010BZOJ 4571\u3011[SCOI2016] \u7f8e\u5473\n\n#include<bits/stdc++.h>\n#define LL long long\nus", "# Math Help - confidence interval help!!!!\n\n1. ## confide", "# Alternatives\n\n## Summing Squares: Finding or Proving a Formula\n", "Thank you for visiting nature", "# How do I keep a string of text together without", "# Q6. 
In a model of a ship, the mast", "# How does the Taylor Series converge at all points f", "# Revealed preference\n\nRevealed preference theory, pioneered by American ec", "Previous issue \u00b7\u00a0 Next issue \u00b7\u00a0", "Bits - Maple Programming Help\n\nHome : Su", "# Difference between revisions of \"", "## [POJ2411]Mondriaan\\'s Dream\n\n \u6210", "# Period ofWeeks() method in Java\n\nJava 8Object Oriented ProgrammingProgramming\n\nThe", "Debugging graphs with ease: experimental Visual Studio plugin\n\nRevis", "k-mer overrepresentation of WGS Illumina reads\n0\n0\nEntering edit mode\n3.8 years a", "# Hydrometeorology Research Group\n\nIn\u00a0[5]:\nfrom IPython.display import HTM", "# \ud83d\udd35\u26aa\ud83d\udd34 Bioinspired tough gel sheath for robust and versatile surface functionalization \u2013 Content Mar", "# Recurring Decimal To Fraction Cal", "# Integral related to the modified Bessel function\n\nI would like to solve t", "# Viewpoint: Particle Decays Point to an Arrow of Time\n\n\u2022 Michael Ze", "CiteULike is a free online bibliography manager. 
Register a", "# Social Media, Misinformation, and Voting Decisions\n\nWorking Pap", "# Test Video\n\nAller \u00e0 : Navigation, rechercher\n\nThis is an", "Enterprise Multiples\n\n.\n\nEV/SALES\n\nEqu", "PLANET Discussion: Daeridune\n\nDiscussion in 'The Manaverse W", "# Tag Info\n\n## Hot answers tagged total\n\n6\n\nDon't subtrac", "# Welcome to grmpy\u2019s documentation!\u00b6\n\ngrmpy is an open-source packag", "# Solve the linear equatio", "## February 25, 2008\n\n### A Questio", "# Concept and expression of a real function\n\n## Concept o", "# Optimization Week 9: Convex conjugate (Fenche", "# Inversions of Insertion Sort and Bubble Sort\n\nAn array with bubblesort time $$\\Theta(n)$$ is noth", "## Simple power series\n\nHey all,\n\nDoes anybody know if $x^{\\alpha}$ can be written in terms of an ", "# HiggsTools\n\nStephen Jones (MPI Muni", "# Mathematics 1010 online\n\n## Complex Numbers\n\nRecall how we built the numbe", "# string.replace.regex\n\nSynt", "# Math Help - Odd Integers\n\n1. ## Odd Integers\n\nWhat is the product if the largest of three cons"]

llm_run.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import transformers
+from huggingface_hub import snapshot_download,constants
+def download_llm_to_cache(model_name, revision="main", cache_dir=None):
+    """
+    Download an LLM from the Hugging Face Hub to the cache without loading it into memory.
+    Args:
+        model_name (str): The name of the model on Hugging Face Hub (e.g., "meta-llama/Llama-2-7b-hf")
+        revision (str, optional): The specific model version to use. Defaults to "main".
+        cache_dir (str, optional): The cache directory to use. If None, uses the default HF cache directory.
+    Returns:
+        str: Path to the model in cache
+    """
+    # Get default cache dir if not specified
+    if cache_dir is None:
+        cache_dir = constants.HUGGINGFACE_HUB_CACHE
+    try:
+        # Download model to cache without loading into memory
+        cached_path = snapshot_download(
+            repo_id=model_name,
+            revision=revision,
+            cache_dir=cache_dir,
+            local_files_only=False  # Set to True if you want to check local cache only
+        )
+        print(f"Model '{model_name}' is available in cache at: {cached_path}")
+        return cached_path
+    except Exception as e:
+        print(f"Error downloading model '{model_name}': {e}")
+        return None
+def load_model(path,cache_dir=None):
+    model = transformers.AutoModelForCausalLM.from_pretrained(path,cache_dir=cache_dir,device_map='auto')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(path,cache_dir=cache_dir,device_map='auto')
+    return model,tokenizer
+def llm_run(model,tokenizer,genes,N):
+    generate = transformers.pipeline('text-generation',model=model, tokenizer=tokenizer,device_map='auto')
+    output = []
+    for i,gene in enumerate(genes):
+        out = generate([gene], min_new_tokens=4, max_new_tokens=4, do_sample=True, num_return_sequences=N)
+        output.append(out[0])
+        yield output
+    return output

loading.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import os
+import ujson as json
+import pygit2
+from phylogeny import compute_all_P, compute_sim_matrix
+from plotting import get_color, UNKNOWN_COLOR, DEFAULT_COLOR
+# ------------------------------------------------------------------------------------------------
+#
+#                                       Loading data
+#
+# ------------------------------------------------------------------------------------------------
+def load_data():
+    global UNKNOWN_COLOR, DEFAULT_COLOR, MODEL_SEARCHED_X
+    data, model_names,families = load_git()
+    if data is None:
+        return
+    #Rename families if needed
+    with open('family_table.json','r') as f:
+        rename_table = json.load(f)
+    for i in range(len(model_names)):
+        try:
+            families[i] = rename_table[model_names[i]]
+        except KeyError:
+            pass
+    all_P = compute_all_P(data, model_names)
+    sim_matrix = compute_sim_matrix(model_names, all_P)
+    k = list(all_P.keys())[0]
+    unknown_color = UNKNOWN_COLOR
+    unique_families = list(set([f for f in families]))
+    colors = {}
+    idx = 0
+    for i, family in enumerate(unique_families):
+        color = get_color(idx)
+        idx += 1
+        while color == unknown_color: # Avoid using the unknown color for a family
+            color = get_color(idx)
+            idx += 1
+        colors[family] = color
+    colors['?'] = unknown_color # Assign the unknown color to the unknown family
+    return data, model_names, families, sim_matrix, colors
+def load_git():
+    cred = pygit2.UserPass(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN'])
+    if os.path.exists('Data'):
+        repo = pygit2.Repository('Data')
+        remote = repo.remotes['origin']  # Use named reference instead of index
+        remote.fetch()
+        # Get the current branch name
+        branch_name = repo.head.shorthand
+        # Find the reference to the remote branch
+        remote_ref_name = f'refs/remotes/origin/{branch_name}'
+        # Merge the changes into the current branch
+        remote_commit = repo.lookup_reference(remote_ref_name).target
+    else:
+        repo = pygit2.clone_repository('https://github.com/PhyloLM/Data', './Data', bare=False, callbacks=GitHubRemoteCallbacks(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_TOKEN']))
+    data_array = []
+    model_names = []
+    families = []
+    for foname in os.listdir('Data/math'):
+        #check if it is a directory
+        if not os.path.isdir(os.path.join('Data/math',foname)):
+            continue
+        for fname in os.listdir('Data/math/'+foname):
+            if not fname.endswith('.json'):
+                continue
+            with open(os.path.join('Data/math',foname,fname),'r') as f:
+                d = json.load(f)
+                families.append(d['family'])
+                model_names.append(foname+'/'+fname[:-5])
+                data_array.append(d['alleles'])
+    if data_array == []:
+        return None,[],[]
+    return data_array,model_names,families
+# ------------------------------------------------------------------------------------------------
+#
+#                                       Git functions
+#
+# ------------------------------------------------------------------------------------------------
+class GitHubRemoteCallbacks(pygit2.RemoteCallbacks):
+    def __init__(self, username, token):
+        self.username = username
+        self.token = token
+        super().__init__()
+    def credentials(self, url, username_from_url, allowed_types):
+        return pygit2.UserPass(self.username, self.token)
+# ------------------------------------------------------------------------------------------------
+#
+#                                       Saving data
+#
+# ------------------------------------------------------------------------------------------------
+def save_git(alleles,genes,model,family):
+    repo = pygit2.Repository('Data')
+    remo = repo.remotes['origin']
+    d = {'family':family,'alleles':alleles}
+    model_name = model
+    data_path = f'math/{model_name}.json'
+    path = os.path.join('Data',data_path)
+    #create the file folder path
+    if not os.path.exists(os.path.dirname(path)):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+    #Open the file
+    with open(path,'w') as f:
+        json.dump(d,f)
+    repo.index.add(data_path)
+    repo.index.write()
+    reference='HEAD'
+    tree = repo.index.write_tree()
+    author = pygit2.Signature(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_MAIL'])
+    commiter = pygit2.Signature(os.environ['GITHUB_USERNAME'], os.environ['GITHUB_MAIL'])
+    oid = repo.create_commit(reference, author, commiter, f'Add data for model {model}', tree, [repo.head.target])
+    remo.push(['refs/heads/main'],callbacks=GitHubRemoteCallbacks(os.environ['GITHUB_USERNAME'],os.environ['GITHUB_TOKEN']))

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ graphviz
2	+ graphviz-dev

phylogeny.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import numpy as np
+from constants import UNKNOWN_COLOR, DEFAULT_COLOR, UNKNOWN_COLOR_RGB, DEFAULT_COLOR_RGB
+def compute_P(alleles):
+    '''Compute the population matrix P(allele|gene) from the [alleles] given in input'''
+    P = []
+    # Process each gene position
+    for gene_alleles in alleles:
+        # Use Counter for more efficient counting
+        unique_alleles, counts = np.unique(gene_alleles, return_counts=True)
+        # Create frequency dictionary directly
+        d = dict(zip(unique_alleles, counts / len(gene_alleles)))
+        P.append(d)
+    return P
+def compute_all_P(data, models):
+    '''Compute all population matrices from a given list of [models] on the data'''
+    all_P = {}
+    for mi, m in enumerate(models):
+        alleles = data[mi]
+        P = compute_P(alleles)
+        all_P[m] = P
+    return all_P
+def compute_sim_matrix(models,all_P):
+    '''Compute the entire similarity matrix in one go'''
+    n_models = len(models)
+    n_genes = len(all_P[models[0]])
+    # Initialize matrices to store numerator and denominator terms
+    total_numerator = np.zeros((n_models, n_models))
+    left_denominators = np.zeros(n_models)
+    right_denominators = np.zeros(n_models)
+    # Process each gene position
+    for k in range(n_genes):
+        # Collect all alleles for this gene position
+        all_alleles = set()
+        for m in models:
+            all_alleles.update(all_P[m][k].keys())
+        all_alleles = list(all_alleles)
+        # Create frequency vectors for each model
+        freq_matrix = np.zeros((n_models, len(all_alleles)))
+        for i, m in enumerate(models):
+            for j, allele in enumerate(all_alleles):
+                if allele in all_P[m][k]:
+                    freq_matrix[i, j] = all_P[m][k][allele]
+        # Update numerator: dot product of frequency vectors
+        total_numerator += np.dot(freq_matrix, freq_matrix.T)
+        # Update denominators: sum of squared frequencies
+        squared_sums = np.sum(freq_matrix**2, axis=1)
+        left_denominators += squared_sums
+        right_denominators += squared_sums
+    # Calculate final similarity matrix
+    denominator_matrix = np.sqrt(np.outer(left_denominators, right_denominators))
+    sim_matrix = total_numerator / denominator_matrix
+    return sim_matrix
+def prepare_tree(tree, model_names, origins, colors):
+    """Prepare and color the phylogenetic tree based on model families."""
+    # Remove inner node names and color leaf nodes
+    for clade in tree.find_clades():
+        if clade.name and (clade.name.startswith('Inner') or clade.name.startswith('Clade')):
+            #clade.name = None
+            pass
+        if clade.name == None or clade.name not in model_names:
+            clade.family = None
+            clade.flag = False
+            continue
+        # Color the clades if it is a leaf
+        index = model_names.index(clade.name)
+        clade.family = origins[index]
+        clade.flag = True
+    # Propagate colors up the tree when all children have the same color
+    all_clades = list(tree.find_clades())
+    clades = [clade for clade in all_clades if clade.flag is False]
+    # Iterate this process until there are no more clades to color
+    i = 0
+    while clades:
+        clade = clades[i % len(clades)]
+        children_families = [c.family for c in clade.clades]
+        children_families_set = set(children_families)
+        children_flags = [c.flag for c in clade.clades]
+        children_flags_set = set(children_flags)
+        if len(children_families_set) == 1: # If all children have the same color : this clade is locked with the same color
+            clade.family = children_families[0]
+            clade.flag = True
+            del clades[i % len(clades)]
+        elif len(children_families_set) == 2 and '?' in children_families_set: # If children have different colors and one is unknown : this clade is locked with the known color
+            clade.family = [f for f in children_families_set if f != '?'][0]
+            clade.flag = True
+            del clades[i % len(clades)]
+        elif len(children_flags_set) == 1: # If children have different colors : this clade is locked with no color
+            clade.flag = True
+            del clades[i % len(clades)]
+        elif clade.flag == True: #Sholdn't happen
+            del clades[i % len(clades)]
+        i += 1
+    #Set color associated with family to each clade
+    for clade in all_clades:
+        if clade.family is None:
+            clade.color = UNKNOWN_COLOR
+        else:
+            clade.color = colors[clade.family]

plotting.py ADDED Viewed

	@@ -0,0 +1,522 @@

+import networkx as nx
+import numpy as np
+from Bio.Phylo import to_networkx
+from networkx.drawing.nx_agraph import graphviz_layout
+import plotly.graph_objects as go
+import plotly.express as px
+from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceCalculator, _DistanceMatrix
+from tools import compute_ordered_matrix,compute_umap
+from phylogeny import prepare_tree
+from constants import UNKNOWN_COLOR, DEFAULT_COLOR, UNKNOWN_COLOR_RGB, DEFAULT_COLOR_RGB
+# ------------------------------------------------------------------------------------------------
+#
+#                                     Sim Matrix Plotting
+#
+# ------------------------------------------------------------------------------------------------
+def plot_sim_matrix_fig(ordered_sim_matrix,ordered_model_names,families,colors):
+    fig = px.imshow(
+        ordered_sim_matrix,
+        x=ordered_model_names,
+        y=ordered_model_names,
+        zmin=0, zmax=1,
+        color_continuous_scale='gray',
+    )
+    fig.update_layout(coloraxis_colorbar=dict(title='Similarity'),
+        margin=dict(l=0, r=0, t=0, b=0),
+        autosize=True,
+    )
+    fig.update_traces(
+        colorbar=dict(
+            thickness=20,
+            len=0.75,
+            xanchor="right",
+            x=1.02
+        )
+    )
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,constrain='range')
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,constrain='range')
+    #Create rectangles for highlighted models
+    rectX = go.layout.Shape(
+            type="rect",
+            xref="x", yref="y",
+            x0=0, y0=0,
+            x1=0, y1=0,
+            line=dict(color="red", width=1),
+            fillcolor="rgba(0,0,0,0)",
+            name='rectX',
+            opacity=0,
+        )
+    fig.add_shape(rectX)
+    rectY = go.layout.Shape(
+            type="rect",
+            xref="x", yref="y",
+            x0=0, y0=0,
+            x1=0, y1=0,
+            line=dict(color="red", width=1),
+            fillcolor="rgba(0,0,0,0)",
+            name='rectY',
+            opacity=0,
+        )
+    fig.add_shape(rectY)
+    return fig
+def update_sim_matrix_fig(fig, ordered_model_names, model_search_x=None, model_search_y=None):
+    if model_search_x in ordered_model_names:
+        idx_x = ordered_model_names.index(model_search_x)
+        fig.update_shapes(
+            selector=dict(name='rectX'),
+                x0=idx_x-0.5, y0=-0.5,
+                x1=idx_x+0.5, y1=len(ordered_model_names)-0.5,
+                opacity=0.7,
+            )
+    else:
+        fig.update_shapes(
+            selector=dict(name='rectX'),
+                opacity=0
+        )
+    if model_search_y in ordered_model_names:
+        idx_y = ordered_model_names.index(model_search_y)
+        fig.update_shapes(
+            selector=dict(name='rectY'),
+                x0=-0.5, y0=idx_y-0.5,
+                x1=len(ordered_model_names)-0.5, y1=idx_y+0.5,
+                opacity=0.7,
+            )
+    else:
+        fig.update_shapes(
+            selector=dict(name='rectY'),
+                opacity=0
+        )
+    return fig
+# ------------------------------------------------------------------------------------------------
+#
+#                                     2D UMAP Plotting
+#
+# ------------------------------------------------------------------------------------------------
+def alpha_scaling(val):
+    base = 0.35
+    return val**(1/(base+1/100))
+def plot_umap_fig(dist_matrix, sim_matrix, model_names, families, colors,key='fig2',alpha_edges=None, alpha_names=None, alpha_markers=None):
+    embedding = compute_umap(dist_matrix,d=2)
+    fig = go.Figure()
+    #-- EDGES
+    # Calculate edge transparencies based on similarity
+    edges = []
+    for i in range(len(model_names)):
+        for j in range(i+1, len(model_names)):  # Only process each pair once (i,j where i<j)
+            val = alpha_scaling(sim_matrix[i][j])
+            if val > 0.1:
+                edges.append((i, j, val, colors[families[i]]))
+    # Add all edges at once
+    for i, j, val, color in edges:
+        fig.add_trace(
+            go.Scatter(
+                x=[embedding[i,0], embedding[j,0]],
+                y=[embedding[i,1], embedding[j,1]],
+                mode='lines',
+                name='_edge',
+                line=dict(color=color, width=val),
+                opacity=alpha_edges,
+                showlegend=False,
+                hoverinfo='skip',
+            )
+        )
+    #-- NODES
+    marker_colors = [colors[f] for f in families]
+    fig.add_trace(
+        go.Scatter(
+            x=embedding[:,0],
+            y=embedding[:,1],
+            text=model_names,
+            mode='markers+text',
+            textposition='top center',
+            hoverinfo='text',
+            hoveron='points+fills',
+            showlegend=False,
+            name='_node',
+            marker=dict(
+                color=marker_colors,
+                size=8,
+                line_width=2,
+                opacity=alpha_markers,
+            ),
+            textfont=dict(
+                color=f'rgba(0,0,0,{alpha_names})',
+                size=8,
+                family="Arial Black",
+            )
+        )
+    )
+    #-- LEGEND
+    legends = []
+    for f in set(families):
+        legends.append(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode='markers',
+                marker=dict(
+                    color=colors[f],
+                    size=8,
+                    line_width=2,
+                    opacity=1
+                ),
+                name=f,
+            )
+        )
+    fig.add_traces(legends)
+    #Add highlighted node
+    node = go.Scatter(
+        x=[0],
+        y=[0],
+        mode='markers+text',
+        textposition='top center',
+        textfont=dict(color='red', size=16, family="Arial Black"),
+        marker=dict(
+            color='red',
+            size=12,
+            symbol='circle',
+            line=dict(color='red', width=3)
+        ),
+        showlegend=False,
+        name='node',
+        opacity=0,
+    )
+    fig.add_trace(node)
+    #Setup the layout
+    fig.update_layout(
+        margin=dict(l=0, r=0, t=0, b=0),
+        autosize=True,
+    )
+    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False,constrain='range')
+    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False,constrain='range')
+    return fig
+def update_umap_fig(fig, dist_matrix, model_names, families, colors, model_search_x=None, alpha_names=None, alpha_markers=None, alpha_edges=None, key='fig2'):
+    #Update nodes
+    fig.update_traces(
+        selector=dict(name='_node'),
+        textfont=dict(
+            color=f'rgba(0,0,0,{alpha_names})',
+        ),
+        marker=dict(
+            opacity=alpha_markers
+        ),
+    )
+    #Update edges
+    fig.update_traces(
+        selector=dict(mode='lines'),
+        line=dict(width=1),
+        opacity=alpha_edges
+    )
+    #Update highlighted node
+    if model_search_x in model_names:
+        searched_idx = model_names.index(model_search_x)
+        embedding = compute_umap(dist_matrix,d=2) #Cached computation
+        fig.update_traces(
+            selector=dict(name='node'),
+            x=[embedding[searched_idx,0]],
+            y=[embedding[searched_idx,1]],
+            text=[model_search_x],
+            marker=dict(
+                color=colors[families[searched_idx]],
+            ),
+            hovertext=model_search_x,
+            opacity=1
+        )
+    else:
+        fig.update_traces(
+            selector=dict(name='node'),
+            x=[0],
+            y=[0],
+            text=[''],
+            opacity=0
+        )
+    return fig
+# ------------------------------------------------------------------------------------------------
+#
+#                                     Phylogenetic Tree Plotting
+#
+# ------------------------------------------------------------------------------------------------
+def draw_graphviz(tree, label_func=str, prog='twopi', args='',
+                 node_size=15, edge_width=0.0, alpha_edges=None, alpha_names=None,alpha_markers=None, **kwargs):
+    #Display a tree or clade as a graph using Plotly, with layout from the graphviz engine.
+    global UNKNOWN_COLOR, DEFAULT_COLOR
+    # Convert the Bio.Phylo tree to a NetworkX graph
+    G = to_networkx(tree)
+    # Relabel nodes using integers while keeping original labels
+    Gi = nx.convert_node_labels_to_integers(G, label_attribute='label')
+    # Apply the Graphviz layout
+    pos = graphviz_layout(Gi, prog=prog, args=args)
+    # Prepare node labels for display
+    def get_label_mapping(G, selection):
+        for node, data in G.nodes(data=True):
+            if (selection is None) or (node in selection):
+                try:
+                    label = label_func(data.get('label', node))
+                    if label not in (None, node.__class__.__name__):
+                        yield (node, label)
+                except (LookupError, AttributeError, ValueError):
+                    pass
+    # Extract labels
+    labels = dict(get_label_mapping(Gi, None))
+    nodelist = list(labels.keys())
+    # Collect node colors and create edge traces
+    edge_traces = []
+    node_traces_by_family = {}
+    node_colors = {}
+    node_families = {}
+    # Track if we find the searched model and its position
+    searched_model_node = None
+    searched_model_pos = None
+    default_color = (0,0,0)
+    # Get colors and families for all nodes
+    for node in Gi.nodes():
+        node_data = Gi.nodes[node].get('label')
+        if hasattr(node_data, 'color'):
+            node_colors[node] = node_data.color.to_rgb() if not(node_data.color is None) else default_color
+        else:
+            node_colors[node] = default_color
+        node_colors[node] = f'rgb({node_colors[node][0]},{node_colors[node][1]},{node_colors[node][2]})'
+        if hasattr(node_data, 'family'):
+            node_families[node] = node_data.family
+        else:
+            node_families[node] = None
+    # Create edge traces
+    for edge in Gi.edges():
+        x0, y0 = pos[edge[0]]
+        x1, y1 = pos[edge[1]]
+        # Use the child node's color for the edge if available
+        edge_color = node_colors[edge[1]]
+        if list(edge_color) == list(UNKNOWN_COLOR_RGB): # Use the parent node's color for edge's color except if it's an unknown nodes
+            edge_color = tuple(DEFAULT_COLOR_RGB)
+        #edge_color = f'rgb({edge_color[0]},{edge_color[1]},{edge_color[2]})'
+        edge_trace = go.Scatter(
+            x=[x0, x1, None],
+            y=[y0, y1, None],
+            line=dict(width=edge_width, color=edge_color),
+            hoverinfo='none',
+            mode='lines',
+            showlegend=False,
+            name='_edge',
+            opacity=alpha_edges,
+        )
+        edge_traces.append(edge_trace)
+    # Create node traces
+    node_traces = []
+    for node in nodelist:
+        x,y = pos[node]
+        text = labels.get(node, None)
+        color = node_colors.get(node, None)
+        node_trace = go.Scatter(
+            x=[x],
+            y=[y],
+            text=text,
+            mode='markers+text',
+            textposition='top center',
+            hoverinfo='text',
+            showlegend=False,
+            name='_node',
+            marker=dict(
+                color=color,
+                size=node_size,
+                line_width=2,
+                opacity=alpha_markers,
+            ),
+            textfont=dict(
+                color=f'rgba(0,0,0,{alpha_names})',
+                size=8,
+                family="Arial Black",
+            )
+        )
+        node_traces.append(node_trace)
+    # Get color dict
+    colors = {}
+    families = []
+    for node in node_families.keys():
+        family = node_families[node]
+        if family is not None:
+            families.append(family)
+            colors[family] = node_colors.get(node, DEFAULT_COLOR)
+        else:
+            colors[family] = DEFAULT_COLOR
+    families = set(families)
+    #Custom legend
+    legends = []
+    for f in families:
+        legends.append(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode='markers',
+                marker=dict(
+                    color=colors[f],
+                    size=8,
+                    line_width=2,
+                    opacity=1
+                ),
+                name=f,
+            )
+        )
+    # Create the figure
+    fig = go.Figure(
+        data=edge_traces + node_traces,
+        layout=go.Layout(
+            showlegend=True,
+            hovermode='closest',
+            margin=dict(b=1, l=1, r=1, t=1),
+            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            legend=dict(
+                yanchor="top",
+                y=0.99,
+                xanchor="right",
+                x=0.99
+            )
+        )
+    )
+    fig.add_traces(legends)
+    return fig
+def get_color(index):
+    """Get a color from plotly's qualitative color palette."""
+    colors = px.colors.qualitative.Plotly
+    return colors[index % len(colors)]
+def plot_tree(sim_matrix, models, families,colors, alpha_names=None, alpha_markers=None, alpha_edges=None):
+    """
+    Plot a phylogenetic tree based on a similarity matrix.
+    Parameters:
+    - sim_matrix: similarity matrix between models
+    - models: list of model names
+    - families: list of family names for each model
+    Returns:
+    - fig: Plotly figure object with the phylogenetic tree
+    """
+    # Create color mapping for families
+    # Prepare the distance matrix
+    dist_matrix = -np.log(np.maximum(sim_matrix, 1e-10))  # Avoid log(0)
+    # Prepare the data for Bio.Phylo
+    low_triangle_kl_mean = [[dist_matrix[i][j] for j in range(i+1)] for i in range(len(dist_matrix))]
+    df = _DistanceMatrix(names=models, matrix=low_triangle_kl_mean)
+    # Setup Bio.Phylo
+    calculator = DistanceCalculator('identity')
+    constructor = DistanceTreeConstructor(calculator, 'nj')
+    # Build the tree
+    NJTree = constructor.nj(df)
+    NJTree.ladderize(reverse=False)
+    # Color the tree
+    prepare_tree(NJTree, models, families, colors)
+    # Generate the plotly figure
+    fig = draw_graphviz(NJTree, node_size=15, edge_width=6,alpha_names=alpha_names, alpha_markers=alpha_markers, alpha_edges=alpha_edges)
+    return fig
+def update_tree_fig(fig, model_names, model_search=None,alpha_names=None, alpha_markers=None, alpha_edges=None):
+    #Update nodes
+    fig.update_traces(
+        selector=dict(name='_node'),
+        marker=dict(
+            opacity=alpha_markers,
+        ),
+        textfont=dict(
+            color=f'rgba(0,0,0,{alpha_names})',
+        )
+    )
+    # Update edges
+    fig.update_traces(
+        selector=dict(name='_edge'),
+        opacity=alpha_edges,
+    )
+    for d in fig.data:
+        if d.name in ['_node','node']:
+            if d.text == 'mistralai/Mistral-7B-Instruct-v0.1':
+                print(d)
+    # Update highlighted node
+    fig.update_traces(
+            selector=dict(name='node'),
+            marker=dict(
+                size=15,  # Bigger than normal nodes
+                line=None  # Red border
+            ),
+            textfont=dict(
+                color=f'rgba(0,0,0,{alpha_names})', size=16, family="Arial Black",
+            ),
+            name='_node'
+        )
+    if model_search in model_names:
+        fig.update_traces(
+            selector=dict(name='_node',text=model_search),
+            marker=dict(
+                size=22,  # Bigger than normal nodes
+                line=dict(color='red', width=4)  # Red border
+            ),
+            textfont=dict(
+                color='red', size=16, family="Arial Black",
+            ),
+            name='node'
+        )
+        for d in fig.data:
+            if d.name in ['_node','node']:
+                if d.text == 'mistralai/Mistral-7B-Instruct-v0.1':
+                    print(d)
+    else:
+        pass
+    return fig

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+matplotlib
+networkx
+numpy==1.23.0
+biopython
+plotly
+scikit-learn
+streamlit
+transformers
+torch
+pygit2
+fastcluster
+pygraphviz
+accelerate
+umap-learn
+ujson

tools.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import numpy as np
+from scipy.spatial.distance import squareform
+from fastcluster import linkage
+import umap
+# ------------------------------------------------------------------------------------------------
+#
+#                                     Sim Matrix Ordering
+#
+# ------------------------------------------------------------------------------------------------
+def seriation(Z,N,cur_index):
+    '''
+        input:
+            - Z is a hierarchical tree (dendrogram)
+            - N is the number of points given to the clustering process
+            - cur_index is the position in the tree for the recursive traversal
+        output:
+            - order implied by the hierarchical tree Z
+        seriation computes the order implied by a hierarchical tree (dendrogram)
+    '''
+    if cur_index < N:
+        return [cur_index]
+    else:
+        left = int(Z[cur_index-N,0])
+        right = int(Z[cur_index-N,1])
+        return (seriation(Z,N,left) + seriation(Z,N,right))
+def compute_serial_matrix(dist_mat,method="ward"):
+    '''
+        input:
+            - dist_mat is a distance matrix
+            - method = ["ward","single","average","complete"]
+        output:
+            - seriated_dist is the input dist_mat,
+              but with re-ordered rows and columns
+              according to the seriation, i.e. the
+              order implied by the hierarchical tree
+            - res_order is the order implied by
+              the hierarhical tree
+            - res_linkage is the hierarhical tree (dendrogram)
+        compute_serial_matrix transforms a distance matrix into
+        a sorted distance matrix according to the order implied
+        by the hierarchical tree (dendrogram)
+    '''
+    N = len(dist_mat)
+    flat_dist_mat = squareform(dist_mat)
+    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
+    res_order = seriation(res_linkage, N, N + N-2)
+    seriated_dist = np.zeros((N,N))
+    a,b = np.triu_indices(N,k=1)
+    seriated_dist[a,b] = dist_mat[ [res_order[i] for i in a], [res_order[j] for j in b]]
+    seriated_dist[b,a] = seriated_dist[a,b]
+    return seriated_dist, res_order, res_linkage
+def compute_ordered_matrix(sim_matrix,dist_matrix, model_names):
+    if len(sim_matrix) >= 2:
+        # Compute serial matrix (hierarchical clustering) for tab1
+        ordered_dist_matrix, order, Z = compute_serial_matrix(dist_matrix)
+        ordered_sim_matrix = sim_matrix[order][:, order]
+        ordered_model_names = [model_names[i] for i in order]
+    else:
+        ordered_sim_matrix = sim_matrix
+        ordered_model_names = model_names
+    return ordered_sim_matrix, ordered_model_names
+# ------------------------------------------------------------------------------------------------
+#
+#                                     UMAP computation
+#
+# ------------------------------------------------------------------------------------------------
+def compute_umap(dist_matrix,d=2):
+    embedding = umap.UMAP(densmap=True,n_components=d, metric='precomputed',random_state=42).fit_transform(dist_matrix)
+    return embedding