import re
import threading

import gradio as gr
import spaces
import transformers
from transformers import pipeline

# Loading model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"

if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
    )

# Marker for detecting the final answer
ANSWER_MARKER = "**Answer**"

# Sentences to start step-by-step reasoning
rethink_prepends = [
    "Now, I need to understand the following ",
    "In my opinion ",
    "Let me verify if the following is correct ",
    "Also, I should remember that ",
    "Another point to note is ",
    "And I also remember the following fact ",
    "Now I think I understand sufficiently ",
]

# Prompt addition for generating the final answer
final_answer_prompt = """
Based on my reasoning process so far, I will answer the original question in the language it was asked:
{question}

Here is the conclusion I've reasoned:
{reasoning_conclusion}

Based on the above reasoning, my final answer:
{ANSWER_MARKER}
"""

# Settings for displaying formulas
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]


def reformat_math(text):
    """Modify MathJax delimiters to use Gradio (KaTeX) syntax.

    This is a temporary fix for displaying math formulas in Gradio. Currently,
    I haven't found a way to make it work as expected with other
    latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text


def user_input(message, history_original, history_thinking):
    """Add user input to both histories and clear the input text box"""
    return (
        "",
        history_original
        + [gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))],
        history_thinking
        + [gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))],
    )


def rebuild_messages(history: list):
    """Reconstruct messages from history for model use, without the intermediate thinking process"""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title", None) is None
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages


@spaces.GPU
def bot_original(
    history: list,
    max_num_tokens: int,
    do_sample: bool,
    temperature: float,
):
    """Make the original model answer the question (without a reasoning process)"""

    # For streaming tokens from the generation thread later
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # Prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
        )
    )

    # Messages to be displayed in the current chat
    messages = rebuild_messages(history[:-1])  # Excluding the last empty message

    # The original model answers directly, without a reasoning process
    t = threading.Thread(
        target=pipe,
        args=(messages,),
        kwargs=dict(
            max_new_tokens=max_num_tokens,
            streamer=streamer,
            do_sample=do_sample,
            temperature=temperature,
        ),
    )
    t.start()

    for token in streamer:
        history[-1].content += token
        history[-1].content = reformat_math(history[-1].content)
        yield history
    t.join()

    yield history


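# bot_thinking drives the staged reasoning loop: each prefix in rethink_prepends is
# appended to the conversation in turn, the model continues from that prefix while its
# tokens stream into the "Thinking..." bubble, and the tail of the accumulated reasoning
# (reasoning_conclusion) is injected into final_answer_prompt to produce the final answer.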
@spaces.GPU
def bot_thinking(
    history: list,
    max_num_tokens: int,
    final_num_tokens: int,
    do_sample: bool,
    temperature: float,
):
    """Make the model answer questions with a reasoning process"""

    # For streaming tokens from the generation thread later
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # For reinserting the question into the reasoning if needed
    question = history[-1]["content"]

    # Prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
            metadata={"title": "🧠 Thinking...", "status": "pending"},
        )
    )

    # Reasoning process to be displayed in the current chat
    messages = rebuild_messages(history)

    # Variable to store the entire reasoning process
    full_reasoning = ""

    # Run reasoning steps
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        messages[-1]["content"] += prepend.format(question=question)

        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=max_num_tokens,
                streamer=streamer,
                do_sample=do_sample,
                temperature=temperature,
            ),
        )
        t.start()

        # Reconstruct history with the new content
        history[-1].content += prepend.format(question=question)
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            yield history
        t.join()

        # Save the result of each reasoning step to full_reasoning
        full_reasoning = history[-1].content

    # Reasoning complete, now generate the final answer
    history[-1].metadata = {"title": "💭 Thought Process", "status": "done"}

    # Extract the conclusion part from the reasoning process (approximately the last 1-2 paragraphs)
    reasoning_parts = full_reasoning.split("\n\n")
    reasoning_conclusion = (
        "\n\n".join(reasoning_parts[-2:]) if len(reasoning_parts) > 2 else full_reasoning
    )

    # Add the final answer message
    history.append(gr.ChatMessage(role="assistant", content=""))

    # Construct the messages for the final answer
    final_messages = rebuild_messages(history[:-1])  # Excluding the last empty message
    final_prompt = final_answer_prompt.format(
        question=question,
        reasoning_conclusion=reasoning_conclusion,
        ANSWER_MARKER=ANSWER_MARKER,
    )
    final_messages[-1]["content"] += final_prompt

    # Generate the final answer
    t = threading.Thread(
        target=pipe,
        args=(final_messages,),
        kwargs=dict(
            max_new_tokens=final_num_tokens,
            streamer=streamer,
            do_sample=do_sample,
            temperature=temperature,
        ),
    )
    t.start()

    # Stream the final answer
    for token in streamer:
        history[-1].content += token
        history[-1].content = reformat_math(history[-1].content)
        yield history
    t.join()

    yield history


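# Gradio UI: two Chatbot panes are rendered side by side so the plain model output
# ("Before") can be compared with the reasoning-augmented output ("After"). A single
# textbox feeds both panes through the msg.submit() chain wired up at the bottom of
# the Blocks context.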
with gr.Blocks(fill_height=True, title="ThinkFlow") as demo:
    # Title and description
    gr.Markdown("# ThinkFlow")
    gr.Markdown(
        "### A platform that automatically adds step-by-step reasoning to LLMs without modifying the underlying model"
    )

    # Features and benefits section
    with gr.Accordion("✨ Features & Benefits", open=True):
        gr.Markdown("""
- **Enhanced Reasoning**: Transform any LLM into a step-by-step reasoning engine without model modifications
- **Transparency**: Visualize the model's thought process alongside direct answers
- **Improved Accuracy**: See how guided reasoning leads to more accurate solutions for complex problems
- **Educational Tool**: Perfect for teaching critical thinking and problem-solving approaches
- **Versatile Application**: Works with mathematical problems, logical puzzles, and complex questions
- **Side-by-Side Comparison**: Compare standard model responses with reasoning-enhanced outputs
""")

    with gr.Row(scale=1):
        with gr.Column(scale=2):
            gr.Markdown("## Before (Original)")
            chatbot_original = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
                label="Original Model (No Reasoning)",
            )

        with gr.Column(scale=2):
            gr.Markdown("## After (Thinking)")
            chatbot_thinking = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
                label="Model with Reasoning",
            )

    with gr.Row():
        # Define the msg textbox first
        msg = gr.Textbox(
            submit_btn=True,
            label="",
            show_label=False,
            placeholder="Enter your question here.",
            autofocus=True,
        )

    # Examples section - placed after the msg variable is defined
    with gr.Accordion("EXAMPLES", open=False):
        examples = gr.Examples(
            examples=[
                "[Source: MATH-500] How many numbers among the first 100 positive integers are divisible by 3, 4, and 5?",
                "[Source: MATH-500] In the land of Ink, the money system is unique. 1 trinket equals 4 blinkets, and 3 blinkets equal 7 drinkits. What is the value of 56 drinkits in trinkets?",
                "[Source: MATH-500] The average age of Amy, Ben, and Chris is 6 years. Four years ago, Chris was the same age as Amy is now. Four years from now, Ben's age will be $\\frac{3}{5}$ of Amy's age at that time. How old is Chris now?",
                "[Source: MATH-500] A bag contains yellow and blue marbles. Currently, the ratio of blue marbles to yellow marbles is 4:3. After adding 5 blue marbles and removing 3 yellow marbles, the ratio becomes 7:3. How many blue marbles were in the bag before any were added?",
            ],
            inputs=msg,
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Parameter Adjustment")
            num_tokens = gr.Slider(
                50,
                4000,
                2000,
                step=1,
                label="Maximum tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                4000,
                2000,
                step=1,
                label="Maximum tokens for final answer",
                interactive=True,
            )
            do_sample = gr.Checkbox(True, label="Use sampling")
            temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")

    # Community link at the bottom
    gr.Markdown("Community: https://discord.gg/openfreeai")

    # When the user submits a message, both bots respond simultaneously
    msg.submit(
        user_input,
        [msg, chatbot_original, chatbot_thinking],  # inputs
        [msg, chatbot_original, chatbot_thinking],  # outputs
    ).then(
        bot_original,
        [
            chatbot_original,
            num_tokens,
            do_sample,
            temperature,
        ],
        chatbot_original,  # save the new history in the outputs
    ).then(
        bot_thinking,
        [
            chatbot_thinking,
            num_tokens,
            final_num_tokens,
            do_sample,
            temperature,
        ],
        chatbot_thinking,  # save the new history in the outputs
    )

if __name__ == "__main__":
    demo.queue().launch()