Arabic-Wiki-RAG / app.py
Omartificial-Intelligence-Space's picture
update app.py
802e961 verified
raw
history blame
3.98 kB
import gradio as gr
from sentence_transformers import SentenceTransformer
from wikipediaapi import Wikipedia
import textwrap
import numpy as np
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
def process_query(wiki_page, embed_dim, query, api_key=None, mode="OpenAI"):
    """Answer an Arabic query with RAG over one Arabic-Wikipedia page.

    For each embedding model in ``model_mapping``, embed the page's
    paragraphs, retrieve the top-3 most similar ones, build a prompt, and
    ask either OpenAI GPT-4 or a local open-source LLM to answer.

    Parameters:
        wiki_page: Title of the Arabic Wikipedia page to retrieve.
        embed_dim: Matryoshka truncation dimension for the embeddings.
        query: The user's question (in Arabic).
        api_key: OpenAI API key; only used when ``mode == "OpenAI"``.
        mode: ``"OpenAI"`` or ``"OpenSource"``.

    Returns:
        A string with one "Model: .../Response: ..." section per
        embedding model.
    """
    model_mapping = {
        "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
        "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
        "Arabert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka",
        "Arabic-labse-Matryoshka": "Omartificial-Intelligence-Space/Arabic-labse-Matryoshka",
        "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
    }

    wiki = Wikipedia('RAGBot/0.0', 'ar')
    doc = wiki.page(wiki_page).text
    # Chunk on blank lines; drop empty chunks so they are never embedded.
    paragraphs = [p for p in doc.split('\n\n') if p.strip()]
    if not paragraphs:
        # Page missing or empty — fail gracefully instead of crashing later.
        return f"No text found for Wikipedia page: {wiki_page}"

    generator = None
    if mode == "OpenSource":
        # Load the open-source LLM ONCE, not once per embedding model.
        # NOTE: the original referenced "google/gemini-2b", which does not
        # exist on the Hugging Face Hub; the intended model is gemma-2b.
        tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
        llm = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
        generator = pipeline('text-generation', model=llm, tokenizer=tokenizer)

    responses = {}
    for model_name, model_path in model_mapping.items():
        model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)
        docs_embed = model.encode(paragraphs, normalize_embeddings=True)
        query_embed = model.encode(query, normalize_embeddings=True)
        # Embeddings are normalized, so the dot product is cosine similarity.
        similarities = np.dot(docs_embed, query_embed.T)
        # Top-k (k <= 3) indices, most similar first; handles short pages.
        top_k = min(3, len(paragraphs))
        top_idx = np.argsort(similarities, axis=0)[-top_k:][::-1].tolist()
        most_similar_documents = [paragraphs[idx] for idx in top_idx]

        CONTEXT = ""
        for p in most_similar_documents:
            wrapped_text = textwrap.fill(p, width=100)
            CONTEXT += wrapped_text + "\n\n"

        prompt = f"""
use the following CONTEXT to answer the QUESTION at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
CONTEXT: {CONTEXT}
QUESTION: {query}
"""
        if mode == "OpenAI":
            client = OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "user", "content": prompt},
                ]
            )
            responses[model_name] = response.choices[0].message.content
        elif mode == "OpenSource":
            response = generator(prompt, max_length=512, num_return_sequences=1)
            responses[model_name] = response[0]['generated_text']

    return "\n\n".join([f"Model: {model_name}\nResponse: {response}" for model_name, response in responses.items()])
# --- UI components ---------------------------------------------------------
# Text inputs for the Wikipedia page title and the user's question.
wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
query_input = gr.Textbox(label="Query (in Arabic)")

# Hidden by default; revealed only when the "OpenAI" mode is selected.
api_key_input = gr.Textbox(label="OpenAI API Key", type="password", visible=False)

# Matryoshka embedding truncation dimensions offered to the user.
embed_dim_choice = gr.Dropdown(choices=[768, 512, 256, 128, 64], label="Embedding Dimension")

# Backend selector: hosted GPT-4 vs. a local open-source model.
mode_choice = gr.Radio(choices=["OpenAI", "OpenSource"], label="Choose Mode")

# Aggregated per-model answers are rendered here.
output_text = gr.Textbox(label="Output")
def on_mode_change(mode):
    """Toggle visibility of the API-key field based on the selected mode.

    Mutating ``api_key_input.visible`` (as the original did) has no effect
    on an already-rendered Gradio UI, and the handler returned ``None``
    despite being wired with ``outputs=api_key_input``. Returning a
    ``gr.update`` is the supported way to change component properties
    from an event handler.
    """
    return gr.update(visible=(mode == "OpenAI"))

mode_choice.change(on_mode_change, inputs=mode_choice, outputs=api_key_input)
# Assemble and launch the app. Input order must match process_query's
# signature: (wiki_page, embed_dim, query, api_key, mode).
demo = gr.Interface(
    fn=process_query,
    inputs=[wiki_page_input, embed_dim_choice, query_input, api_key_input, mode_choice],
    outputs=output_text,
    title="Arabic Wiki RAG",
    description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic.",
)
demo.launch()