Spaces:

Omartificial-Intelligence-Space
/

Arabic-Semantic-Embedding-Suite

Running

App Files Files Community

Omartificial-Intelligence-Space committed on Jun 5

Commit

e01888b

verified ·

1 Parent(s): a2c9fec

Upload 2 files

Browse files

Files changed (2) hide show

qwen_embedding_app.py +1014 -0
requirements.txt +10 -0

qwen_embedding_app.py ADDED Viewed

	@@ -0,0 +1,1014 @@

+import gradio as gr
+import torch
+import torch.nn.functional as F
+import numpy as np
+import plotly.express as px
+import pandas as pd
+import spaces
+from typing import List, Tuple
+from torch import Tensor
+from transformers import AutoTokenizer, AutoModel
+# Check for GPU support and configure appropriately
+device = "cuda" if torch.cuda.is_available() else "cpu"
+zero = torch.Tensor([0]).to(device)
+print(f"Device being used: {zero.device}")
+def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+def get_detailed_instruct(task_description: str, query: str) -> str:
+    return f'Instruct: {task_description}\nQuery: {query}'
+def tokenize(tokenizer, input_texts, eod_id, max_length):
+    batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
+    for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
+        seq.append(eod_id)
+        att.append(1)
+    batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
+    return batch_dict
+class QwenEmbedder:
+    def __init__(self, embedding_dim=768):
+        self.tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
+        self.model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
+        # Uncomment below for better performance if GPU available
+        # self.model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B',
+        #     attn_implementation="flash_attention_2",
+        #     torch_dtype=torch.float16
+        # ).cuda()
+        self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
+        self.max_length = 8192
+        self.embedding_dim = embedding_dim
+        self.projection = torch.nn.Linear(768, embedding_dim) if embedding_dim != 768 else None
+    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> Tensor:
+        if with_instruction:
+            task = 'Process and understand the following text'
+            texts = [get_detailed_instruct(task, text) for text in texts]
+        batch_dict = tokenize(self.tokenizer, texts, self.eod_id, self.max_length)
+        batch_dict.to(self.model.device)
+        with torch.no_grad():
+            outputs = self.model(**batch_dict)
+            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+            # Project to desired dimension if needed
+            if self.projection is not None:
+                embeddings = self.projection(embeddings)
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+        return embeddings
+def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str) -> float:
+    embeddings = embedder.get_embeddings([text1, text2])
+    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
+    return round(similarity, 3)
+def rerank_documents(embedder: QwenEmbedder, query: str, documents: str) -> List[Tuple[str, float]]:
+    docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
+    # Add instruction to query
+    task = 'Given a search query, retrieve relevant passages that answer the query'
+    query_with_instruct = get_detailed_instruct(task, query)
+    # Get embeddings
+    query_embedding = embedder.get_embeddings([query_with_instruct])
+    doc_embeddings = embedder.get_embeddings(docs_list)
+    # Calculate similarities
+    scores = (query_embedding @ doc_embeddings.T).squeeze(0)
+    results = [(doc, float(score)) for doc, score in zip(docs_list, scores)]
+    results.sort(key=lambda x: x[1], reverse=True)
+    return [(doc, round(score, 3)) for doc, score in results]
+def process_batch_embeddings(embedder: QwenEmbedder, texts: str) -> pd.DataFrame:
+    text_list = [text.strip() for text in texts.split('\n') if text.strip()]
+    if len(text_list) < 1:
+        return pd.DataFrame()
+    embeddings = embedder.get_embeddings(text_list)
+    scores = (embeddings @ embeddings.T).cpu().numpy()
+    # Create similarity matrix DataFrame
+    df_similarities = pd.DataFrame(
+        scores,
+        index=text_list,
+        columns=text_list
+    )
+    return df_similarities.round(3)
+def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str) -> pd.DataFrame:
+    # Process queries and documents
+    query_list = [q.strip() for q in queries.split('\n') if q.strip()]
+    doc_list = [d.strip() for d in documents.split('\n') if d.strip()]
+    if not query_list or not doc_list:
+        return pd.DataFrame()
+    # Add instruction to queries
+    instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list]
+    # Get embeddings for both queries and documents
+    query_embeddings = embedder.get_embeddings(instructed_queries)
+    doc_embeddings = embedder.get_embeddings(doc_list)
+    # Calculate similarity scores
+    scores = (query_embeddings @ doc_embeddings.T).cpu().numpy()
+    # Create DataFrame with results
+    df = pd.DataFrame(scores, index=query_list, columns=doc_list)
+    return df.round(3)
+def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str) -> dict:
+    texts = [arabic_text, english_text]
+    embeddings = embedder.get_embeddings(texts)
+    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
+    return {"similarity": round(similarity, 3)}
+def classify_text(embedder: QwenEmbedder, text: str, categories: str) -> List[Tuple[str, float]]:
+    cat_list = [c.strip() for c in categories.split('\n') if c.strip()]
+    text_embedding = embedder.get_embeddings([text])
+    cat_embeddings = embedder.get_embeddings(cat_list)
+    scores = (text_embedding @ cat_embeddings.T).squeeze(0)
+    results = [(cat, float(score)) for cat, score in zip(cat_list, scores)]
+    results.sort(key=lambda x: x[1], reverse=True)
+    return [(cat, round(score, 3)) for cat, score in results]
+def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int) -> pd.DataFrame:
+    from sklearn.cluster import KMeans
+    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
+    if len(doc_list) < num_clusters:
+        return pd.DataFrame()
+    embeddings = embedder.get_embeddings(doc_list)
+    # Perform clustering
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    clusters = kmeans.fit_predict(embeddings.cpu().numpy())
+    # Calculate center document for each cluster
+    cluster_centers = kmeans.cluster_centers_
+    cluster_center_docs = []
+    for i in range(num_clusters):
+        cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i]
+        cluster_embeddings = embedder.get_embeddings(cluster_docs)
+        center_embedding = torch.tensor(cluster_centers[i]).unsqueeze(0)
+        similarities = F.cosine_similarity(cluster_embeddings, center_embedding)
+        center_doc = cluster_docs[similarities.argmax().item()]
+        cluster_center_docs.append(center_doc)
+    # Create results DataFrame
+    df = pd.DataFrame({
+        'Document': doc_list,
+        'Cluster': clusters,
+        'Cluster Center Document': [cluster_center_docs[c] for c in clusters]
+    })
+    return df.sort_values('Cluster')
+def analyze_sentiment(embedder: QwenEmbedder, text: str) -> Tuple[str, dict]:
+    # Define sentiment anchors
+    anchors = {
+        "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",
+        "positive": "هذا جيد وممتع",
+        "neutral": "هذا عادي ومقبول",
+        "negative": "هذا سيء ومزعج",
+        "very_negative": "هذا فظيع جداً ومحبط للغاية"
+    }
+    # Get embeddings
+    text_embedding = embedder.get_embeddings([text])
+    anchor_embeddings = embedder.get_embeddings(list(anchors.values()))
+    # Calculate similarities
+    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
+    results = list(zip(anchors.keys(), scores.tolist()))
+    results.sort(key=lambda x: x[1], reverse=True)
+    # Return tuple of (sentiment, scores_dict)
+    return (
+        results[0][0],
+        {k: round(float(v), 3) for k, v in results}
+    )
+def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str) -> List[Tuple[str, float]]:
+    # Define concept anchors based on type
+    concept_anchors = {
+        "emotions": [
+            "الفرح والسعادة",
+            "الحزن والأسى",
+            "الغضب والإحباط",
+            "الخوف والقلق",
+            "الحب والعاطفة",
+            "الأمل والتفاؤل"
+        ],
+        "topics": [
+            "السياسة والحكم",
+            "الاقتصاد والمال",
+            "العلوم والتكنولوجيا",
+            "الفن والثقافة",
+            "الرياضة والترفيه",
+            "التعليم والمعرفة"
+        ],
+        "themes": [
+            "العدالة والمساواة",
+            "التقدم والتطور",
+            "التقاليد والتراث",
+            "الحرية والاستقلال",
+            "التعاون والوحدة",
+            "الإبداع والابتكار"
+        ]
+    }
+    anchors = concept_anchors.get(concept_type, concept_anchors["topics"])
+    # Get embeddings
+    text_embedding = embedder.get_embeddings([text])
+    anchor_embeddings = embedder.get_embeddings(anchors)
+    # Calculate similarities
+    scores = (text_embedding @ anchor_embeddings.T).squeeze(0)
+    results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)]
+    results.sort(key=lambda x: x[1], reverse=True)
+    return [(concept, round(score, 3)) for concept, score in results]
+# Add a function to reinitialize embedder with new dimension
+def reinitialize_embedder(dim: int) -> QwenEmbedder:
+    global embedder
+    embedder = QwenEmbedder(embedding_dim=dim)
+    return "Embedder reinitialized with dimension: " + str(dim)
+# Build the module-level embedder with the default dimension. This runs at
+# import time and loads the tokenizer and model.
+embedder = QwenEmbedder()
+# Stylesheet for the Gradio UI. It defines the colour variables and the
+# layout/card classes (sidebar, feature grid, feature summary, description,
+# example, warning, settings, tab content, heading) plus the button colours.
+custom_css = """
+:root {
+    --primary-color: #2196F3;
+    --secondary-color: #1976D2;
+    --background-color: #f8f9fa;
+    --sidebar-bg: #ffffff;
+    --text-color: #333333;
+    --border-color: #e0e0e0;
+}
+.container {
+    max-width: 1200px;
+    margin: auto;
+    padding: 20px;
+}
+.sidebar {
+    background-color: var(--sidebar-bg);
+    border-right: 1px solid var(--border-color);
+    padding: 20px;
+    margin-right: 20px;
+    position: sticky;
+    top: 0;
+    height: 100vh;
+    overflow-y: auto;
+}
+.main-content {
+    background-color: var(--background-color);
+    padding: 20px;
+    border-radius: 10px;
+}
+.features-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 20px;
+    margin: 20px 0;
+}
+.feature-card {
+    background: white;
+    padding: 20px;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    transition: all 0.3s ease;
+    border: 1px solid var(--border-color);
+}
+.feature-card:hover {
+    transform: translateY(-5px);
+    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
+    border-color: var(--primary-color);
+}
+.feature-icon {
+    font-size: 28px;
+    margin-bottom: 15px;
+    color: var(--primary-color);
+}
+.feature-card h3 {
+    color: var(--text-color);
+    margin: 10px 0;
+    font-size: 1.1em;
+}
+.feature-card p {
+    color: #666;
+    font-size: 0.9em;
+    line-height: 1.4;
+}
+.features-summary {
+    margin: 40px 0;
+    padding: 30px;
+    background: white;
+    border-radius: 12px;
+    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+}
+.features-summary h2 {
+    color: var(--text-color);
+    margin-bottom: 25px;
+    text-align: center;
+    font-size: 1.5em;
+}
+.feature-list {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
+    gap: 30px;
+}
+.feature-group {
+    padding: 20px;
+    background: var(--background-color);
+    border-radius: 8px;
+    border: 1px solid var(--border-color);
+}
+.feature-group h3 {
+    color: var(--primary-color);
+    margin-bottom: 15px;
+    font-size: 1.2em;
+}
+.feature-group ul {
+    list-style: none;
+    padding: 0;
+    margin: 0;
+}
+.feature-group li {
+    padding: 8px 0;
+    color: var(--text-color);
+    position: relative;
+    padding-left: 20px;
+}
+.feature-group li:before {
+    content: "•";
+    color: var(--primary-color);
+    position: absolute;
+    left: 0;
+}
+.description {
+    margin: 20px 0;
+    padding: 15px;
+    border-radius: 8px;
+    background-color: #ffffff;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+}
+.example {
+    margin: 10px 0;
+    padding: 15px;
+    border-left: 4px solid var(--primary-color);
+    background-color: #ffffff;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+}
+.warning {
+    color: #721c24;
+    background-color: #f8d7da;
+    border: 1px solid #f5c6cb;
+    padding: 15px;
+    border-radius: 8px;
+    margin: 10px 0;
+}
+.settings {
+    background-color: #ffffff;
+    padding: 20px;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    margin: 20px 0;
+}
+.tab-content {
+    padding: 20px;
+    background-color: #ffffff;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+}
+.heading {
+    color: var(--text-color);
+    margin-bottom: 20px;
+    padding-bottom: 10px;
+    border-bottom: 2px solid var(--primary-color);
+}
+button.primary {
+    background-color: var(--primary-color) !important;
+}
+button.secondary {
+    background-color: var(--secondary-color) !important;
+}
+"""
+# Create the Gradio interface
+with gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft()) as demo:
+    # Store embedder in state
+    state = gr.State(embedder)
+    with gr.Row():
+        # Sidebar
+        with gr.Column(scale=1, elem_classes="sidebar"):
+            gr.Markdown("""
+            # Qwen Embeddings
+            ### Navigation
+            - [Configuration](#configuration)
+            - [Features](#features)
+            - [Documentation](#documentation)
+            """)
+            with gr.Accordion("Configuration", open=True):
+                gr.Markdown("""
+                ### Model Settings
+                Configure the embedding model parameters below.
+                """)
+                embedding_dim = gr.Slider(
+                    minimum=32,
+                    maximum=1024,
+                    value=768,
+                    step=32,
+                    label="Embedding Dimension",
+                    elem_classes="settings"
+                )
+                update_dim_btn = gr.Button("Update Dimension", variant="secondary")
+                dim_status = gr.Textbox(label="Status", interactive=False)
+            with gr.Accordion("Documentation", open=False):
+                gr.Markdown("""
+                ### Usage Guide
+                1. **Embedding Dimension**
+                   - 32-128: Fast, simple tasks
+                   - 256-512: Balanced performance
+                   - 768: Default, full model
+                   - 1024: Maximum detail
+                2. **Best Practices**
+                   - Use appropriate dimensions for your task
+                   - Consider batch size for multiple documents
+                   - Test different settings for optimal results
+                """)
+        # Main Content
+        with gr.Column(scale=4):
+            gr.Markdown("""
+            # Advanced Text Processing Suite
+            Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
+            This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages.
+            """)
+            # Feature Grid
+            gr.HTML("""
+            <div class="features-grid">
+                <div class="feature-card">
+                    <div class="feature-icon">🔄</div>
+                    <h3>Text Similarity</h3>
+                    <p>Compare semantic meaning between texts</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🔍</div>
+                    <h3>Semantic Search</h3>
+                    <p>Find relevant documents by meaning</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">📊</div>
+                    <h3>Batch Analysis</h3>
+                    <p>Process multiple texts simultaneously</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🎯</div>
+                    <h3>Multi-Query Retrieval</h3>
+                    <p>Match queries with relevant documents</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🌐</div>
+                    <h3>Cross-Lingual</h3>
+                    <p>Match meaning across languages</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🏷️</div>
+                    <h3>Text Classification</h3>
+                    <p>Categorize text into predefined classes</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🔮</div>
+                    <h3>Document Clustering</h3>
+                    <p>Group similar documents together</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">😊</div>
+                    <h3>Sentiment Analysis</h3>
+                    <p>Analyze emotional content in text</p>
+                </div>
+                <div class="feature-card">
+                    <div class="feature-icon">🎨</div>
+                    <h3>Concept Extraction</h3>
+                    <p>Identify key themes and topics</p>
+                </div>
+            </div>
+            <div class="features-summary">
+                <h2>Advanced Features</h2>
+                <div class="feature-list">
+                    <div class="feature-group">
+                        <h3>Text Analysis</h3>
+                        <ul>
+                            <li>Semantic similarity scoring</li>
+                            <li>Cross-language understanding</li>
+                            <li>Batch text processing</li>
+                            <li>Emotion detection</li>
+                        </ul>
+                    </div>
+                    <div class="feature-group">
+                        <h3>Document Processing</h3>
+                        <ul>
+                            <li>Smart document search</li>
+                            <li>Automated clustering</li>
+                            <li>Theme extraction</li>
+                            <li>Content categorization</li>
+                        </ul>
+                    </div>
+                    <div class="feature-group">
+                        <h3>Model Configuration</h3>
+                        <ul>
+                            <li>Adjustable embedding dimensions</li>
+                            <li>GPU acceleration support</li>
+                            <li>Batch size optimization</li>
+                            <li>Multi-language support</li>
+                        </ul>
+                    </div>
+                </div>
+            </div>
+            """)
+            with gr.Tabs() as tabs:
+                # Text Similarity Tab
+                with gr.Tab("Text Similarity Analysis"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Text Similarity Analysis
+                        Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).
+                        <div class="example">
+                        <strong>Try these Arabic examples:</strong><br>
+                        • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br>
+                        • "السماء صافية اليوم" and "الطقس حار جداً"
+                        </div>
+                        """)
+                        with gr.Row():
+                            text1 = gr.Textbox(
+                                label="First Text",
+                                lines=3,
+                                placeholder="Enter first text here...",
+                                value="أحب القراءة كثيراً"
+                            )
+                            text2 = gr.Textbox(
+                                label="Second Text",
+                                lines=3,
+                                placeholder="Enter second text here...",
+                                value="القراءة من أحب هواياتي"
+                            )
+                        similarity_btn = gr.Button("Calculate Similarity", variant="primary")
+                        similarity_score = gr.Number(label="Similarity Score")
+                    similarity_btn.click(
+                        fn=lambda t1, t2, s: compute_similarity(s.value, t1, t2),
+                        inputs=[text1, text2, state],
+                        outputs=similarity_score
+                    )
+                # Document Reranking Tab
+                with gr.Tab("Semantic Search & Reranking"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Semantic Search & Document Reranking
+                        Search through a collection of documents and rank them by semantic relevance to your query.
+                        <div class="example">
+                        <strong>Try these Arabic queries:</strong><br>
+                        • "ما هي عواصم الدول العربية؟"<br>
+                        • "أين تقع أكبر المدن العربية؟"<br>
+                        • "ما هي المراكز الثقافية العربية؟"
+                        </div>
+                        """)
+                        query_text = gr.Textbox(
+                            label="Search Query",
+                            placeholder="Enter your search query...",
+                            value="ما هي عواصم الدول العربية؟"
+                        )
+                        documents_text = gr.Textbox(
+                            label="Documents Collection (one per line)",
+                            lines=10,
+                            placeholder="Enter documents here, one per line...",
+                            value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
+الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي.
+دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا.
+بغداد عاصمة العراق وتقع على نهر دجلة.
+الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية.
+تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي."""
+                        )
+                        rerank_btn = gr.Button("Search & Rank", variant="primary")
+                        rerank_results = gr.Dataframe(
+                            headers=["Document", "Relevance Score"],
+                            label="Search Results"
+                        )
+                    rerank_btn.click(
+                        fn=lambda q, d, s: rerank_documents(s.value, q, d),
+                        inputs=[query_text, documents_text, state],
+                        outputs=rerank_results
+                    )
+                # Batch Analysis Tab
+                with gr.Tab("Batch Similarity Analysis"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Batch Similarity Analysis
+                        Analyze semantic relationships between multiple texts simultaneously.
+                        <div class="example">
+                        <strong>The example shows Arabic proverbs about friendship:</strong><br>
+                        See how the model captures the semantic relationships between similar themes.
+                        </div>
+                        """)
+                        batch_texts = gr.Textbox(
+                            label="Input Texts (one per line)",
+                            lines=10,
+                            placeholder="Enter texts here, one per line...",
+                            value="""الصديق وقت الضيق.
+الصديق الحقيقي يظهر عند الشدائد.
+عند المحن تعرف إخوانك.
+وقت الشدة بتعرف صحابك.
+الصاحب ساحب."""
+                        )
+                        process_btn = gr.Button("Analyze Relationships", variant="primary")
+                        similarity_matrix = gr.Dataframe(
+                            label="Similarity Matrix",
+                            wrap=True
+                        )
+                    process_btn.click(
+                        fn=lambda t, s: process_batch_embeddings(s.value, t),
+                        inputs=[batch_texts, state],
+                        outputs=[similarity_matrix]
+                    )
+                # Add new Retrieval Tab
+                with gr.Tab("Multi-Query Retrieval"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Multi-Query Document Retrieval
+                        Match multiple queries against multiple documents simultaneously using semantic search.
+                        <div class="description">
+                        This tab implements the exact retrieval logic from the Qwen example, allowing you to:
+                        - Define a custom task prompt
+                        - Input multiple queries
+                        - Input multiple documents
+                        - See all query-document match scores in a matrix
+                        </div>
+                        <div class="example">
+                        <strong>Try these examples:</strong><br>
+                        <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br>
+                        <strong>Queries:</strong>
+                        • "ما هي أكبر المدن العربية؟"
+                        • "أين تقع أهم المراكز الثقافية؟"<br>
+                        <strong>Documents:</strong> Use the example documents or add your own
+                        </div>
+                        """)
+                        task_prompt = gr.Textbox(
+                            label="Task Prompt",
+                            placeholder="Enter the task description here...",
+                            value="Given a web search query, retrieve relevant passages that answer the query",
+                            lines=2
+                        )
+                        with gr.Row():
+                            queries_text = gr.Textbox(
+                                label="Queries (one per line)",
+                                placeholder="Enter your queries here, one per line...",
+                                value="""ما هي أكبر المدن العربية؟
+أين تقع أهم المراكز الثقافية؟""",
+                                lines=5
+                            )
+                            documents_text = gr.Textbox(
+                                label="Documents (one per line)",
+                                placeholder="Enter your documents here, one per line...",
+                                value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية.
+الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم.
+دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي.
+بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""",
+                                lines=5
+                            )
+                        retrieve_btn = gr.Button("Process Retrieval", variant="primary")
+                        retrieval_matrix = gr.Dataframe(
+                            label="Query-Document Relevance Matrix",
+                            wrap=True
+                        )
+                        gr.Markdown("""
+                        <div class="description">
+                        <strong>How to read the results:</strong>
+                        - Each row represents a query
+                        - Each column represents a document
+                        - Values show the relevance score (0-1) between each query-document pair
+                        - Higher scores indicate better matches
+                        </div>
+                        """)
+                    retrieve_btn.click(
+                        fn=lambda p, q, d, s: process_retrieval(s.value, p, q, d),
+                        inputs=[task_prompt, queries_text, documents_text, state],
+                        outputs=[retrieval_matrix]
+                    )
+                # Add Cross-Lingual Tab after the Multi-Query Retrieval tab
+                with gr.Tab("Cross-Lingual Matching"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Cross-Lingual Semantic Matching
+                        Compare the meaning of texts across Arabic and English languages.
+                        <div class="description">
+                        This feature demonstrates the model's ability to understand semantic similarity across different languages.
+                        Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
+                        </div>
+                        <div class="example">
+                        <strong>Try these examples:</strong><br>
+                        <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br>
+                        <strong>English:</strong> "Reading nourishes the mind and soul"<br>
+                        Or try your own pairs of semantically similar texts in both languages.
+                        </div>
+                        """)
+                        # Two side-by-side inputs, each pre-filled with one half of a matching
+                        # Arabic/English example pair.
+                        with gr.Row():
+                            arabic_text = gr.Textbox(
+                                label="Arabic Text",
+                                placeholder="Enter Arabic text here...",
+                                value="القراءة غذاء العقل والروح",
+                                lines=3
+                            )
+                            english_text = gr.Textbox(
+                                label="English Text",
+                                placeholder="Enter English text here...",
+                                value="Reading nourishes the mind and soul",
+                                lines=3
+                            )
+                        match_btn = gr.Button("Compare Texts", variant="primary")
+                        # Numeric output field; created with no initial value.
+                        with gr.Row():
+                            cross_lingual_score = gr.Number(
+                                label="Cross-Lingual Similarity Score",
+                                value=None
+                            )
+                        gr.Markdown("""
+                        <div class="description">
+                        <strong>Understanding the score:</strong>
+                        - Score ranges from 0 (completely different meaning) to 1 (same meaning)
+                        - Scores above 0.7 usually indicate strong semantic similarity
+                        - The model considers the meaning, not just word-for-word translation
+                        </div>
+                        """)
+                    # Button wiring: the handler forwards both texts to process_cross_lingual
+                    # (defined outside this section) and shows only the "similarity" item of
+                    # its result in the score field.
+                    # NOTE(review): Gradio normally hands a handler the stored value of a
+                    # gr.State input, so `s.value` assumes that stored object itself exposes
+                    # `.value` — confirm against how `state` is initialised.
+                    match_btn.click(
+                        fn=lambda a, e, s: process_cross_lingual(s.value, a, e)["similarity"],
+                        inputs=[arabic_text, english_text, state],
+                        outputs=[cross_lingual_score]
+                    )
+                # Add Text Classification Tab
+                # Layout: help text, a text box for the input, a text box holding one
+                # category per line, a trigger button and a two-column results table.
+                with gr.Tab("Text Classification"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Text Classification
+                        Classify text into predefined categories using semantic similarity.
+                        <div class="description">
+                        The model will compare your text against each category and rank them by relevance.
+                        You can define your own categories or use the provided examples.
+                        </div>
+                        """)
+                        input_text = gr.Textbox(
+                            label="Input Text",
+                            placeholder="Enter the text to classify...",
+                            value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل",
+                            lines=3
+                        )
+                        # Default value is five Arabic category names separated by newlines.
+                        categories_text = gr.Textbox(
+                            label="Categories (one per line)",
+                            placeholder="Enter categories here...",
+                            value="""التكنولوجيا والابتكار
+الاقتصاد والأعمال
+التعليم والتدريب
+الثقافة والفنون
+الصحة والطب""",
+                            lines=5
+                        )
+                        classify_btn = gr.Button("Classify Text", variant="primary")
+                        classification_results = gr.Dataframe(
+                            headers=["Category", "Relevance Score"],
+                            label="Classification Results"
+                        )
+                    # Button wiring: the input text and the raw categories string are passed
+                    # to classify_text (defined outside this section); whatever it returns is
+                    # rendered in the results table. Presumably classify_text splits the
+                    # categories on newlines — verify there.
+                    # NOTE(review): assumes the object received for `state` exposes `.value` — confirm.
+                    classify_btn.click(
+                        fn=lambda t, c, s: classify_text(s.value, t, c),
+                        inputs=[input_text, categories_text, state],
+                        outputs=classification_results
+                    )
+                # Add Document Clustering Tab
+                # Layout: help text, a multi-line box with one document per line, a slider
+                # for the requested number of clusters, a trigger button and a results table.
+                with gr.Tab("Document Clustering"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Document Clustering
+                        Group similar documents together using semantic clustering.
+                        <div class="description">
+                        This feature will:
+                        - Group similar documents into clusters
+                        - Identify the most representative document for each cluster
+                        - Help discover themes and patterns in your document collection
+                        </div>
+                        """)
+                        # Default value is nine Arabic sentences, one per line.
+                        cluster_docs = gr.Textbox(
+                            label="Documents (one per line)",
+                            placeholder="Enter documents to cluster...",
+                            value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب.
+الروبوتات تساعد الأطباء في إجراء العمليات الجراحية.
+التعلم الآلي يحسن من دقة التشخيص الطبي.
+الفن يعبر عن مشاعر الإنسان وأحاسيسه.
+الموسيقى لغة عالمية تتخطى حدود الثقافات.
+الرسم والنحت من أقدم أشكال التعبير الفني.
+التجارة الإلكترونية تغير نمط التسوق التقليدي.
+التسوق عبر الإنترنت يوفر الوقت والجهد.
+المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""",
+                            lines=10
+                        )
+                        # Cluster count selector: 2 to 10 in steps of 1, starting at 3.
+                        num_clusters = gr.Slider(
+                            minimum=2,
+                            maximum=10,
+                            value=3,
+                            step=1,
+                            label="Number of Clusters"
+                        )
+                        cluster_btn = gr.Button("Cluster Documents", variant="primary")
+                        clustering_results = gr.Dataframe(
+                            label="Clustering Results"
+                        )
+                    # Button wiring: the raw documents string and the slider value are passed
+                    # to cluster_documents (defined outside this section); its return value is
+                    # rendered in the results table.
+                    # NOTE(review): the slider allows 10 clusters while the default sample has
+                    # 9 lines — check how cluster_documents handles more clusters than documents.
+                    # NOTE(review): assumes the object received for `state` exposes `.value` — confirm.
+                    cluster_btn.click(
+                        fn=lambda d, n, s: cluster_documents(s.value, d, n),
+                        inputs=[cluster_docs, num_clusters, state],
+                        outputs=clustering_results
+                    )
+                # Add Sentiment Analysis Tab
+                # Layout: help text, one text box, a trigger button, and a row holding a
+                # label output next to a JSON output.
+                with gr.Tab("Sentiment Analysis"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Arabic Sentiment Analysis
+                        Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.
+                        <div class="description">
+                        The model will compare your text against predefined sentiment anchors and determine:
+                        - The overall sentiment
+                        - Confidence scores for each sentiment level
+                        </div>
+                        """)
+                        sentiment_text = gr.Textbox(
+                            label="Text to Analyze",
+                            placeholder="Enter text to analyze sentiment...",
+                            value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين",
+                            lines=3
+                        )
+                        analyze_btn = gr.Button("Analyze Sentiment", variant="primary")
+                        with gr.Row():
+                            sentiment_label = gr.Label(label="Overall Sentiment")
+                            sentiment_scores = gr.Json(label="Detailed Scores")
+                        # Button wiring: the text is passed to analyze_sentiment (defined outside
+                        # this section). Two outputs are listed, so the function is expected to
+                        # return two values (label first, then scores) — verify there.
+                        # NOTE(review): assumes the object received for `state` exposes `.value` — confirm.
+                        analyze_btn.click(
+                            fn=lambda t, s: analyze_sentiment(s.value, t),
+                            inputs=[sentiment_text, state],
+                            outputs=[sentiment_label, sentiment_scores]
+                        )
+                # Add Concept Extraction Tab
+                # Layout: help text, one text box, a radio selector for the concept type,
+                # a trigger button and a two-column results table.
+                with gr.Tab("Concept Extraction"):
+                    with gr.Column(elem_classes="tab-content"):
+                        gr.Markdown("""
+                        ### Concept Extraction
+                        Extract key concepts and themes from Arabic text.
+                        <div class="description">
+                        Analyze text to identify:
+                        - Emotional content
+                        - Main topics
+                        - Underlying themes
+                        </div>
+                        """)
+                        concept_text = gr.Textbox(
+                            label="Text to Analyze",
+                            placeholder="Enter text to analyze...",
+                            value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة",
+                            lines=3
+                        )
+                        # The selected string is forwarded unchanged to extract_concepts;
+                        # "themes" is pre-selected.
+                        concept_type = gr.Radio(
+                            choices=["emotions", "topics", "themes"],
+                            value="themes",
+                            label="Concept Type"
+                        )
+                        extract_btn = gr.Button("Extract Concepts", variant="primary")
+                        concept_results = gr.Dataframe(
+                            headers=["Concept", "Relevance Score"],
+                            label="Extracted Concepts"
+                        )
+                        # Button wiring: the text and the chosen concept type are passed to
+                        # extract_concepts (defined outside this section); its return value is
+                        # rendered in the results table.
+                        # NOTE(review): assumes the object received for `state` exposes `.value` — confirm.
+                        extract_btn.click(
+                            fn=lambda t, c, s: extract_concepts(s.value, t, c),
+                            inputs=[concept_text, concept_type, state],
+                            outputs=concept_results
+                        )
+    # Fix dimension update functionality
+    def update_embedder_dim(dim, state):
+        try:
+            new_embedder = QwenEmbedder(embedding_dim=dim)
+            state.value = new_embedder
+            return state, f"Successfully updated embedding dimension to {dim}"
+        except Exception as e:
+            return state, f"Error updating dimension: {str(e)}"
+    update_dim_btn.click(
+        fn=update_embedder_dim,
+        inputs=[embedding_dim, state],
+        outputs=[state, dim_status]
+    )
+# Wrap the demo creation in the spaces decorator
+# NOTE(review): the decorator is applied with duration=120 to a function that only
+# returns an object — presumably it is here so the Space registers a @spaces.GPU
+# function at startup; confirm before moving or removing it.
+@spaces.GPU(duration=120)
+def create_demo():
+    # Placeholder body: nothing is built here. `demo` is not assigned inside this
+    # function, so the name is resolved from module scope when the function is called.
+    # ... rest of your existing demo code ...
+    return demo
+# Script entry point: obtain the app through create_demo() and launch it.
+if __name__ == "__main__":
+    demo = create_demo()  # rebinds the module-level name `demo` to the returned object
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio>=4.0.0
+numpy>=1.21.0
+requests>=2.26.0
+scipy>=1.7.0
+sentence-transformers>=2.2.0
+torch>=2.0.0
+scikit-learn>=1.0.0
+transformers>=4.51.0
+plotly>=5.18.0
+pandas>=2.0.0