Spaces:

uditk99
/

Financial_RAG_Chatbot

Sleeping

File size: 21,852 Bytes

ea40f87

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# =============================================================================\n",
    "# Imports & Setup\n",
    "# =============================================================================\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import faiss  # For fast vector similarity search\n",
    "from sentence_transformers import SentenceTransformer  # For generating text embeddings\n",
    "from rank_bm25 import BM25Okapi  # For BM25 keyword-based retrieval\n",
    "import spacy  # For tokenization\n",
    "from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity\n",
    "from sklearn.preprocessing import normalize  # For normalizing BM25 scores\n",
    "\n",
    "# For the Gradio UI\n",
    "import gradio as gr\n",
    "\n",
    "# For response generation using a small language model (we use FLAN-T5-Small)\n",
    "from transformers import pipeline, set_seed\n",
    "\n",
    "# Set a random seed for reproducibility\n",
    "set_seed(42)\n",
    "\n",
    "# Load SpaCy English model (make sure to download it with: python -m spacy download en_core_web_sm)\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 9800 entries, 0 to 9799\n",
      "Data columns (total 7 columns):\n",
      " #   Column     Non-Null Count  Dtype  \n",
      "---  ------     --------------  -----  \n",
      " 0   Date       9800 non-null   object \n",
      " 1   Open       9800 non-null   float64\n",
      " 2   High       9800 non-null   float64\n",
      " 3   Low        9800 non-null   float64\n",
      " 4   Close      9800 non-null   float64\n",
      " 5   Adj Close  9800 non-null   float64\n",
      " 6   Volume     9800 non-null   int64  \n",
      "dtypes: float64(5), int64(1), object(1)\n",
      "memory usage: 536.1+ KB\n",
      "None\n",
      "   Year  Open_Min  Open_Max  Close_Min  Close_Max    Avg_Volume  \\\n",
      "0  1986  0.088542  0.177083   0.090278   0.177083  3.620005e+07   \n",
      "1  1987  0.165799  0.548611   0.165799   0.548611  9.454613e+07   \n",
      "2  1988  0.319444  0.484375   0.319444   0.483507  6.906268e+07   \n",
      "3  1989  0.322049  0.618056   0.322917   0.614583  7.735760e+07   \n",
      "4  1990  0.591146  1.102431   0.598090   1.100694  7.408945e+07   \n",
      "\n",
      "                                             Summary  \n",
      "0  In 1986.0, the stock opened between $0.09 and ...  \n",
      "1  In 1987.0, the stock opened between $0.17 and ...  \n",
      "2  In 1988.0, the stock opened between $0.32 and ...  \n",
      "3  In 1989.0, the stock opened between $0.32 and ...  \n",
      "4  In 1990.0, the stock opened between $0.59 and ...  \n"
     ]
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 1. Data Collection & Preprocessing\n",
    "# =============================================================================\n",
    "# Load the CSV file containing financial data.\n",
    "# (Make sure the CSV file \"MSFT_1986-03-13_2025-02-04.csv\" is in the \"data\" folder)\n",
    "csv_file_path = r\"D:\\ConvAI_Code\\MSFT_1986-03-13_2025-02-04.csv\"  # Adjust the path if necessary\n",
    "# Load the CSV file into a DataFrame\n",
    "df = pd.read_csv(csv_file_path)\n",
    "\n",
    "# Display basic info about the dataset\n",
    "print(df.info())\n",
    "\n",
    "# Data Cleaning & Structuring\n",
    "\n",
    "# Convert 'Date' column to datetime format\n",
    "df['Date'] = pd.to_datetime(df['Date'])\n",
    "\n",
    "# Sort data by Date\n",
    "df = df.sort_values(by='Date')\n",
    "\n",
    "# Extract Year from Date\n",
    "df['Year'] = df['Date'].dt.year\n",
    "\n",
    "# Aggregate data by Year to generate financial summaries\n",
    "yearly_summary = df.groupby('Year').agg(\n",
    "    Open_Min=('Open', 'min'),\n",
    "    Open_Max=('Open', 'max'),\n",
    "    Close_Min=('Close', 'min'),\n",
    "    Close_Max=('Close', 'max'),\n",
    "    Avg_Volume=('Volume', 'mean')\n",
    ").reset_index()\n",
    "\n",
    "# Create a textual summary for each year\n",
    "yearly_summary['Summary'] = yearly_summary.apply(\n",
    "    lambda row: f\"In {row['Year']}, the stock opened between ${row['Open_Min']:.2f} and ${row['Open_Max']:.2f}, \"\n",
    "                f\"while closing between ${row['Close_Min']:.2f} and ${row['Close_Max']:.2f}. \"\n",
    "                f\"The average trading volume was {row['Avg_Volume']:,.0f} shares.\",\n",
    "    axis=1\n",
    ")\n",
    "\n",
    "# Display the cleaned and structured data\n",
    "print(yearly_summary.head())  # Use this for terminal/console\n",
    "# yearly_summary.head()  # Use this in Jupyter Notebook\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 2. Basic RAG Implementation\n",
    "# =============================================================================\n",
    "# Convert financial summaries into text chunks and generate vector embeddings.\n",
    "embedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
    "\n",
    "# Convert yearly financial summaries into vector embeddings\n",
    "summary_texts = yearly_summary[\"Summary\"].tolist()  # Extract summaries as text\n",
    "summary_embeddings = embedding_model.encode(summary_texts, convert_to_numpy=True)  # Generate embeddings\n",
    "\n",
    "# Store embeddings as a NumPy array for further processing\n",
    "summary_embeddings.shape  # This should be (num_years, embedding_size)\n",
    "\n",
    "# Define the dimension of embeddings (384 from MiniLM model)\n",
    "embedding_dim = 384\n",
    "\n",
    "# Create a FAISS index (Flat index for now, can be optimized later)\n",
    "faiss_index = faiss.IndexFlatL2(embedding_dim)\n",
    "\n",
    "# Convert embeddings to float32 (FAISS requires this format)\n",
    "summary_embeddings = summary_embeddings.astype('float32')\n",
    "\n",
    "# Add embeddings to the FAISS index\n",
    "faiss_index.add(summary_embeddings)\n",
    "\n",
    "# Store the year information for retrieval\n",
    "year_map = {i: yearly_summary[\"Year\"].iloc[i] for i in range(len(yearly_summary))}\n",
    "\n",
    "# Verify that embeddings are stored successfully\n",
    "faiss_index.ntotal\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged summaries shape: (12, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Year</th>\n",
       "      <th>Merged Summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1986</td>\n",
       "      <td>In 1986.0, the stock opened between $0.09 and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1990</td>\n",
       "      <td>In 1989.0, the stock opened between $0.32 and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1992</td>\n",
       "      <td>In 1991.0, the stock opened between $1.03 and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1996</td>\n",
       "      <td>In 1994.0, the stock opened between $2.45 and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1999</td>\n",
       "      <td>In 1997.0, the stock opened between $10.25 and...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Year                                     Merged Summary\n",
       "0  1986  In 1986.0, the stock opened between $0.09 and ...\n",
       "1  1990  In 1989.0, the stock opened between $0.32 and ...\n",
       "2  1992  In 1991.0, the stock opened between $1.03 and ...\n",
       "3  1996  In 1994.0, the stock opened between $2.45 and ...\n",
       "4  1999  In 1997.0, the stock opened between $10.25 and..."
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 3. Advanced RAG Implementation\n",
    "# =============================================================================\n",
    "# 3.1: BM25 for Keyword-Based Search\n",
    "# Tokenize each summary using SpaCy (tokens are converted to lowercase).\n",
    "tokenized_summaries = [[token.text.lower() for token in nlp(summary)] for summary in summary_texts]\n",
    "# Build the BM25 index.\n",
    "bm25 = BM25Okapi(tokenized_summaries)\n",
    "\n",
    "# 3.2: Define Retrieval Functions\n",
    "\n",
    "def retrieve_similar_summaries(query_text, top_k=3):\n",
    "    \"\"\"\n",
    "    Retrieve similar financial summaries using FAISS vector search.\n",
    "    \"\"\"\n",
    "    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n",
    "    distances, indices = faiss_index.search(query_embedding, top_k)\n",
    "    results = []\n",
    "    for idx in indices[0]:\n",
    "        results.append((year_map[idx], yearly_summary.iloc[idx][\"Summary\"]))\n",
    "    return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n",
    "\n",
    "def hybrid_retrieve(query_text, top_k=3, alpha=0.5):\n",
    "    \"\"\"\n",
    "    Hybrid retrieval combining FAISS (vector search) and BM25 (keyword search).\n",
    "    Scores are combined using the weighting factor 'alpha'.\n",
    "    \"\"\"\n",
    "    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n",
    "    _, faiss_indices = faiss_index.search(query_embedding, top_k)\n",
    "    \n",
    "    bm25_scores = bm25.get_scores([token.text.lower() for token in nlp(query_text)])\n",
    "    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]\n",
    "    \n",
    "    combined_scores = {}\n",
    "    for rank, idx in enumerate(faiss_indices[0]):\n",
    "        combined_scores[idx] = alpha * (top_k - rank)\n",
    "    bm25_norm_scores = normalize([bm25_scores])[0]\n",
    "    for rank, idx in enumerate(bm25_top_indices):\n",
    "        if idx in combined_scores:\n",
    "            combined_scores[idx] += (1 - alpha) * (top_k - rank)\n",
    "        else:\n",
    "            combined_scores[idx] = (1 - alpha) * (top_k - rank)\n",
    "    \n",
    "    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)\n",
    "    results = [(year_map[idx], yearly_summary.iloc[idx][\"Summary\"]) for idx, _ in sorted_results]\n",
    "    return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n",
    "\n",
    "def adaptive_retrieve(query_text, top_k=3, alpha=0.5):\n",
    "    \"\"\"\n",
    "    Adaptive retrieval re-ranks results by combining FAISS and BM25 scores.\n",
    "    \"\"\"\n",
    "    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n",
    "    _, faiss_indices = faiss_index.search(query_embedding, top_k)\n",
    "    \n",
    "    query_tokens = [token.text.lower() for token in nlp(query_text)]\n",
    "    bm25_scores = bm25.get_scores(query_tokens)\n",
    "    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]\n",
    "    \n",
    "    faiss_scores = np.linspace(1, 0, num=top_k)\n",
    "    bm25_norm_scores = normalize([bm25_scores])[0]\n",
    "    \n",
    "    combined_scores = {}\n",
    "    for rank, idx in enumerate(faiss_indices[0]):\n",
    "        combined_scores[idx] = alpha * faiss_scores[rank]\n",
    "    for idx in bm25_top_indices:\n",
    "        if idx in combined_scores:\n",
    "            combined_scores[idx] += (1 - alpha) * bm25_norm_scores[idx]\n",
    "        else:\n",
    "            combined_scores[idx] = (1 - alpha) * bm25_norm_scores[idx]\n",
    "    \n",
    "    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)\n",
    "    results = [(year_map[idx], yearly_summary.iloc[idx][\"Summary\"]) for idx, _ in sorted_results]\n",
    "    return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n",
    "\n",
    "def merge_similar_chunks(threshold=0.95):\n",
    "    \"\"\"\n",
    "    Chunk Merging: Merge similar financial summaries based on cosine similarity.\n",
    "    This reduces redundancy when multiple chunks are very similar.\n",
    "    \"\"\"\n",
    "    merged_summaries = []\n",
    "    used_indices = set()\n",
    "    for i in range(len(summary_embeddings)):\n",
    "        if i in used_indices:\n",
    "            continue\n",
    "        similarities = cosine_similarity([summary_embeddings[i]], summary_embeddings)[0]\n",
    "        similar_indices = np.where(similarities >= threshold)[0]\n",
    "        merged_text = \" \".join(yearly_summary.iloc[idx][\"Summary\"] for idx in similar_indices)\n",
    "        merged_summaries.append((yearly_summary.iloc[i][\"Year\"], merged_text))\n",
    "        used_indices.update(similar_indices)\n",
    "    return pd.DataFrame(merged_summaries, columns=[\"Year\", \"Merged Summary\"])\n",
    "\n",
    "# Optional: Check merged summaries for debugging.\n",
    "merged_summary_df = merge_similar_chunks(threshold=0.95)\n",
    "print(\"Merged summaries shape:\", merged_summary_df.shape)\n",
    "merged_summary_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL:  http://127.0.0.1:7864\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7864/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 4. UI Development using Gradio (Updated for newer API)\n",
    "# =============================================================================\n",
    "def generate_response(query_text, top_k=3, alpha=0.5):\n",
    "    \"\"\"\n",
    "    Generate an answer for a financial query by:\n",
    "      - Validating the query with an input-side guardrail.\n",
    "      - Retrieving context using adaptive retrieval.\n",
    "      - Generating a refined answer using FLAN-T5-Small.\n",
    "    Returns:\n",
    "      answer (str): The generated answer.\n",
    "      confidence (float): A mock confidence score based on BM25 scores.\n",
    "    \"\"\"\n",
    "    # -----------------------------------------------------------------------------\n",
    "    # Guard Rail Implementation (Input-Side)\n",
    "    # -----------------------------------------------------------------------------\n",
    "    financial_keywords = [\"open\", \"close\", \"stock\", \"price\", \"volume\", \"trading\"]\n",
    "    if not any(keyword in query_text.lower() for keyword in financial_keywords):\n",
    "        return (\"Guardrail Triggered: Your query does not appear to be related to financial data. Please ask a financial question.\"), 0.0\n",
    "\n",
    "    # Retrieve context using adaptive retrieval.\n",
    "    context_df = adaptive_retrieve(query_text, top_k=top_k, alpha=alpha)\n",
    "    context_text = \" \".join(context_df[\"Summary\"].tolist())\n",
    "    \n",
    "    # Adjust the prompt to provide clear instructions.\n",
    "    prompt = f\"Given the following financial data:\\n{context_text}\\nAnswer this question: {query_text}.\"\n",
    "    \n",
    "    # Use FLAN-T5-Small for text generation via the text2text-generation pipeline.\n",
    "    # Increase max_length to allow longer answers.\n",
    "    generator = pipeline('text2text-generation', model='google/flan-t5-small')\n",
    "    generated = generator(prompt, max_length=200, num_return_sequences=1)\n",
    "    answer = generated[0]['generated_text'].replace(prompt, \"\").strip()\n",
    "    \n",
    "    # Fallback message if answer is empty.\n",
    "    if not answer:\n",
    "        answer = \"I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question.\"\n",
    "    \n",
    "    # Compute a mock confidence score using normalized BM25 scores.\n",
    "    query_tokens = [token.text.lower() for token in nlp(query_text)]\n",
    "    bm25_scores = bm25.get_scores(query_tokens)\n",
    "    max_score = np.max(bm25_scores) if np.max(bm25_scores) > 0 else 1\n",
    "    confidence = round(np.mean(bm25_scores) / max_score, 2)\n",
    "    \n",
    "    return answer, confidence\n",
    "\n",
    "# Create the Gradio interface using the new API.\n",
    "iface = gr.Interface(\n",
    "    fn=generate_response,\n",
    "    inputs=gr.Textbox(lines=2, placeholder=\"Enter your financial question here...\"),\n",
    "    outputs=[gr.Textbox(label=\"Answer\"), gr.Textbox(label=\"Confidence Score\")],\n",
    "    title=\"Financial RAG Model Interface\",\n",
    "    description=(\"Ask questions based on the company's financial summaries  \"\n",
    "                 )\n",
    ")\n",
    "\n",
    "# Launch the Gradio interface.\n",
    "iface.launch()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cpu\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:  What year had the lowest stock prices?\n",
      "Answer:  I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question.\n",
      "Confidence Score:  1.0\n",
      "--------------------------------------------------\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cpu\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:  How did the trading volume vary?\n",
      "Answer:  The average trading volume was 23,244,919 shares\n",
      "Confidence Score:  1.0\n",
      "--------------------------------------------------\n",
      "Question:  What is the capital of France?\n",
      "Answer:  Guardrail Triggered: Your query does not appear to be related to financial data. Please ask a financial question.\n",
      "Confidence Score:  0.0\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 6. Testing & Validation (Updated)\n",
    "# =============================================================================\n",
    "def print_test_results(query_text, top_k=3, alpha=0.5):\n",
    "    answer, confidence = generate_response(query_text, top_k, alpha)\n",
    "    print(\"Question: \", query_text)\n",
    "    print(\"Answer: \", answer)\n",
    "    print(\"Confidence Score: \", confidence)\n",
    "    print(\"-\" * 50)\n",
    "\n",
    "# Test 1: High-confidence financial query.\n",
    "query_high = \"What year had the lowest stock prices?\"\n",
    "print_test_results(query_high)\n",
    "\n",
    "# Test 2: Low-confidence financial query.\n",
    "query_low = \"How did the trading volume vary?\"\n",
    "print_test_results(query_low)\n",
    "\n",
    "# Test 3: Irrelevant query (should trigger guardrail).\n",
    "query_irrelevant = \"What is the capital of France?\"\n",
    "print_test_results(query_irrelevant)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}