{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# =============================================================================\n", "# Imports & Setup\n", "# =============================================================================\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import faiss # For fast vector similarity search\n", "from sentence_transformers import SentenceTransformer # For generating text embeddings\n", "from rank_bm25 import BM25Okapi # For BM25 keyword-based retrieval\n", "import spacy # For tokenization\n", "from sklearn.metrics.pairwise import cosine_similarity # For computing cosine similarity\n", "from sklearn.preprocessing import normalize # For normalizing BM25 scores\n", "\n", "# For the Gradio UI\n", "import gradio as gr\n", "\n", "# For response generation using a small language model (we use FLAN-T5-Small)\n", "from transformers import pipeline, set_seed\n", "\n", "# Set a random seed for reproducibility\n", "set_seed(42)\n", "\n", "# Load SpaCy English model (make sure to download it with: python -m spacy download en_core_web_sm)\n", "nlp = spacy.load(\"en_core_web_sm\")\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 9800 entries, 0 to 9799\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Date 9800 non-null object \n", " 1 Open 9800 non-null float64\n", " 2 High 9800 non-null float64\n", " 3 Low 9800 non-null float64\n", " 4 Close 9800 non-null float64\n", " 5 Adj Close 9800 non-null float64\n", " 6 Volume 9800 non-null int64 \n", "dtypes: float64(5), int64(1), object(1)\n", "memory usage: 536.1+ KB\n", "None\n", " Year Open_Min Open_Max Close_Min Close_Max Avg_Volume \\\n", "0 1986 0.088542 0.177083 0.090278 0.177083 3.620005e+07 \n", "1 1987 0.165799 0.548611 0.165799 0.548611 9.454613e+07 \n", "2 1988 0.319444 0.484375 0.319444 0.483507 6.906268e+07 \n", "3 1989 0.322049 0.618056 0.322917 0.614583 7.735760e+07 \n", "4 1990 0.591146 1.102431 0.598090 1.100694 7.408945e+07 \n", "\n", " Summary \n", "0 In 1986.0, the stock opened between $0.09 and ... \n", "1 In 1987.0, the stock opened between $0.17 and ... \n", "2 In 1988.0, the stock opened between $0.32 and ... \n", "3 In 1989.0, the stock opened between $0.32 and ... \n", "4 In 1990.0, the stock opened between $0.59 and ... \n" ] } ], "source": [ "# =============================================================================\n", "# 1. 
    "# =============================================================================\n",
    "# 1. Data Collection & Preprocessing\n",
    "# =============================================================================\n",
    "# Load the CSV file containing financial data.\n",
    "# (Make sure the CSV file \"MSFT_1986-03-13_2025-02-04.csv\" is in the \"data\" folder)\n",
    "csv_file_path = os.path.join(\"data\", \"MSFT_1986-03-13_2025-02-04.csv\")  # Adjust the path if necessary\n",
    "# Load the CSV file into a DataFrame\n",
    "df = pd.read_csv(csv_file_path)\n",
    "\n",
    "# Display basic info about the dataset (info() prints its report itself and returns None)\n",
    "df.info()\n",
    "\n",
    "# Data Cleaning & Structuring\n",
    "\n",
    "# Convert 'Date' column to datetime format\n",
    "df['Date'] = pd.to_datetime(df['Date'])\n",
    "\n",
    "# Sort data by Date\n",
    "df = df.sort_values(by='Date')\n",
    "\n",
    "# Extract Year from Date\n",
    "df['Year'] = df['Date'].dt.year\n",
    "\n",
    "# Aggregate data by Year to generate financial summaries\n",
    "yearly_summary = df.groupby('Year').agg(\n",
    "    Open_Min=('Open', 'min'),\n",
    "    Open_Max=('Open', 'max'),\n",
    "    Close_Min=('Close', 'min'),\n",
    "    Close_Max=('Close', 'max'),\n",
    "    Avg_Volume=('Volume', 'mean')\n",
    ").reset_index()\n",
    "\n",
    "# Create a textual summary for each year. apply(axis=1) upcasts the mixed-dtype\n",
    "# row to float, so cast Year back to int to avoid \"In 1986.0\"-style artifacts.\n",
    "yearly_summary['Summary'] = yearly_summary.apply(\n",
    "    lambda row: f\"In {int(row['Year'])}, the stock opened between ${row['Open_Min']:.2f} and ${row['Open_Max']:.2f}, \"\n",
    "                f\"while closing between ${row['Close_Min']:.2f} and ${row['Close_Max']:.2f}. \"\n",
    "                f\"The average trading volume was {row['Avg_Volume']:,.0f} shares.\",\n",
    "    axis=1\n",
    ")\n",
    "\n",
    "# Display the cleaned and structured data\n",
    "print(yearly_summary.head())  # A bare yearly_summary.head() also renders as a table in Jupyter\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": { "text/plain": [ "40" ] },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 2. Basic RAG Implementation\n",
    "# =============================================================================\n",
    "# Convert financial summaries into text chunks and generate vector embeddings.\n",
    "embedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
    "\n",
    "# Convert yearly financial summaries into vector embeddings\n",
    "summary_texts = yearly_summary[\"Summary\"].tolist()  # Extract summaries as text\n",
    "summary_embeddings = embedding_model.encode(summary_texts, convert_to_numpy=True)  # Generate embeddings\n",
    "\n",
    "# summary_embeddings is a NumPy array of shape (num_years, embedding_dim).\n",
    "# Derive the embedding dimension from the data (384 for all-MiniLM-L6-v2).\n",
    "embedding_dim = summary_embeddings.shape[1]\n",
    "\n",
    "# Create a FAISS index (Flat index for now, can be optimized later)\n",
    "faiss_index = faiss.IndexFlatL2(embedding_dim)\n",
    "\n",
    "# Convert embeddings to float32 (FAISS requires this format)\n",
    "summary_embeddings = summary_embeddings.astype('float32')\n",
    "\n",
    "# Add embeddings to the FAISS index\n",
    "faiss_index.add(summary_embeddings)\n",
    "\n",
    "# Store the year information for retrieval\n",
    "year_map = {i: yearly_summary[\"Year\"].iloc[i] for i in range(len(yearly_summary))}\n",
    "\n",
    "# Verify that embeddings are stored successfully\n",
    "faiss_index.ntotal\n"
   ]
  },
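  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -----------------------------------------------------------------------------\n",
    "# Optional sanity check (illustrative sketch, not required by the pipeline):\n",
    "# run one arbitrary sample query through the FAISS index to confirm that the\n",
    "# nearest neighbours look sensible before layering BM25 on top in Section 3.\n",
    "# -----------------------------------------------------------------------------\n",
    "sample_query = \"Which year had the highest closing price?\"  # arbitrary example\n",
    "sample_embedding = embedding_model.encode([sample_query], convert_to_numpy=True).astype('float32')\n",
    "distances, indices = faiss_index.search(sample_embedding, 3)\n",
    "for dist, idx in zip(distances[0], indices[0]):\n",
    "    print(f\"Year {year_map[idx]} (L2 distance {dist:.3f}): {yearly_summary.iloc[idx]['Summary']}\")\n"
   ]
  },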
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YearMerged Summary
01986In 1986.0, the stock opened between $0.09 and ...
11990In 1989.0, the stock opened between $0.32 and ...
21992In 1991.0, the stock opened between $1.03 and ...
31996In 1994.0, the stock opened between $2.45 and ...
41999In 1997.0, the stock opened between $10.25 and...
\n", "
" ], "text/plain": [ " Year Merged Summary\n", "0 1986 In 1986.0, the stock opened between $0.09 and ...\n", "1 1990 In 1989.0, the stock opened between $0.32 and ...\n", "2 1992 In 1991.0, the stock opened between $1.03 and ...\n", "3 1996 In 1994.0, the stock opened between $2.45 and ...\n", "4 1999 In 1997.0, the stock opened between $10.25 and..." ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# =============================================================================\n", "# 3. Advanced RAG Implementation\n", "# =============================================================================\n", "# 3.1: BM25 for Keyword-Based Search\n", "# Tokenize each summary using SpaCy (tokens are converted to lowercase).\n", "tokenized_summaries = [[token.text.lower() for token in nlp(summary)] for summary in summary_texts]\n", "# Build the BM25 index.\n", "bm25 = BM25Okapi(tokenized_summaries)\n", "\n", "# 3.2: Define Retrieval Functions\n", "\n", "def retrieve_similar_summaries(query_text, top_k=3):\n", " \"\"\"\n", " Retrieve similar financial summaries using FAISS vector search.\n", " \"\"\"\n", " query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n", " distances, indices = faiss_index.search(query_embedding, top_k)\n", " results = []\n", " for idx in indices[0]:\n", " results.append((year_map[idx], yearly_summary.iloc[idx][\"Summary\"]))\n", " return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n", "\n", "def hybrid_retrieve(query_text, top_k=3, alpha=0.5):\n", " \"\"\"\n", " Hybrid retrieval combining FAISS (vector search) and BM25 (keyword search).\n", " Scores are combined using the weighting factor 'alpha'.\n", " \"\"\"\n", " query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n", " _, faiss_indices = faiss_index.search(query_embedding, top_k)\n", " \n", " bm25_scores = bm25.get_scores([token.text.lower() for token in nlp(query_text)])\n", " bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]\n", " \n", " combined_scores = {}\n", " for rank, idx in enumerate(faiss_indices[0]):\n", " combined_scores[idx] = alpha * (top_k - rank)\n", " bm25_norm_scores = normalize([bm25_scores])[0]\n", " for rank, idx in enumerate(bm25_top_indices):\n", " if idx in combined_scores:\n", " combined_scores[idx] += (1 - alpha) * (top_k - rank)\n", " else:\n", " combined_scores[idx] = (1 - alpha) * (top_k - rank)\n", " \n", " sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)\n", " results = [(year_map[idx], yearly_summary.iloc[idx][\"Summary\"]) for idx, _ in sorted_results]\n", " return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n", "\n", "def adaptive_retrieve(query_text, top_k=3, alpha=0.5):\n", " \"\"\"\n", " Adaptive retrieval re-ranks results by combining FAISS and BM25 scores.\n", " \"\"\"\n", " query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n", " _, faiss_indices = faiss_index.search(query_embedding, top_k)\n", " \n", " query_tokens = [token.text.lower() for token in nlp(query_text)]\n", " bm25_scores = bm25.get_scores(query_tokens)\n", " bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]\n", " \n", " faiss_scores = np.linspace(1, 0, num=top_k)\n", " bm25_norm_scores = normalize([bm25_scores])[0]\n", " \n", " combined_scores = {}\n", " for rank, idx in enumerate(faiss_indices[0]):\n", " combined_scores[idx] = alpha * faiss_scores[rank]\n", " for idx in 
    "def adaptive_retrieve(query_text, top_k=3, alpha=0.5):\n",
    "    \"\"\"\n",
    "    Adaptive retrieval re-ranks results by combining FAISS and BM25 scores.\n",
    "    FAISS contributes a linearly decaying score by rank; BM25 contributes its\n",
    "    L2-normalized raw score, weighted by 'alpha' and (1 - alpha) respectively.\n",
    "    \"\"\"\n",
    "    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')\n",
    "    _, faiss_indices = faiss_index.search(query_embedding, top_k)\n",
    "\n",
    "    query_tokens = [token.text.lower() for token in nlp(query_text)]\n",
    "    bm25_scores = bm25.get_scores(query_tokens)\n",
    "    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]\n",
    "\n",
    "    faiss_scores = np.linspace(1, 0, num=top_k)\n",
    "    bm25_norm_scores = normalize([bm25_scores])[0]\n",
    "\n",
    "    combined_scores = {}\n",
    "    for rank, idx in enumerate(faiss_indices[0]):\n",
    "        combined_scores[idx] = alpha * faiss_scores[rank]\n",
    "    for idx in bm25_top_indices:\n",
    "        if idx in combined_scores:\n",
    "            combined_scores[idx] += (1 - alpha) * bm25_norm_scores[idx]\n",
    "        else:\n",
    "            combined_scores[idx] = (1 - alpha) * bm25_norm_scores[idx]\n",
    "\n",
    "    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)\n",
    "    results = [(year_map[idx], yearly_summary.iloc[idx][\"Summary\"]) for idx, _ in sorted_results]\n",
    "    return pd.DataFrame(results, columns=[\"Year\", \"Summary\"])\n",
    "\n",
    "def merge_similar_chunks(threshold=0.95):\n",
    "    \"\"\"\n",
    "    Chunk Merging: Merge similar financial summaries based on cosine similarity.\n",
    "    This reduces redundancy when multiple chunks are very similar.\n",
    "    \"\"\"\n",
    "    merged_summaries = []\n",
    "    used_indices = set()\n",
    "    for i in range(len(summary_embeddings)):\n",
    "        if i in used_indices:\n",
    "            continue\n",
    "        similarities = cosine_similarity([summary_embeddings[i]], summary_embeddings)[0]\n",
    "        # Exclude indices already absorbed by an earlier merge so each summary\n",
    "        # lands in exactly one chunk and the Year label matches the chunk's text.\n",
    "        similar_indices = [int(idx) for idx in np.where(similarities >= threshold)[0]\n",
    "                           if idx not in used_indices]\n",
    "        merged_text = \" \".join(yearly_summary.iloc[idx][\"Summary\"] for idx in similar_indices)\n",
    "        merged_summaries.append((yearly_summary.iloc[i][\"Year\"], merged_text))\n",
    "        used_indices.update(similar_indices)\n",
    "    return pd.DataFrame(merged_summaries, columns=[\"Year\", \"Merged Summary\"])\n",
    "\n",
    "# Optional: Check merged summaries for debugging.\n",
    "merged_summary_df = merge_similar_chunks(threshold=0.95)\n",
    "print(\"Merged summaries shape:\", merged_summary_df.shape)\n",
    "merged_summary_df.head()\n"
   ]
  },
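  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -----------------------------------------------------------------------------\n",
    "# Optional side-by-side comparison (illustrative sketch, not required by the\n",
    "# pipeline): run one arbitrary query through all three retrievers to see how\n",
    "# pure vector search, rank-based hybrid fusion, and score-based adaptive\n",
    "# fusion order the same yearly summaries.\n",
    "# -----------------------------------------------------------------------------\n",
    "demo_query = \"How large was the average trading volume?\"  # arbitrary example\n",
    "print(\"FAISS vector search:\")\n",
    "print(retrieve_similar_summaries(demo_query, top_k=3))\n",
    "print(\"\\nHybrid retrieval (rank-based fusion):\")\n",
    "print(hybrid_retrieve(demo_query, top_k=3, alpha=0.5))\n",
    "print(\"\\nAdaptive retrieval (score-based fusion):\")\n",
    "print(adaptive_retrieve(demo_query, top_k=3, alpha=0.5))\n"
   ]
  },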
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# =============================================================================\n", "# 4. UI Development using Gradio (Updated for newer API)\n", "# =============================================================================\n", "def generate_response(query_text, top_k=3, alpha=0.5):\n", " \"\"\"\n", " Generate an answer for a financial query by:\n", " - Validating the query with an input-side guardrail.\n", " - Retrieving context using adaptive retrieval.\n", " - Generating a refined answer using FLAN-T5-Small.\n", " Returns:\n", " answer (str): The generated answer.\n", " confidence (float): A mock confidence score based on BM25 scores.\n", " \"\"\"\n", " # -----------------------------------------------------------------------------\n", " # Guard Rail Implementation (Input-Side)\n", " # -----------------------------------------------------------------------------\n", " financial_keywords = [\"open\", \"close\", \"stock\", \"price\", \"volume\", \"trading\"]\n", " if not any(keyword in query_text.lower() for keyword in financial_keywords):\n", " return (\"Guardrail Triggered: Your query does not appear to be related to financial data. Please ask a financial question.\"), 0.0\n", "\n", " # Retrieve context using adaptive retrieval.\n", " context_df = adaptive_retrieve(query_text, top_k=top_k, alpha=alpha)\n", " context_text = \" \".join(context_df[\"Summary\"].tolist())\n", " \n", " # Adjust the prompt to provide clear instructions.\n", " prompt = f\"Given the following financial data:\\n{context_text}\\nAnswer this question: {query_text}.\"\n", " \n", " # Use FLAN-T5-Small for text generation via the text2text-generation pipeline.\n", " # Increase max_length to allow longer answers.\n", " generator = pipeline('text2text-generation', model='google/flan-t5-small')\n", " generated = generator(prompt, max_length=200, num_return_sequences=1)\n", " answer = generated[0]['generated_text'].replace(prompt, \"\").strip()\n", " \n", " # Fallback message if answer is empty.\n", " if not answer:\n", " answer = \"I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question.\"\n", " \n", " # Compute a mock confidence score using normalized BM25 scores.\n", " query_tokens = [token.text.lower() for token in nlp(query_text)]\n", " bm25_scores = bm25.get_scores(query_tokens)\n", " max_score = np.max(bm25_scores) if np.max(bm25_scores) > 0 else 1\n", " confidence = round(np.mean(bm25_scores) / max_score, 2)\n", " \n", " return answer, confidence\n", "\n", "# Create the Gradio interface using the new API.\n", "iface = gr.Interface(\n", " fn=generate_response,\n", " inputs=gr.Textbox(lines=2, placeholder=\"Enter your financial question here...\"),\n", " outputs=[gr.Textbox(label=\"Answer\"), gr.Textbox(label=\"Confidence Score\")],\n", " title=\"Financial RAG Model Interface\",\n", " description=(\"Ask questions based on the company's financial summaries \"\n", " )\n", ")\n", "\n", "# Launch the Gradio interface.\n", "iface.launch()\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cpu\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Question: What year had the lowest stock prices?\n", "Answer: I'm sorry, I couldn't generate a clear answer. 
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [ "Device set to use cpu\n" ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:  What year had the lowest stock prices?\n",
      "Answer:  I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question.\n",
      "Confidence Score:  1.0\n",
      "--------------------------------------------------\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [ "Device set to use cpu\n" ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:  How did the trading volume vary?\n",
      "Answer:  The average trading volume was 23,244,919 shares\n",
      "Confidence Score:  1.0\n",
      "--------------------------------------------------\n",
      "Question:  What is the capital of France?\n",
      "Answer:  Guardrail Triggered: Your query does not appear to be related to financial data. Please ask a financial question.\n",
      "Confidence Score:  0.0\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "# =============================================================================\n",
    "# 6. Testing & Validation (Updated)\n",
    "# =============================================================================\n",
    "def print_test_results(query_text, top_k=3, alpha=0.5):\n",
    "    answer, confidence = generate_response(query_text, top_k, alpha)\n",
    "    print(\"Question: \", query_text)\n",
    "    print(\"Answer: \", answer)\n",
    "    print(\"Confidence Score: \", confidence)\n",
    "    print(\"-\" * 50)\n",
    "\n",
    "# Test 1: In-scope financial query (expected high confidence).\n",
    "query_high = \"What year had the lowest stock prices?\"\n",
    "print_test_results(query_high)\n",
    "\n",
    "# Test 2: In-scope financial query (expected lower confidence).\n",
    "query_low = \"How did the trading volume vary?\"\n",
    "print_test_results(query_low)\n",
    "\n",
    "# Test 3: Irrelevant query (should trigger the input-side guardrail).\n",
    "query_irrelevant = \"What is the capital of France?\"\n",
    "print_test_results(query_irrelevant)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}