import streamlit as st
import json
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from openai import OpenAI
import time
from PIL import Image


class IntegratedChatSystem:
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.embedding_dim = 384
        self.index = faiss.IndexFlatIP(self.embedding_dim)
        self.metadata = []
        self.fine_tuned_model = None

    def add_image(self, image, context_text: str):
        """Add an image and its context to the retrieval system."""
        try:
            # Embed the context text; normalizing the vector makes the
            # inner-product index behave like cosine similarity, which the
            # 0.7 threshold in search_relevant_images assumes.
            embedding = self.embedding_model.encode(context_text, normalize_embeddings=True)
            embedding = np.expand_dims(embedding, axis=0)

            # Create the image directory if it does not exist yet
            if not os.path.exists('uploaded_images'):
                os.makedirs('uploaded_images')

            # Generate a unique filename
            filename = f"image_{len(self.metadata)}.jpg"
            image_path = os.path.join('uploaded_images', filename)

            # Convert to RGB so PNG uploads with an alpha channel can be saved as JPEG
            image.convert("RGB").save(image_path)

            # Add to the FAISS index and record the metadata
            self.index.add(embedding)
            self.metadata.append({
                "filepath": image_path,
                "context": context_text
            })
            return True
        except Exception as e:
            st.error(f"Error adding image: {str(e)}")
            return False

    def search_relevant_images(self, query: str, similarity_threshold: float = 0.7, top_k: int = 3):
        """Search for relevant images based on a query."""
        try:
            if self.index.ntotal == 0:
                return []

            # Generate a normalized embedding for the query
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)
            query_embedding = np.expand_dims(query_embedding, axis=0)

            # Search the index
            distances, indices = self.index.search(query_embedding, min(top_k, self.index.ntotal))

            # Keep only results above the similarity threshold
            relevant_images = [
                self.metadata[i]
                for i, distance in zip(indices[0], distances[0])
                if i != -1 and distance >= similarity_threshold
            ]
            return relevant_images
        except Exception as e:
            st.error(f"Error searching images: {str(e)}")
            return []
    def generate_qna_pairs(self, text: str):
        """Generate question-answer pairs from text using the OpenAI API."""
        try:
            completion = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Generate 11 relevant question-answer pairs from the given text. Format each pair as a complete, informative question with its corresponding detailed answer. Prefix each question with 'Q:' and each answer with 'A:', and separate pairs with a blank line."},
                    {"role": "user", "content": f"Text: {text}"}
                ],
                temperature=0.7
            )
            response_text = completion.choices[0].message.content

            # Parse the "Q: ... A: ..." pairs into chat-format training examples
            qa_pairs = []
            pairs = response_text.split('\n\n')
            for pair in pairs:
                if 'Q:' in pair and 'A:' in pair:
                    question = pair.split('A:')[0].replace('Q:', '').strip()
                    answer = pair.split('A:')[1].strip()
                    qa_pairs.append({
                        "messages": [
                            {"role": "system", "content": "You are an assistant chatbot. You should help the user by answering their question."},
                            {"role": "user", "content": question},
                            {"role": "assistant", "content": answer}
                        ]
                    })
            return qa_pairs
        except Exception as e:
            st.error(f"Error generating QA pairs: {str(e)}")
            return []

    def create_fine_tuning_job(self, training_file_id):
        try:
            response = self.client.fine_tuning.jobs.create(
                training_file=training_file_id,
                model="gpt-3.5-turbo-0125"
            )
            return response.id
        except Exception as e:
            st.error(f"Error creating fine-tuning job: {str(e)}")
            return None

    def monitor_fine_tuning_job(self, job_id):
        try:
            progress_bar = st.progress(0)
            status_text = st.empty()
            details_text = st.empty()

            stages = {
                "validating_files": "Validating training files...",
                "queued": "Job queued - waiting to start...",
                "running": "Training in progress...",
                "succeeded": "Training completed successfully!",
                "failed": "Training failed.",
                "cancelled": "Training was cancelled."
            }

            # Approximate progress percentages for each stage
            progress_mapping = {
                "validating_files": 0.1,
                "queued": 0.2,
                "running": 0.6,
                "succeeded": 1.0,
                "failed": 1.0,
                "cancelled": 1.0
            }

            last_status = None
            start_time = time.time()

            while True:
                job_status = self.client.fine_tuning.jobs.retrieve(job_id)
                current_status = job_status.status

                # Update progress bar
                progress_bar.progress(progress_mapping.get(current_status, 0))

                # Update status message
                status_message = stages.get(current_status, "Processing...")
                status_text.markdown(f"**Status:** {status_message}")

                # Show elapsed time and other details
                elapsed_time = int(time.time() - start_time)
                details_text.markdown(f"""
                **Details:**
                - Time elapsed: {elapsed_time // 60}m {elapsed_time % 60}s
                - Job ID: {job_id}
                - Current stage: {current_status}
                """)

                # Status changed notification
                if current_status != last_status:
                    if current_status == "running":
                        st.info("🚀 Model training has begun!")
                    elif current_status == "succeeded":
                        st.success("✅ Fine-tuning completed successfully!")
                        self.fine_tuned_model = job_status.fine_tuned_model
                        st.balloons()  # Celebration effect

                        # Display model details
                        st.markdown(f"""
                        **Training Completed!**
                        - Model ID: `{self.fine_tuned_model}`
                        - Total training time: {elapsed_time // 60}m {elapsed_time % 60}s
                        - Status: Ready to use

                        You can now use the chat interface to interact with your fine-tuned model!
                        """)
                        return True
                    elif current_status in ["failed", "cancelled"]:
                        st.error(f"❌ Training {current_status}. Please check the OpenAI dashboard for details.")
                        return False

                last_status = current_status
                time.sleep(10)
        except Exception as e:
            st.error(f"Error monitoring fine-tuning job: {str(e)}")
            return False
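    # The chat tab below calls `chat()`, which is not defined in the original
    # listing. The following is a minimal sketch of that missing method, assuming
    # it should (a) answer with the fine-tuned model when one exists, falling
    # back to gpt-3.5-turbo otherwise, and (b) attach any uploaded images whose
    # context text matches the question. The returned keys "response" and
    # "relevant_images" are the ones the chat tab reads.
    def chat(self, user_message: str):
        """Answer a user message and retrieve contextually relevant images."""
        try:
            model = self.fine_tuned_model or "gpt-3.5-turbo"
            completion = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an assistant chatbot. You should help the user by answering their question."},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.7
            )
            response_text = completion.choices[0].message.content

            # Reuse the FAISS-backed search to find images related to the question
            relevant_images = self.search_relevant_images(user_message)

            return {"response": response_text, "relevant_images": relevant_images}
        except Exception as e:
            st.error(f"Error generating response: {str(e)}")
            return {"response": "Sorry, something went wrong.", "relevant_images": []}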
# Initialize Streamlit interface
st.title("PDF Fine-tuning and Chat System with Image Retrieval")

# Initialize session state
if 'chat_system' not in st.session_state:
    # Read the API key from the environment instead of hard-coding it
    api_key = os.getenv("OPENAI_API_KEY", "")
    st.session_state.chat_system = IntegratedChatSystem(api_key)

# Sidebar for image upload
with st.sidebar:
    st.header("Image Upload")
    uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    image_context = st.text_area("Image Context Description")

    if uploaded_image and image_context and st.button("Add Image"):
        image = Image.open(uploaded_image)
        if st.session_state.chat_system.add_image(image, image_context):
            st.success("Image added successfully!")

# Main area tabs
tab1, tab2 = st.tabs(["Fine-tuning", "Chat"])

with tab1:
    st.header("Upload and Fine-tune")
    uploaded_file = st.file_uploader("Upload a PDF for Fine-Tuning", type=["pdf"])

    if uploaded_file is not None:
        if st.button("Process and Fine-tune"):
            with st.spinner("Processing PDF..."):
                # Extract text from the PDF; extract_text() can return None for
                # image-only pages, so fall back to an empty string.
                reader = PdfReader(uploaded_file)
                text = "\n".join([page.extract_text() or "" for page in reader.pages])

                # Show processing steps
                progress_placeholder = st.empty()

                # Step 1: Generate QA pairs
                progress_placeholder.text("Step 1/3: Generating QA pairs...")
                qa_pairs = st.session_state.chat_system.generate_qna_pairs(text)

                if qa_pairs:
                    # Step 2: Save and upload the training file
                    progress_placeholder.text("Step 2/3: Preparing training file...")
                    jsonl_file = "questions_and_answers.jsonl"
                    with open(jsonl_file, 'w') as f:
                        for pair in qa_pairs:
                            json.dump(pair, f)
                            f.write("\n")

                    with open(jsonl_file, "rb") as f:
                        response = st.session_state.chat_system.client.files.create(
                            file=f,
                            purpose="fine-tune"
                        )
                    training_file_id = response.id

                    # Step 3: Start fine-tuning
                    progress_placeholder.text("Step 3/3: Starting fine-tuning process...")
                    job_id = st.session_state.chat_system.create_fine_tuning_job(training_file_id)

                    if job_id:
                        progress_placeholder.empty()  # Clear the step indicator
                        st.info("🎯 Fine-tuning job initiated!")
                        st.session_state.chat_system.monitor_fine_tuning_job(job_id)

with tab2:
    st.header("Chat Interface")

    if st.session_state.chat_system.fine_tuned_model:
        st.success(f"Using fine-tuned model: {st.session_state.chat_system.fine_tuned_model}")
    else:
        st.info("Using default model (fine-tuned model not available)")

    user_message = st.text_input("Enter your message:")

    if st.button("Send") and user_message:
        result = st.session_state.chat_system.chat(user_message)

        st.write("Response:", result["response"])

        if result["relevant_images"]:
            st.subheader("Relevant Images:")
            for img_data in result["relevant_images"]:
                if os.path.exists(img_data["filepath"]):
                    image = Image.open(img_data["filepath"])
                    st.image(image, caption=img_data["context"])
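# A minimal way to launch the app locally (assuming this script is saved as
# app.py and the OPENAI_API_KEY environment variable is set):
#
#   streamlit run app.py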