|
import streamlit as st |
|
import json |
|
import os |
|
import numpy as np |
|
import faiss |
|
from sentence_transformers import SentenceTransformer |
|
from PyPDF2 import PdfReader |
|
from openai import OpenAI |
|
import time |
|
from PIL import Image |
|
|
|
class IntegratedChatSystem:
    """Retrieval-augmented chat system for a Streamlit app.

    Combines three capabilities:
      * image retrieval via sentence-transformer embeddings in a FAISS index,
      * generation of Q/A fine-tuning data from PDF text via the OpenAI API,
      * creation/monitoring of an OpenAI fine-tuning job and chatting with
        the resulting model.
    """

    def __init__(self, api_key: str):
        """Initialize API client, embedding model, and an empty FAISS index.

        Args:
            api_key: OpenAI API key used for chat, file upload, and fine-tuning.
        """
        self.api_key = api_key
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # all-MiniLM-L6-v2 produces 384-dimensional embeddings.
        self.embedding_dim = 384
        # Inner-product index. Embeddings are L2-normalized before add/search
        # (see _embed) so inner product equals cosine similarity and the
        # similarity_threshold below is meaningful.
        self.index = faiss.IndexFlatIP(self.embedding_dim)
        self.metadata = []  # one dict per indexed vector: {"filepath", "context"}
        self.fine_tuned_model = None  # model id set after a successful fine-tune

    def _embed(self, text: str):
        """Encode *text* to a (1, dim) float32, L2-normalized embedding.

        BUG FIX: the original code added raw (unnormalized) embeddings to an
        IndexFlatIP and compared scores against a cosine-style threshold;
        normalizing makes inner-product search equivalent to cosine similarity.
        """
        embedding = np.asarray(self.embedding_model.encode(text), dtype=np.float32)
        embedding = np.expand_dims(embedding, axis=0)
        faiss.normalize_L2(embedding)
        return embedding

    def add_image(self, image, context_text: str) -> bool:
        """Add an image and its context to the retrieval system.

        Saves the image under uploaded_images/ and indexes the embedding of
        its context text. Returns True on success, False on any error
        (the error is surfaced via st.error).
        """
        try:
            embedding = self._embed(context_text)

            # exist_ok avoids the check-then-create race of the original code.
            os.makedirs('uploaded_images', exist_ok=True)

            filename = f"image_{len(self.metadata)}.jpg"
            image_path = os.path.join('uploaded_images', filename)
            image.save(image_path)

            self.index.add(embedding)
            self.metadata.append({
                "filepath": image_path,
                "context": context_text
            })

            return True
        except Exception as e:
            st.error(f"Error adding image: {str(e)}")
            return False

    def search_relevant_images(self, query: str, similarity_threshold: float = 0.7, top_k: int = 3):
        """Search for relevant images based on query.

        Returns up to *top_k* metadata dicts whose context embedding has
        cosine similarity >= *similarity_threshold* with the query; an empty
        list when the index is empty or on error.
        """
        try:
            if self.index.ntotal == 0:
                return []

            query_embedding = self._embed(query)

            distances, indices = self.index.search(query_embedding, min(top_k, self.index.ntotal))

            # FAISS pads missing results with index -1; filter those out.
            relevant_images = [
                self.metadata[i] for i, distance in zip(indices[0], distances[0])
                if i != -1 and distance >= similarity_threshold
            ]

            return relevant_images
        except Exception as e:
            st.error(f"Error searching images: {str(e)}")
            return []

    @staticmethod
    def _parse_qa_pairs(response_text: str):
        """Parse 'Q: ... A: ...' blocks (separated by blank lines) into
        OpenAI chat fine-tuning records.

        BUG FIX: splits at the FIRST 'A:' only, so an answer that itself
        contains 'A:' is no longer truncated (the original
        ``pair.split('A:')[1]`` dropped everything after a second 'A:').
        """
        qa_pairs = []
        for pair in response_text.split('\n\n'):
            if 'Q:' in pair and 'A:' in pair:
                question_part, _, answer_part = pair.partition('A:')
                question = question_part.replace('Q:', '').strip()
                answer = answer_part.strip()

                qa_pairs.append({
                    "messages": [
                        {"role": "system", "content": "You are an assistant chatbot. You should help the user by answering their question."},
                        {"role": "user", "content": question},
                        {"role": "assistant", "content": answer}
                    ]
                })
        return qa_pairs

    def generate_qna_pairs(self, text: str):
        """Generate question-answer pairs from text using OpenAI API.

        Returns a list of fine-tuning records in chat format, or an empty
        list on error.
        """
        try:
            completion = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Generate 11 relevant question-answer pairs from the given text. Format each pair as a complete, informative question with its corresponding detailed answer."},
                    {"role": "user", "content": f"Text: {text}"}
                ],
                temperature=0.7
            )

            response_text = completion.choices[0].message.content
            return self._parse_qa_pairs(response_text)
        except Exception as e:
            st.error(f"Error generating QA pairs: {str(e)}")
            return []

    def create_fine_tuning_job(self, training_file_id):
        """Start a fine-tuning job for an uploaded training file.

        Returns the job id, or None on error.
        """
        try:
            response = self.client.fine_tuning.jobs.create(
                training_file=training_file_id,
                model="gpt-3.5-turbo-0125"
            )
            return response.id
        except Exception as e:
            st.error(f"Error creating fine-tuning job: {str(e)}")
            return None

    def monitor_fine_tuning_job(self, job_id):
        """Poll a fine-tuning job every 10s, rendering progress in Streamlit.

        Blocks until the job reaches a terminal state. On success, stores the
        resulting model id in self.fine_tuned_model and returns True; returns
        False on failure/cancellation or on an API error.
        """
        try:
            progress_bar = st.progress(0)
            status_text = st.empty()
            details_text = st.empty()

            stages = {
                "validating_files": "Validating training files...",
                "queued": "Job queued - waiting to start...",
                "running": "Training in progress...",
                "succeeded": "Training completed successfully!",
                "failed": "Training failed.",
                "cancelled": "Training was cancelled."
            }

            # Coarse progress estimate per status (the API gives no percentage).
            progress_mapping = {
                "validating_files": 0.1,
                "queued": 0.2,
                "running": 0.6,
                "succeeded": 1.0,
                "failed": 1.0,
                "cancelled": 1.0
            }

            last_status = None
            start_time = time.time()

            while True:
                job_status = self.client.fine_tuning.jobs.retrieve(job_id)
                current_status = job_status.status

                progress_bar.progress(progress_mapping.get(current_status, 0))

                status_message = stages.get(current_status, "Processing...")
                status_text.markdown(f"**Status:** {status_message}")

                elapsed_time = int(time.time() - start_time)
                details_text.markdown(f"""
                **Details:**
                - Time elapsed: {elapsed_time // 60}m {elapsed_time % 60}s
                - Job ID: {job_id}
                - Current stage: {current_status}
                """)

                # One-shot notifications fire only on a status transition.
                if current_status != last_status:
                    if current_status == "running":
                        st.info("🚀 Model training has begun!")
                    elif current_status == "succeeded":
                        # NOTE: original source had this string literal broken
                        # across lines (mojibake); repaired here.
                        st.success("✅ Fine-tuning completed successfully!")
                        self.fine_tuned_model = job_status.fine_tuned_model
                        st.balloons()

                        st.markdown(f"""
                        **Training Completed!**
                        - Model ID: `{self.fine_tuned_model}`
                        - Total training time: {elapsed_time // 60}m {elapsed_time % 60}s
                        - Status: Ready to use

                        You can now use the chat interface to interact with your fine-tuned model!
                        """)
                        return True
                    elif current_status in ["failed", "cancelled"]:
                        st.error(f"❌ Training {current_status}. Please check the OpenAI dashboard for details.")
                        return False

                last_status = current_status
                time.sleep(10)

        except Exception as e:
            st.error(f"Error monitoring fine-tuning job: {str(e)}")
            return False

    def chat(self, user_message: str) -> dict:
        """Answer *user_message* and retrieve any relevant stored images.

        BUG FIX: the chat tab calls ``chat_system.chat(...)`` but this method
        did not exist, causing an AttributeError at runtime. Uses the
        fine-tuned model when available, otherwise the stock model.

        Returns a dict: {"response": str, "relevant_images": list of metadata
        dicts with "filepath" and "context" keys}.
        """
        try:
            model = self.fine_tuned_model or "gpt-3.5-turbo"
            completion = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an assistant chatbot. You should help the user by answering their question."},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.7
            )
            response_text = completion.choices[0].message.content
            return {
                "response": response_text,
                "relevant_images": self.search_relevant_images(user_message)
            }
        except Exception as e:
            st.error(f"Error generating chat response: {str(e)}")
            return {"response": "", "relevant_images": []}
|
|
|
|
|
st.title("PDF Fine-tuning and Chat System with Image Retrieval")

# Create the chat system once per session: Streamlit reruns this script on
# every interaction, and session_state keeps the FAISS index, metadata, and
# fine-tuned model id alive across reruns.
if 'chat_system' not in st.session_state:
    # BUG FIX: the API key was hard-coded to "". Read it from the environment
    # instead (falls back to "" so behavior is unchanged when the variable is
    # unset). Never commit real keys to source.
    api_key = os.environ.get("OPENAI_API_KEY", "")
    st.session_state.chat_system = IntegratedChatSystem(api_key)
|
|
|
|
|
# Sidebar: let the user register an image with a text description; the
# description is embedded and indexed so the image can be retrieved later.
with st.sidebar:
    st.header("Image Upload")
    picture = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    description = st.text_area("Image Context Description")

    # The button only acts when both an image and its description are present.
    have_inputs = bool(picture) and bool(description)
    if have_inputs and st.button("Add Image"):
        added = st.session_state.chat_system.add_image(Image.open(picture), description)
        if added:
            st.success("Image added successfully!")
|
|
|
|
|
tab1, tab2 = st.tabs(["Fine-tuning", "Chat"])

with tab1:
    st.header("Upload and Fine-tune")
    uploaded_file = st.file_uploader("Upload a PDF for Fine-Tuning", type=["pdf"])

    if uploaded_file is not None:
        if st.button("Process and Fine-tune"):
            with st.spinner("Processing PDF..."):
                reader = PdfReader(uploaded_file)
                # BUG FIX: extract_text() can return None (e.g. image-only
                # pages), which made "\n".join(...) raise TypeError.
                text = "\n".join([(page.extract_text() or "") for page in reader.pages])

                progress_placeholder = st.empty()

                progress_placeholder.text("Step 1/3: Generating QA pairs...")
                qa_pairs = st.session_state.chat_system.generate_qna_pairs(text)

                if qa_pairs:
                    progress_placeholder.text("Step 2/3: Preparing training file...")
                    jsonl_file = "questions_and_answers.jsonl"
                    # One JSON record per line (OpenAI JSONL format); explicit
                    # UTF-8 keeps non-ASCII PDF text intact on any platform.
                    with open(jsonl_file, 'w', encoding='utf-8') as f:
                        for pair in qa_pairs:
                            json.dump(pair, f)
                            f.write("\n")

                    # Upload the training file to OpenAI for fine-tuning.
                    with open(jsonl_file, "rb") as f:
                        response = st.session_state.chat_system.client.files.create(
                            file=f,
                            purpose="fine-tune"
                        )
                    training_file_id = response.id

                    progress_placeholder.text("Step 3/3: Starting fine-tuning process...")
                    job_id = st.session_state.chat_system.create_fine_tuning_job(training_file_id)

                    if job_id:
                        progress_placeholder.empty()
                        # NOTE: original emoji was mojibake ("π―"); repaired.
                        st.info("🎯 Fine-tuning job initiated!")
                        # Blocks this run until the job reaches a terminal state.
                        st.session_state.chat_system.monitor_fine_tuning_job(job_id)
|
|
|
with tab2:
    st.header("Chat Interface")
    # Tell the user which backend model will answer: the fine-tuned model if
    # a training job completed this session, otherwise the stock model.
    model_id = st.session_state.chat_system.fine_tuned_model
    if model_id:
        st.success(f"Using fine-tuned model: {model_id}")
    else:
        st.info("Using default model (fine-tuned model not available)")

    user_message = st.text_input("Enter your message:")
    if st.button("Send") and user_message:
        result = st.session_state.chat_system.chat(user_message)

        st.write("Response:", result["response"])

        # Render any retrieved images alongside the textual answer, skipping
        # entries whose file has disappeared from disk.
        if result["relevant_images"]:
            st.subheader("Relevant Images:")
            for img_data in result["relevant_images"]:
                path = img_data["filepath"]
                if os.path.exists(path):
                    st.image(Image.open(path), caption=img_data["context"])