import gradio as gr
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM

# Set device to CPU and default dtype to float32
DEVICE = torch.device("cpu")
torch.set_default_dtype(torch.float32)

# Load CLIP model and processor
try:
    clip_model = CLIPModel.from_pretrained(
        "openai/clip-vit-base-patch32",
        torch_dtype=torch.float32
    ).to(DEVICE)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
except Exception as e:
    raise RuntimeError(f"Error loading CLIP model or processor: {e}") from e

# Load language model and tokenizer
def load_model():
    try:
        # model_name = "distilgpt2"  # lighter alternative for faster CPU inference
        model_name = "microsoft/phi-3-mini-4k-instruct"
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True
        ).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        # Set pad token if the tokenizer does not define one
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id
        model.eval()
        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Error loading language model: {e}") from e

# Caption generation logic
def generate_caption(image, model, tokenizer):
    try:
        # Validate the input and normalize to RGB
        if not isinstance(image, Image.Image):
            return "Error: Input must be a valid image."
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Encode the image with CLIP
        image_inputs = clip_processor(images=image, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            image_embedding = clip_model.get_image_features(**image_inputs).to(torch.float32)

        # Tokenize the text prompt
        prompt = "[IMG] Caption this image:"
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(DEVICE)
        attention_mask = inputs["attention_mask"].to(DEVICE)
        projection = torch.nn.Linear(512, model.config.hidden_size).to(DEVICE)
        with torch.no_grad():
            image_embedding_projected = projection(image_embedding)

        # Prepend the projected image embedding as a single prefix "token" ahead of the
        # prompt embeddings, and extend the attention mask accordingly
        text_embedding = model.get_input_embeddings()(input_ids)
        fused_embedding = torch.cat([image_embedding_projected.unsqueeze(1), text_embedding], dim=1)
        attention_mask = torch.cat([
            torch.ones(input_ids.size(0), 1, device=DEVICE),
            attention_mask
        ], dim=1)

        # Generate the caption (small beam size keeps CPU latency manageable)
        with torch.no_grad():
            generated_ids = model.generate(
                inputs_embeds=fused_embedding,
                attention_mask=attention_mask,
                max_new_tokens=50,
                min_new_tokens=10,
                num_beams=3,
                repetition_penalty=1.2,
                do_sample=False
            )
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return caption.strip()
    except Exception as e:
        return f"Error generating caption: {str(e)}"

# Load model/tokenizer
model, tokenizer = load_model()

# Wrapper for Gradio function call
def gradio_caption(image):
    if image is None:
        return "Please upload an image."
    return generate_caption(image, model, tokenizer)
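
# Quick local test without the UI (hypothetical file path; uncomment to try):
# print(generate_caption(Image.open("example.jpg"), model, tokenizer))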

# Reusable UI component blocks
def create_image_input():
    return gr.Image(
        type="pil",
        label="Upload an Image",
        sources=["upload"]
    )

def create_caption_output():
    return gr.Textbox(
        label="Generated Caption",
        lines=2,
        placeholder="Caption will appear here..."
    )

# Build UI
interface = gr.Interface(
    fn=gradio_caption,
    inputs=create_image_input(),
    outputs=create_caption_output(),
    title="Image Captioning with Fine-Tuned MultiModalModel (Epoch 0)",
    description=(
        "Upload an image to generate a caption using a fine-tuned multimodal model based on Phi-3 and CLIP. "
        "The weights from Epoch_0 are used here, but the model may not generate accurate captions due to limited training."
    )
)

interface.launch()