Spaces:

VinayHajare
/

OCRHub

Running

App Files Files Community

OCRHub / app.py

VinayHajare

Create app.py

34bb902 verified 9 months ago

raw

history blame contribute delete

9.35 kB

	import streamlit as st
	from ocr_processor import OCRProcessor
	import tempfile
	import os
	from PIL import Image
	import json

	# Page configuration: collected in one mapping and unpacked into the call.
	# Streamlit expects set_page_config to run before other st.* commands,
	# so this stays ahead of the CSS injection further down.
	_PAGE_CONFIG = {
	    "page_title": "OCR Hub",
	    "page_icon": "🔍",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_CONFIG)

	# Custom CSS for better UI.
	# The stylesheet is injected as a raw <style> element through st.markdown;
	# unsafe_allow_html=True is passed so the markup is emitted as HTML.
	# NOTE(review): .upload-text, .gallery and .gallery-item are not referenced
	# by any markup in the visible part of this file - confirm they are needed.
	st.markdown("""
	<style>
	.stApp {
	max-width: 100%;
	padding: 1rem;
	}
	.main {
	background-color: #f8f9fa;
	}
	.stButton button {
	width: 100%;
	border-radius: 5px;
	height: 3em;
	background-color: #4CAF50;
	color: white;
	}
	.stSelectbox {
	margin-bottom: 1rem;
	}
	.upload-text {
	text-align: center;
	padding: 2rem;
	border: 2px dashed #ccc;
	border-radius: 10px;
	background-color: #ffffff;
	}
	.stImage {
	border-radius: 10px;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}
	.gallery {
	display: grid;
	grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
	gap: 1rem;
	padding: 1rem;
	}
	.gallery-item {
	border: 1px solid #ddd;
	border-radius: 8px;
	padding: 0.5rem;
	background: white;
	}
	</style>
	""", unsafe_allow_html=True)

	def get_available_models():
	    """Return the selectable vision model names as a fresh list, in display order."""
	    llava, minicpm, llama_vision = "llava:7b", "MiniCPM-V", "llama3.2-vision:11b"
	    return [llava, minicpm, llama_vision]

	def process_single_image(processor, image_path, format_type, enable_preprocessing):
	    """Run ``processor.process_image`` on one file and hand back its result.

	    Args:
	        processor: Object exposing ``process_image(image_path=, format_type=, preprocess=)``.
	        image_path: Path of the image to process.
	        format_type: Output format name forwarded to the processor.
	        enable_preprocessing: Forwarded as the ``preprocess`` keyword.

	    Returns:
	        Whatever the processor returns. If the processor raises, the
	        exception is not propagated; the string
	        ``"Error processing image: <message>"`` is returned instead.
	    """
	    ocr_kwargs = {
	        "image_path": image_path,
	        "format_type": format_type,
	        "preprocess": enable_preprocessing,
	    }
	    try:
	        return processor.process_image(**ocr_kwargs)
	    except Exception as exc:
	        # Failures are reported in-band so the caller can display them as text.
	        return "Error processing image: " + str(exc)

	def process_batch_images(processor, image_paths, format_type, enable_preprocessing):
	    """Run ``processor.process_batch`` over several files.

	    Args:
	        processor: Object exposing ``process_batch(input_path=, format_type=, preprocess=)``.
	        image_paths: Paths to process (forwarded as ``input_path``).
	        format_type: Output format name forwarded to the processor.
	        enable_preprocessing: Forwarded as the ``preprocess`` keyword.

	    Returns:
	        The processor's result on success. If the processor raises, a dict
	        is returned that still carries the original ``"error"`` message but
	        also the ``"results"``, ``"errors"`` and ``"statistics"`` keys that
	        ``main`` reads, so a failed batch can be rendered like any other
	        result instead of triggering a ``KeyError``.
	    """
	    try:
	        return processor.process_batch(
	            input_path=image_paths,
	            format_type=format_type,
	            preprocess=enable_preprocessing,
	        )
	    except Exception as e:
	        message = str(e)
	        # Accept a single path as well as a collection of paths.
	        if isinstance(image_paths, (str, os.PathLike)):
	            failed_paths = [image_paths]
	        else:
	            failed_paths = list(image_paths or [])
	        return {
	            "error": message,
	            "results": {},
	            # Every requested file is reported as failed with the same cause.
	            "errors": {str(path): message for path in failed_paths},
	            "statistics": {
	                "total": len(failed_paths),
	                "successful": 0,
	                "failed": len(failed_paths),
	            },
	        }

	def _render_sidebar():
	    """Draw the sidebar widgets and return the user's choices.

	    Returns:
	        Tuple ``(selected_model, format_type, max_workers, enable_preprocessing)``.
	    """
	    with st.sidebar:
	        st.header("🎮 Controls")

	        selected_model = st.selectbox(
	            "🤖 Select Vision Model",
	            get_available_models(),
	            index=0,
	        )

	        format_type = st.selectbox(
	            "📄 Output Format",
	            ["markdown", "text", "json", "structured", "key_value"],
	            help="Choose how you want the extracted text to be formatted"
	        )

	        max_workers = st.slider(
	            "🔄 Parallel Processing",
	            min_value=1,
	            max_value=8,
	            value=2,
	            help="Number of images to process in parallel (for batch processing)"
	        )

	        enable_preprocessing = st.checkbox(
	            "🔍 Enable Preprocessing",
	            value=True,
	            help="Apply image enhancement and preprocessing"
	        )

	        st.markdown("---")

	        # Model info box
	        if selected_model == "llava:7b":
	            st.info("LLaVA 7B: Efficient vision-language model optimized for real-time processing")
	        elif selected_model == "MiniCPM-V":
	            st.info("MiniCPM-V 2.6: A GPT-4V Level MLLM for Single Image, Multi Image and Video, outperforms GPT-4o mini, Gemini 1.5 Pro and Claude 3.5 Sonnet")
	        else:
	            st.info("Llama 3.2 Vision: Advanced model with high accuracy for complex text extraction")

	    return selected_model, format_type, max_workers, enable_preprocessing


	def _save_uploads(uploaded_files, temp_dir):
	    """Write every upload into ``temp_dir`` and return the paths in upload order."""
	    saved_paths = []
	    for idx, uploaded_file in enumerate(uploaded_files):
	        # The name comes from the client: keep only its final component so
	        # it cannot point outside temp_dir.
	        safe_name = os.path.basename(uploaded_file.name) or f"upload_{idx}"
	        candidate = safe_name
	        attempt = 0
	        # Two uploads may share a name; prefix a counter rather than
	        # silently overwriting the earlier file.
	        while os.path.exists(os.path.join(temp_dir, candidate)):
	            attempt += 1
	            candidate = f"{attempt}_{safe_name}"
	        temp_path = os.path.join(temp_dir, candidate)
	        with open(temp_path, "wb") as f:
	            f.write(uploaded_file.getvalue())
	        saved_paths.append(temp_path)
	    return saved_paths


	def _show_gallery(uploaded_files):
	    """Preview the uploads in a grid of at most four columns."""
	    st.subheader(f"📸 Input Images ({len(uploaded_files)} files)")
	    cols = st.columns(min(len(uploaded_files), 4))
	    for idx, uploaded_file in enumerate(uploaded_files):
	        with cols[idx % 4]:
	            try:
	                image = Image.open(uploaded_file)
	                st.image(image, use_container_width=True, caption=uploaded_file.name)
	            except OSError:
	                # The uploader also accepts PDFs, which Pillow cannot open
	                # (its "cannot identify image" error is an OSError); show a
	                # placeholder instead of aborting the whole page.
	                st.info(f"📄 {uploaded_file.name} (preview not available)")


	def _render_single_result(result, format_type):
	    """Show the text extracted from one image together with a download button."""
	    st.subheader("📝 Extracted Text")
	    st.markdown(result)

	    # Download button for single result
	    st.download_button(
	        "📥 Download Result",
	        result,
	        file_name=f"ocr_result.{format_type}",
	        mime="text/plain"
	    )


	def _render_batch_results(results):
	    """Show statistics, per-file text, per-file errors and a JSON download."""
	    if "error" in results:
	        st.error(f"⚠️ Batch processing failed: {results['error']}")
	        if "statistics" not in results:
	            # Nothing else to display: the batch call failed as a whole.
	            return

	    # Display statistics
	    st.subheader("📊 Processing Statistics")
	    stats = results['statistics']
	    col1, col2, col3 = st.columns(3)
	    with col1:
	        st.metric("Total Images", stats['total'])
	    with col2:
	        st.metric("Successful", stats['successful'])
	    with col3:
	        st.metric("Failed", stats['failed'])

	    # Display results
	    st.subheader("📝 Extracted Text")
	    for file_path, text in results.get('results', {}).items():
	        with st.expander(f"Result: {os.path.basename(file_path)}"):
	            st.markdown(text)

	    # Display errors if any
	    file_errors = results.get('errors')
	    if file_errors:
	        st.error("⚠️ Some files had errors:")
	        for file_path, error in file_errors.items():
	            st.warning(f"{os.path.basename(file_path)}: {error}")

	    # Download all results as JSON. The download button is rendered
	    # directly: wrapping it in another st.button cannot work, because
	    # clicking that inner button reruns the script with the outer
	    # "Process Images" button reset to False, so this code is never reached.
	    st.download_button(
	        "📥 Download Results JSON",
	        json.dumps(results, indent=2),
	        file_name="ocr_results.json",
	        mime="application/json"
	    )


	def _render_about():
	    """Fill the "About" tab with the static description of the app."""
	    st.header("About OCR Hub")
	    st.markdown("""
	    This application uses state-of-the-art vision language models through Ollama to extract text from images.

	    ### Features:
	    - 🖼️ Support for multiple image formats
	    - 📦 Batch processing capability
	    - 🔄 Parallel processing
	    - 🔍 Image preprocessing and enhancement
	    - 📊 Multiple output formats
	    - 📥 Easy result download

	    ### Models:
	    - LLaVA 7B: Efficient vision-language model for real-time processing
	    - Llama 3.2 Vision: Advanced model with high accuracy for complex documents
	    - MiniCPM-V 2.6: Process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344)
	    """)


	def main():
	    """Build the OCR Hub page: sidebar controls, upload/processing tab, about tab.

	    Uploaded files are written to a temporary directory that exists only for
	    the duration of the current script run; processing happens inside that
	    scope so the paths handed to the processor are still valid.
	    """
	    st.title("🔍 OCR Hub")
	    st.markdown("<p style='text-align: center; color: #666;'>Powered by Ollama Vision Models</p>", unsafe_allow_html=True)

	    # Sidebar controls
	    selected_model, format_type, max_workers, enable_preprocessing = _render_sidebar()

	    # Initialize OCR Processor
	    processor = OCRProcessor(model_name=selected_model, max_workers=max_workers)

	    # Main content area with tabs
	    tab1, tab2 = st.tabs(["📸 Image Processing", "ℹ️ About"])

	    with tab1:
	        # File upload area with multiple file support
	        uploaded_files = st.file_uploader(
	            "Drop your images here",
	            type=['png', 'jpg', 'jpeg', 'tiff', 'bmp', 'pdf'],
	            accept_multiple_files=True,
	            help="Supported formats: PNG, JPG, JPEG, TIFF, BMP, PDF"
	        )

	        if uploaded_files:
	            # Create a temporary directory for uploaded files
	            with tempfile.TemporaryDirectory() as temp_dir:
	                image_paths = _save_uploads(uploaded_files, temp_dir)

	                # Display images in a gallery
	                _show_gallery(uploaded_files)

	                # Process button
	                if st.button("🚀 Process Images"):
	                    with st.spinner("Processing images..."):
	                        if len(image_paths) == 1:
	                            # Single image processing
	                            result = process_single_image(
	                                processor,
	                                image_paths[0],
	                                format_type,
	                                enable_preprocessing
	                            )
	                            _render_single_result(result, format_type)
	                        else:
	                            # Batch processing
	                            results = process_batch_images(
	                                processor,
	                                image_paths,
	                                format_type,
	                                enable_preprocessing
	                            )
	                            _render_batch_results(results)

	    with tab2:
	        _render_about()

	# Script entry point: build the page only when this file is executed
	# directly (e.g. by `streamlit run`), not when it is imported.
	if __name__ == "__main__":
	    main()