# Hugging Face Space (status: Running)
import streamlit as st | |
import pdfplumber | |
import pytesseract | |
from PIL import Image | |
from transformers import pipeline | |
import re | |
# Ensure Tesseract-OCR is properly configured (Uncomment & update path if needed) | |
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" | |
# Load pre-trained Hugging Face models
# Streamlit re-executes this script on every widget interaction, so the
# pipelines are built inside cached loaders: each model is constructed once
# per server process and reused across reruns and sessions.
@st.cache_resource
def _load_summarizer():
    """Build the summarization pipeline (cached across Streamlit reruns)."""
    return pipeline("summarization", model="t5-small")


@st.cache_resource
def _load_medical_qa():
    """Build the question-answering pipeline (cached across Streamlit reruns)."""
    return pipeline("question-answering", model="deepset/bert-base-cased-squad2")


# Module-level names are kept so the functions below can keep calling them.
summarizer = _load_summarizer()
medical_qa = _load_medical_qa()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of all pages of a PDF, joined with newlines.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The concatenated page text, skipping pages that yield no text, or
        the string "No text found in PDF." when no page yields any text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Call extract_text() once per page; the result is used both to
        # filter out empty pages and as the value to join.
        page_texts = [page.extract_text() for page in pdf.pages]
    text = "\n".join(page_text for page_text in page_texts if page_text)
    return text if text else "No text found in PDF."
# Function to extract text from images (JPG, PNG)
def extract_text_from_image(image_file):
    """Run OCR on an image and return the recognized text.

    Args:
        image_file: A path or file-like object accepted by ``Image.open``.

    Returns:
        The OCR text with surrounding whitespace removed, or the string
        "No text found in Image." when OCR yields nothing but whitespace.
    """
    image = Image.open(image_file)
    raw_text = pytesseract.image_to_string(image)
    # Strip before testing for emptiness: whitespace-only OCR output is
    # truthy, so testing the raw string would return "" instead of the
    # fallback message.
    text = raw_text.strip() if raw_text else ""
    return text if text else "No text found in Image."
# Function to summarize medical report
def summarize_report(text):
    """Summarize the first 500 characters of a report.

    Args:
        text: The report text to summarize.

    Returns:
        The ``summary_text`` field of the first result produced by the
        module-level ``summarizer`` pipeline.
    """
    # Only the leading 500 characters are summarized; slicing leaves
    # shorter inputs unchanged.
    excerpt = text[:500]
    results = summarizer(excerpt, max_length=150, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result['summary_text']
# Function to find medical terms dynamically using regex
# Matches a capitalized word optionally followed by further capitalized
# words joined by a single space or hyphen (e.g. "Non-Hodgkin Lymphoma").
_TERM_PATTERN = re.compile(r'\b[A-Z][a-z]+(?:[ -][A-Z][a-z]+)*\b')


def extract_medical_terms(text):
    """Return the distinct capitalized terms found in ``text``.

    The pattern is purely lexical: it picks up any capitalized word or
    run of capitalized words, not only medical vocabulary.

    Args:
        text: The text to scan.

    Returns:
        A list of unique matches in order of first appearance. A stable
        order matters because callers that slice the list (e.g. the first
        five terms) would otherwise see a different selection on each run.
    """
    matches = _TERM_PATTERN.findall(text)
    # dict.fromkeys de-duplicates while preserving insertion order, unlike
    # set(), whose iteration order for strings varies between processes.
    return list(dict.fromkeys(matches))
# Function to explain medical terms
def explain_term(term):
    """Look up a short explanation of ``term`` in a fixed reference context.

    The explanation is extracted by the module-level ``medical_qa``
    pipeline from a hard-coded two-sentence context, so only the terms
    described there can be explained.

    Args:
        term: The term to explain.

    Returns:
        The answer span extracted from the context, or
        "No explanation available." when the model finds no answer.
    """
    context = "Hypercholesterolemia is a condition with high cholesterol in the blood. Atherosclerosis refers to artery narrowing due to fat buildup."
    response = medical_qa(
        question=f"What is {term}?",
        context=context,
        # Without this flag the pipeline always returns some span of the
        # context, even for terms the context never mentions.
        handle_impossible_answer=True,
    )
    answer = response["answer"].strip()
    return answer if answer else "No explanation available."
# Streamlit UI
st.title("🩺 AI Medical Report Analyzer")
st.write("Upload a medical **PDF or Image (JPG, PNG)** to get a summarized report with term explanations.")

# "jpeg" is accepted alongside "jpg": both are reported as image/jpeg,
# which the image branch below already handles.
uploaded_file = st.file_uploader("Upload a PDF or Image", type=["pdf", "jpg", "jpeg", "png"])

if uploaded_file:
    file_type = uploaded_file.type

    if file_type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
        st.subheader("📜 Extracted Text from PDF:")
    elif file_type in ("image/png", "image/jpeg"):
        text = extract_text_from_image(uploaded_file)
        st.subheader("🖼️ Extracted Text from Image:")
    else:
        # Without this branch an unexpected MIME type would leave `text`
        # unbound and raise NameError at the text area below.
        st.error(f"Unsupported file type: {file_type}")
        st.stop()

    st.text_area("Report Content:", text, height=200)

    if st.button("Generate AI Summary"):
        summary = summarize_report(text)
        st.subheader("📑 AI-Generated Summary:")
        st.markdown(f"**{summary}**")

    if st.button("Explain Medical Terms"):
        terms = extract_medical_terms(text)
        if terms:
            st.subheader("📖 Medical Term Explanations:")
            for term in terms[:5]:  # Limit to 5 terms for efficiency
                explanation = explain_term(term)
                st.markdown(f"**{term}:** {explanation}")
        else:
            st.write("No medical terms detected.")