Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| from paddleocr import PaddleOCR | |
| from langchain_groq import ChatGroq | |
| from langchain.output_parsers import PydanticOutputParser | |
| from langchain_core.prompts import PromptTemplate | |
| from pydantic import BaseModel, Field | |
| import fitz | |
| import json | |
| from PIL import Image | |
# Page configuration is issued before anything else so it stays the first
# Streamlit command executed by the script.
st.set_page_config(layout="wide")


@st.cache_resource(show_spinner=False)
def _load_ocr_engine():
    """Build the Spanish PaddleOCR engine once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the OCR engine would be constructed again on each rerun.

    NOTE(review): ``st.cache_resource`` shares the returned object across
    sessions -- confirm PaddleOCR inference is safe to share if the app
    serves several users concurrently.
    """
    return PaddleOCR(use_angle_cls=True, lang='es')


ocr = _load_ocr_engine()
# Schema of the fields extracted from a Spanish vehicle purchase order.
#
# Every field is a free-form string. The ``description`` of each field is
# embedded in the format instructions sent to the LLM, so the Spanish text
# must be correctly encoded (it was previously mojibake, e.g. "Direcci贸n").
#
# Documented with comments rather than a class docstring on purpose: a
# docstring would be added to the model's schema description and therefore
# change the instructions sent to the LLM.
class CarInfoEntity(BaseModel):
    # --- Dealer details ---
    dealer_name: str = Field(description="Nombre del concesionario o empresa.")
    dealer_address: str = Field(description="Dirección física del concesionario.")
    tax_id: str = Field(description="Número de identificación fiscal del concesionario.")
    contact_phone: str = Field(description="Número de teléfono principal para contactar con el concesionario.")
    contact_fax: str = Field(description="Número de fax del concesionario.")
    contact_email: str = Field(description="Dirección de correo electrónico para consultas.")
    website_url: str = Field(description="Sitio web oficial del concesionario.")
    operating_hours: str = Field(description="Horario habitual de atención del concesionario.")
    saturday_hours: str = Field(description="Horario de atención específico para los sábados.")
    # --- Order details ---
    order_date: str = Field(description="Fecha en que se realizó el pedido.")
    order_number: str = Field(description="Identificador único del pedido.")
    sales_rep: str = Field(description="Nombre del vendedor que maneja la transacción.")
    # --- Customer details ---
    customer_full_name: str = Field(description="Nombre completo del comprador.")
    customer_address: str = Field(description="Dirección del comprador.")
    customer_city: str = Field(description="Ciudad donde reside el comprador.")
    customer_postal_code: str = Field(description="Código postal de la dirección del comprador.")
    customer_province: str = Field(description="Provincia donde se encuentra el comprador.")
    customer_id: str = Field(description="Número de identificación del comprador (NIF).")
    customer_phone: str = Field(description="Número de teléfono del comprador.")
    # --- Vehicle details ---
    vehicle_description: str = Field(description="Descripción del vehículo que se está comprando, incluyendo marca, modelo y año.")
    vehicle_color: str = Field(description="Color del vehículo.")
    vehicle_price: str = Field(description="Precio total del vehículo, incluyendo impuestos.")
# SECURITY: the Groq API key used to be hard-coded here (twice). A key that
# has been committed to source control must be treated as compromised and
# revoked; the key is now supplied through the GROQ_API_KEY environment
# variable (e.g. a deployment secret) and never stored in the code.
groq_api_key = os.environ.get('GROQ_API_KEY')
if not groq_api_key:
    st.error("GROQ_API_KEY is not set. Configure it as an environment variable or deployment secret.")
    st.stop()

# Deterministic (temperature=0) chat model used to turn OCR text into
# structured fields; no token or time limit, up to two retries on failure.
model = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=groq_api_key,
)
# Names of the fields the LLM is asked to extract, in output order.
# They mirror the attributes of CarInfoEntity one-to-one.
entity = [
    # Dealer
    'dealer_name',
    'dealer_address',
    'tax_id',
    'contact_phone',
    'contact_fax',
    'contact_email',
    'website_url',
    'operating_hours',
    'saturday_hours',
    # Order
    'order_date',
    'order_number',
    'sales_rep',
    # Customer
    'customer_full_name',
    'customer_address',
    'customer_city',
    'customer_postal_code',
    'customer_province',
    'customer_id',
    'customer_phone',
    # Vehicle
    'vehicle_description',
    'vehicle_color',
    'vehicle_price',
]
# Streamlit App
#
# Flow: pick a PDF (bundled default or user upload) -> preview its pages in
# the left column -> OCR the file -> ask the LLM to fill CarInfoEntity ->
# show the result in the right column.
import tempfile  # only needed by this section, to store the upload per session

st.title("Vehicle Information Extractor")
st.write("Upload a PDF file to extract vehicle information.")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
use_default = st.checkbox("Use Default Pdf")

# Prompt sent to the LLM; {format_instructions} is filled in by the parser.
prompt_text = """Task: Analyze the {all_text} and find out given entity value:{entity} from the {all_text}:
Output Format: A table with the entity and value. First column contains the {entity} and second column contains the value fetched from the {all_text}.
Do not include any additional explanations or unnecessary details.
{format_instructions}"""

doc = None
pdf_path = None       # file handed to both the previewer and the OCR engine
temp_pdf_path = None  # set only when this run created a file it must delete
source_label = None

if use_default:
    default_pdf_path = "pedido V.O.pdf"
    if os.path.exists(default_pdf_path):
        pdf_path = default_pdf_path
        source_label = "Using default PDF:"
    else:
        st.error("Default PDF not found.")
elif uploaded_file is not None:
    # A unique temp file per run: a fixed "temp.pdf" would be overwritten by
    # other sessions uploading at the same time. getvalue() returns the whole
    # upload regardless of the current stream position.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
    pdf_path = temp_pdf_path = tmp.name
    source_label = "Uploaded PDF:"

if pdf_path is not None:
    try:
        doc = fitz.open(pdf_path)
        st.write(source_label)

        # A document without pages has nothing to preview or extract.
        if len(doc) > 0:
            col1, col2 = st.columns(2)

            # Left column: render each page as an image preview.
            with col1:
                for page_number, page in enumerate(doc, start=1):
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    st.image(img, caption=f"Page {page_number}", use_column_width=True)

            # Perform OCR
            ocr_result = ocr.ocr(pdf_path)
            # PaddleOCR yields None for a page with no detected text, so both
            # levels are guarded. Each detection is [box, (text, score)].
            extracted_text = [
                detection[1][0]
                for page_result in (ocr_result or [])
                for detection in (page_result or [])
            ]
            all_text = " ".join(extracted_text)

            with col2:
                if not all_text.strip():
                    # Nothing to extract from: skip the LLM call entirely.
                    st.warning("No text could be extracted from the PDF.")
                else:
                    parser = PydanticOutputParser(pydantic_object=CarInfoEntity)
                    prompt = PromptTemplate(
                        template=prompt_text,
                        input_variables=["all_text", "entity"],
                        partial_variables={"format_instructions": parser.get_format_instructions()},
                    )
                    chain = prompt | model | parser
                    output = chain.invoke({"all_text": all_text, "entity": entity})

                    st.write("Extracted Vehicle Information (Table):")
                    st.table(output)
    finally:
        # Release the PDF handle and remove the per-run upload copy even if
        # rendering, OCR or the LLM call failed.
        if doc is not None:
            doc.close()
        if temp_pdf_path is not None:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # real error (or a successful result).
                pass