Commit ab74ea1
Parent(s): 13f15f6
hey

- .gitignore +7 -0
- .gradio/certificate.pem +31 -0
- .txt +0 -0
- README.md +9 -6
- app.py +308 -0
- requirements.txt +27 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +732 -0
- web2json/contentextractors.py +379 -0
- web2json/pipeline.py +43 -0
- web2json/postprocessor.py +27 -0
- web2json/preprocessor.py +222 -0
.gitignore
ADDED
@@ -0,0 +1,7 @@
.env
*.ipynb
venv
*.csv
*.json
*.jsonl
vllm*
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
.txt
ADDED
File without changes
README.md
CHANGED
@@ -1,12 +1,15 @@
 ---
-title: MCP
-emoji:
-colorFrom:
-colorTo:
+title: MCP Server Web2JSON
+emoji: 🖇️
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.33.0
 app_file: app.py
-pinned:
+pinned: True
+tags: [mcp-server-track]
 ---

+[Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,308 @@
import json
import pandas as pd
import gradio as gr
from typing import Dict, Any, Type
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient, ModalRerankerClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline
from pydantic import BaseModel, Field, create_model
import os
import dotenv
import random
import numpy as np
import torch

dotenv.load_dotenv()

def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if using multi-GPU

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(22)

def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.
    Supports multiple input formats:
    1. JSON schema format
    2. Python class definition
    3. Simple field definitions
    """
    schema_input = schema_input.strip()

    if not schema_input:
        # Default schema if none provided
        return create_model('DefaultSchema',
                            title=(str, Field(description="Title of the content")),
                            content=(str, Field(description="Main content")))

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            schema_dict = json.loads(schema_input)
            return json_schema_to_basemodel(schema_dict)

        # Try parsing as Python class definition
        elif 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)

        # Try parsing as simple field definitions
        else:
            return simple_fields_to_basemodel(schema_input)

    except Exception as e:
        raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")

def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert JSON schema to BaseModel"""
    fields = {}
    properties = schema_dict.get('properties', {})
    required = schema_dict.get('required', [])

    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            fields[field_name] = (field_type, Field(default=None, description=field_description))

    return create_model('DynamicSchema', **fields)

def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel"""
    try:
        # Execute the class definition in a restricted namespace
        namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
                     'float': float, 'bool': bool, 'list': list, 'dict': dict}
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for name, obj in namespace.items():
            if (isinstance(obj, type) and
                    issubclass(obj, BaseModel) and
                    obj != BaseModel):
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}")

def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Convert simple field definitions to BaseModel"""
    fields = {}

    for line in fields_text.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Parse field definition (e.g., "name: str = description")
        if ':' in line:
            parts = line.split(':', 1)
            field_name = parts[0].strip()

            type_and_desc = parts[1].strip()
            if '=' in type_and_desc:
                type_part, desc_part = type_and_desc.split('=', 1)
                field_type = get_python_type(type_part.strip())
                description = desc_part.strip().strip('"\'')
            else:
                field_type = get_python_type(type_and_desc.strip())
                description = ""

            fields[field_name] = (field_type, Field(description=description))
        else:
            # Simple field name only
            field_name = line.strip()
            fields[field_name] = (str, Field(description=""))

    if not fields:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **fields)

def get_python_type(type_str: str):
    """Convert type string to Python type"""
    type_str = type_str.lower().strip()
    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict
    }
    return type_mapping.get(type_str, str)

def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Wrapper function that converts schema input to BaseModel"""
    try:
        # Parse the schema input into a BaseModel
        schema_model = parse_schema_input(schema_input)

        # Call the original function
        return webpage_to_json(content, is_url, schema_model)

    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
    """
    Extracts structured JSON information from the given content based on a specified schema.

    This function sets up a processing pipeline that includes:
    - Preprocessing the input content.
    - Using an AI language model to extract information according to the provided schema.
    - Postprocessing the extracted output to match the exact schema requirements.

    Parameters:
        content (str): The input content to be analyzed. This can be direct text or a URL.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types for the output.

    Returns:
        Dict[str, Any]: A dictionary containing the extracted data matching the schema. If an error occurs
        during initialization or processing, the dictionary includes an "error" key with a descriptive message.
    """
    prompt_template = """Extract the following information from the provided content according to the specified schema.

Content to analyze:
{content}

Schema requirements:
{schema}

Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema
- STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
- IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS
- OR I WILL KILL AND KIDNAP YOUR FAMILY AND TORTURE THEM"""

    classification_prompt_template = schema.model_json_schema()

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    preprocessor = BasicPreprocessor(config={'keep_tags': True})
    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'google/gemma-3n-e2b-it'})
        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        result = pipeline.run(content, is_url, schema)
        print("-" * 80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

# Example schemas for the user
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
    "properties": {
        "title": {"type": "string", "description": "Page title"},
        "price": {"type": "number", "description": "Product price"},
        "description": {"type": "string", "description": "Product description"}
    },
    "required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""

# Build the Gradio interface
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here."
        ),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Textbox(
            label="Schema Definition",
            lines=15,
            placeholder="Define your extraction schema (see examples below)",
            info=example_schemas
        )
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=[
        [
            "https://example.com",
            True,
            "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
        ],
        [
            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
            False,
            '''{
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Name of the product"
                    },
                    "price": {
                        "type": "number",
                        "description": "Price of the product"
                    },
                    "description": {
                        "type": "string",
                        "description": "Detailed description of the product"
                    },
                    "availability": {
                        "type": "boolean",
                        "description": "Whether the product is in stock (true) or not (false)"
                    }
                },
                "required": ["title", "price"]
            }'''
        ]
    ]
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)
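A quick illustration of the schema-parsing path above: for the "simple field definitions" format, `simple_fields_to_basemodel` boils down to the `create_model` pattern sketched below. This is a minimal, self-contained sketch (the two-field schema text is made up for the example), not part of the commit:

```python
from pydantic import Field, create_model

# Schema text in the app's "name: type = description" format (illustrative input).
schema_text = """title: str = Page title
price: float = Product price"""

fields = {}
for line in schema_text.strip().split("\n"):
    name, rest = line.split(":", 1)          # field name vs. "type = description"
    type_part, desc = rest.split("=", 1)
    py_type = {"str": str, "float": float}.get(type_part.strip(), str)
    fields[name.strip()] = (py_type, Field(description=desc.strip()))

DynamicSchema = create_model("DynamicSchema", **fields)
# This JSON schema is what webpage_to_json interpolates into the LLM prompt.
print(DynamicSchema.model_json_schema())
```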
requirements.txt
ADDED
@@ -0,0 +1,27 @@
pandas
gradio
gradio[mcp]
pydantic
python-dotenv
beautifulsoup4
requests
google-genai
json_repair
numpy
langchain
langchain-text-splitters
sentence-transformers
openai
html_chunking
langchain_nvidia_ai_endpoints
langchain_core
lxml
pdfkit
html2text
inscriptis
trafilatura
markdownify
beautifulsoup4
readabilipy
docling
htmlrag
web2json/__pycache__/ai_extractor.cpython-311.pyc
ADDED
Binary file (41.3 kB).
web2json/__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (2.29 kB).
web2json/__pycache__/postprocessor.cpython-311.pyc
ADDED
Binary file (1.64 kB).
web2json/__pycache__/preprocessor.cpython-311.pyc
ADDED
Binary file (9.89 kB).
web2json/ai_extractor.py
ADDED
@@ -0,0 +1,732 @@
import os
import time
import random
import re
import json
import numpy as np
import requests
import torch
import trafilatura
from google import genai
from google.genai import types
from openai import OpenAI, RateLimitError
from functools import wraps
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor
from html_chunking import get_html_chunks
from langchain_nvidia_ai_endpoints import NVIDIARerank
from langchain_core.documents import Document
from abc import ABC, abstractmethod
from typing import List, Any, Dict, Tuple, Optional
from langchain_text_splitters import HTMLHeaderTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tenacity import retry, wait_exponential, stop_after_attempt


class LLMClient(ABC):
    """
    Abstract base class for calling LLM APIs.
    """
    def __init__(self, config: dict = None):
        """
        Initializes the LLMClient with a configuration dictionary.

        Args:
            config (dict): Configuration settings for the LLM client.
        """
        self.config = config or {}

    @abstractmethod
    def call_api(self, prompt: str) -> str:
        """
        Call the underlying LLM API with the given prompt.

        Args:
            prompt (str): The prompt or input text for the LLM.

        Returns:
            str: The response from the LLM.
        """
        pass


class RerankerClient(ABC):
    """
    Abstract base class for reranker APIs.
    """
    def __init__(self, config: dict = None):
        """
        Initializes the RerankerClient with a configuration dictionary.

        Args:
            config (dict): Configuration settings for the reranker client.
        """
        self.config = config or {}

    @abstractmethod
    def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
        """
        Rerank passages based on relevance to the query.

        Args:
            query (str): Query string.
            passages (List[str]): List of passages.
            top_k (int): Number of top passages to return.

        Returns:
            List[str]: Top-k most relevant passages.
        """
        pass


class GeminiLLMClient(LLMClient):
    """
    Concrete implementation of LLMClient for the Gemini API.
    """

    def __init__(self, config: dict):
        """
        Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.

        Args:
            config (dict): Configuration containing:
                - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
                - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
                - 'generation_config': (optional) dict of GenerateContentConfig parameters
        """
        api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError(
                "API key for Gemini must be provided in config['api_key'] or the GEMINI_API_KEY env var."
            )
        self.client = genai.Client(api_key=api_key)
        self.model_name = config.get("model_name", "gemini-2.0-flash")
        # Allow custom generation settings, falling back to sensible defaults.
        gen_conf = config.get("generation_config", {})
        self.generate_config = types.GenerateContentConfig(
            response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
            temperature=gen_conf.get("temperature"),
            max_output_tokens=gen_conf.get("max_output_tokens"),
            top_p=gen_conf.get("top_p"),
            top_k=gen_conf.get("top_k"),
            # add any other fields you want to expose
        )

    def call_api(self, prompt: str) -> str:
        """
        Call the Gemini API with the given prompt (non-streaming).

        Args:
            prompt (str): The input text for the API.

        Returns:
            str: The generated text from the Gemini API.
        """
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)],
            )
        ]

        # A non-streaming call returns a full response object.
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=contents,
            config=self.generate_config,
        )

        return response.text


def extract_markdown_json(text: str) -> Optional[Dict[str, Any]]:
    """
    Find the first Markdown ```json ...``` block in `text`,
    parse it as JSON, and return the resulting dict.
    Returns None if no valid JSON block is found.
    """
    # Look specifically for a ```json code fence.
    fence_match = re.search(
        r"```json\s*(\{.*?\})\s*```",
        text,
        re.DOTALL | re.IGNORECASE
    )
    if not fence_match:
        return None

    json_str = fence_match.group(1)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return None


def retry_on_ratelimit(max_retries=5, base_delay=1.0, max_delay=10.0):
    """Retry decorator for RateLimitError, with exponential back-off and jitter."""
    def deco(fn):
        @wraps(fn)
        def wrapped(*args, **kwargs):
            delay = base_delay
            for attempt in range(max_retries):
                try:
                    return fn(*args, **kwargs)
                except RateLimitError:
                    if attempt == max_retries - 1:
                        # Give up after the final attempt.
                        raise
                    # Back off with jitter, then double the delay.
                    sleep = min(max_delay, delay) + random.uniform(0, delay)
                    time.sleep(sleep)
                    delay *= 2
        return wrapped
    return deco


class NvidiaLLMClient(LLMClient):
    """
    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
    """

    def __init__(self, config: dict):
        """
        Initializes the NvidiaLLMClient with an API key, model name, and optional generation settings.

        Args:
            config (dict): Configuration containing:
                - 'api_key': (optional) API key for NVIDIA (falls back to NVIDIA_API_KEY env var)
                - 'model_name': (optional) the model to use (default 'google/gemma-3-1b-it')
                - 'generation_config': (optional) dict of generation parameters like temperature, top_p, etc.
        """
        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY")
        if not api_key:
            raise ValueError(
                "API key for NVIDIA must be provided in config['api_key'] or the NVIDIA_API_KEY env var."
            )

        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key
        )
        self.model_name = config.get("model_name", "google/gemma-3-1b-it")

        # Store generation settings with sensible defaults.
        gen_conf = config.get("generation_config", {})
        self.temperature = gen_conf.get("temperature", 0)
        self.top_p = gen_conf.get("top_p", 0.7)
        self.max_tokens = gen_conf.get("max_tokens", 8192)

    def set_model(self, model_name: str):
        """
        Set the model name for the NVIDIA API client.

        Args:
            model_name (str): The name of the model to use.
        """
        self.model_name = model_name

    @retry_on_ratelimit(max_retries=20, base_delay=0.5, max_delay=5.0)
    def call_api(self, prompt: str) -> str:
        """
        Call the NVIDIA API with the given prompt (non-streaming).

        Args:
            prompt (str): The input text for the API.

        Returns:
            str: The generated text from the NVIDIA API.
        """
        print("prompt: ", prompt)
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
            top_p=self.top_p,
            max_tokens=self.max_tokens,
            extra_body={"chat_template_kwargs": {"thinking": True}},
            # stream is omitted (defaults to False)
        )
        # For the standard (non-streaming) response,
        # choices[0].message.content holds the generated text.
        return response.choices[0].message.content

    def call_batch(self, prompts, max_workers=8):
        """
        Parallel batch with isolated errors: each prompt that still
        fails after retries is marked individually, while the others succeed.
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed
        results = [None] * len(prompts)
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futures = {ex.submit(self.call_api, p): i for i, p in enumerate(prompts)}
            for fut in as_completed(futures):
                idx = futures[fut]
                try:
                    results[idx] = fut.result()
                except RateLimitError:
                    # Could also be None or any other default marker.
                    results[idx] = "<failed after retries>"
        return results


class NvidiaRerankerClient(RerankerClient):
    """
    Concrete implementation of RerankerClient for the NVIDIA reranking API (non-streaming).
    """

    def __init__(self, config: dict):
        self.model_name = config.get("model_name", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
        self.client = NVIDIARerank(
            model=self.model_name,
            api_key=os.getenv("NVIDIA_API_KEY"),
        )

    def set_model(self, model_name: str):
        """
        Set the model name for the NVIDIA API client.

        Args:
            model_name (str): The name of the model to use.
        """
        self.model_name = model_name

    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
        # 1. Prepare and send documents for scoring.
        docs = [Document(page_content=p) for p in passages]
        scored_docs = self.client.compress_documents(
            query=str(query),
            documents=docs
        )

        # 2. Extract raw scores and compute sigmoid probabilities.
        raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
        print(f"Raw scores: {raw_scores}")
        p_scores = 1 / (1 + np.exp(-raw_scores))
        print(f"Sigmoid scores: {p_scores}")

        # 3. Max normalization.
        max_score = np.max(p_scores)
        if max_score == 0:
            norm_scores = np.zeros_like(p_scores)
        else:
            norm_scores = p_scores / max_score
        print(f"Normalized scores: {norm_scores}")

        # 4. Filter by threshold using normalized scores.
        scored_pairs = [(doc, norm) for doc, norm in zip(scored_docs, norm_scores) if norm > threshold]
        print(f"Filtered pairs:\n{scored_pairs}")

        # 5. Return the surviving documents (already sorted by the model, no need to re-sort).
        top_docs = [doc.page_content for doc, _ in scored_pairs]
        return top_docs

    # TODO: will I need it?
    # def call_batch(self, prompts, max_workers=8):
    #     pass


def retry_on_error(fn):
    """Simple retry decorator (exponential back-off, max 6 tries)."""
    return retry(
        wait=wait_exponential(multiplier=0.5, min=0.5, max=5),
        stop=stop_after_attempt(6),
        reraise=True,
    )(fn)


class ModalRerankerClient(RerankerClient):
    """Client for the Modal Qwen3-Reranker endpoint (non-streaming)."""

    def __init__(self, endpoint_url: str):
        self.endpoint_url = endpoint_url.rstrip("/")  # ensure no trailing slash

    def set_endpoint(self, url: str):
        self.endpoint_url = url.rstrip("/")

    @retry_on_error
    def rerank(
        self,
        query: str,
        passages: List[str],
        threshold: float = 0.5,
    ) -> List[Document]:
        """Call the remote endpoint and return the passages that clear the threshold."""
        if not isinstance(query, str):
            query = str(query)
        payload = {"query": query, "passages": passages}
        print(payload)
        res = requests.post(self.endpoint_url, json=payload, timeout=60)
        res.raise_for_status()
        data = res.json()

        # The endpoint already returns probabilities in [0, 1]; extract them.
        ranked = data.get("ranked_passages", [])
        scores = np.array([p["score"] for p in ranked], dtype=float)

        # Max normalization.
        max_score = scores.max() if len(scores) > 0 else 1.0
        if max_score == 0:
            norm_scores = np.zeros_like(scores)
        else:
            norm_scores = scores / max_score

        # Filter by threshold using normalized scores.
        filtered = [
            (p, norm) for p, norm in zip(ranked, norm_scores) if norm >= threshold
        ]
        # Convert to LangChain Documents.
        docs = [
            Document(page_content=p["passage"], metadata={"score": p["score"], "norm_score": norm})
            for p, norm in filtered
        ]

        return docs


class HFRerankerClient(LLMClient):
    """
    Hugging Face reranker client using the Qwen/Qwen3-Reranker-0.6B model.
    """

    def __init__(self, model_name: str = "Qwen/Qwen3-Reranker-0.6B", device: str = None):
        """
        Initialize the Hugging Face reranker.
        """
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")

    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
        """
        Rerank passages based on relevance to the query using min-max normalized scores.

        Args:
            query (str): Query string.
            passages (List[str]): List of passages.
            top_k (int): Number of top passages to return.
            threshold (float): Minimum normalized score to include a passage.

        Returns:
            List[str]: Top-k most relevant passages above the threshold.
        """
        inputs = [
            self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device)
            for p in passages
        ]
        scores = []

        with torch.no_grad():
            for inp in inputs:
                logits = self.model(**inp).logits
                score = torch.softmax(logits, dim=1)[0, 1].item()  # probability of relevance
                scores.append(score)

        print(f"Softmax scores: {scores}")

        # Min-max normalize the scores.
        scores_np = np.array(scores)
        min_score = scores_np.min()
        max_score = scores_np.max()
        if max_score == min_score:
            norm_scores = np.ones_like(scores_np)
        else:
            norm_scores = (scores_np - min_score) / (max_score - min_score)

        print(f"Normalized scores: {norm_scores}")

        # Filter based on the normalized threshold.
        filtered = [(i, s) for i, s in enumerate(norm_scores) if s > threshold]
        print(f"Filtered: {filtered}")

        # Sort by normalized score, descending.
        filtered.sort(key=lambda x: x[1], reverse=True)

        # Select the top_k passages.
        top_passages = [passages[i] for i, _ in filtered[:top_k]]

        return top_passages

    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
    def call_api(self, prompt: str) -> str:
        pass

    def call_batch(self, prompts, max_workers=8):
        pass


class AIExtractor:
    def __init__(self, llm_client: LLMClient, prompt_template: str):
        """
        Initializes the AIExtractor with a specific LLM client and configuration.

        Args:
            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
            prompt_template (str): The template to use for generating prompts for the LLM.
                Should contain placeholders for dynamic content,
                e.g., "Extract the following information: {content} based on schema: {schema}"
        """
        self.llm_client = llm_client
        self.prompt_template = prompt_template

    def extract(self, content: str, schema: BaseModel) -> str:
        """
        Extracts structured information from the given content based on the provided schema.

        Args:
            content (str): The raw content to extract information from.
            schema (BaseModel): A Pydantic model defining the structure of the expected output.

        Returns:
            str: The structured JSON object as a string.
        """
        prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
        response = self.llm_client.call_api(prompt)
        return response


class LLMClassifierExtractor(AIExtractor):
    """
    Extractor that uses a reranker to classify relevant chunks and an LLM to extract
    structured information from them according to the provided schema.
    """
    def __init__(self, reranker: RerankerClient, llm_client: LLMClient, prompt_template: str, classifier_prompt: str):
        """
        Initializes the LLMClassifierExtractor with a reranker, an LLM client, and prompt templates.

        Args:
            reranker (RerankerClient): The reranker used to score and filter chunks.
            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
            prompt_template (str): The template to use for generating prompts for the LLM.
            classifier_prompt (str): The query given to the reranker to classify chunks.
        """
        super().__init__(llm_client, prompt_template)
        self.reranker = reranker
        self.classifier_prompt = classifier_prompt

    def chunk_content(self, content: str, max_tokens: int = 500, is_clean: bool = True) -> List[str]:
        """
        Splits the content into manageable chunks for processing.

        Args:
            content (str): The raw content to be chunked.

        Returns:
            List[str]: A list of text chunks.
        """
        # Use get_html_chunks to split the content into chunks.
        return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)

    def classify_chunks(self, passages, top_k=3, hf: bool = False):
        query = self.classifier_prompt

        if hf:
            # Hugging Face reranker path.
            return self.reranker.rerank(query, passages, top_k=top_k)

        # NVIDIA/Modal reranker path.
        response = self.reranker.rerank(query, passages)
        print(f"response: {response}")
        return response

    def extract(self, content, schema, hf: bool = False):
        """
        Extracts structured information from the given content based on the provided schema.

        Args:
            content (str): The raw content to extract information from.
            schema (BaseModel): A Pydantic model defining the structure of the expected output.
            hf (bool): Whether to use the Hugging Face reranker instead of the NVIDIA/Modal one (default).
        """
        chunks = self.chunk_content(content, max_tokens=500)
        print(f"Content successfully chunked into {len(chunks)} pieces.")
        # NOTE: optional extra preprocessing, currently disabled:
        # chunks = [trafilatura.extract(chunk, favor_recall=True) for chunk in chunks]
        # chunks = [chunk for chunk in chunks if chunk is not None]
        classified_chunks = self.classify_chunks(chunks, hf=hf)  # conditional reranker

        if classified_chunks and isinstance(classified_chunks[0], Document):
            classified_chunks = [chunk.page_content for chunk in classified_chunks]
        print(f"Classified chunks: {len(classified_chunks)}")
        filtered_content = "\n\n".join(classified_chunks)

        if not filtered_content:
            print("Warning: No relevant chunks found. Returning empty response.")
            return "{}"

        prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
        llm_response = self.llm_client.call_api(prompt)

        return llm_response or "{}"


class RAGExtractor(AIExtractor):
    """
    RAG-enhanced extractor that uses similarity search to find relevant chunks
    before performing extraction, using HTML header-based chunking and SentenceTransformer embeddings.
    """

    def __init__(self,
                 llm_client: LLMClient,
                 prompt_template: str,
                 embedding_model_path: str = "sentence-transformers/all-mpnet-base-v2",
                 top_k: int = 3):
        """
        Initialize the RAG extractor with embedding and chunking capabilities.

        Args:
            llm_client: LLM client for generation.
            prompt_template: Template for prompts.
            embedding_model_path: Path/name of the SentenceTransformer embedding model.
            top_k: Number of top similar chunks to retrieve.
        """
        super().__init__(llm_client, prompt_template)
        self.embedding_model_path = embedding_model_path
        # Initialize the SentenceTransformer model for embeddings.
        self.embedding_model_instance = SentenceTransformer(self.embedding_model_path)
        self.top_k = top_k

    @staticmethod
    def _langchain_HHTS(text: str) -> List[str]:
        """
        Chunks HTML text using LangChain's HTMLHeaderTextSplitter based on h1 and h2 headers.

        Args:
            text (str): The HTML content to chunk.

        Returns:
            List[str]: A list of chunked text strings (the page_content of each Document).
        """
        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            # ("h3", "Header 3"),  # deliberately not split on
        ]
        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        return [doc.page_content for doc in html_splitter.split_text(text)]

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate an embedding for the text using the initialized SentenceTransformer model.

        Args:
            text: The text string to embed.

        Returns:
            np.ndarray: The embedding vector for the input text as a NumPy array, or None on failure.
        """
        try:
            return self.embedding_model_instance.encode(text)
        except Exception as e:
            print(f"Warning: Embedding failed for text: '{text[:50]}...': {e}")
            return None

    def search_similar_chunks(self,
                              query: str,
                              chunks: List[str],
                              embeddings: np.ndarray) -> List[str]:
        """
        Find the most similar chunks to the query within the given list of chunks
        by computing cosine similarity between their embeddings.

        Args:
            query (str): The query text whose embedding is used for the similarity comparison.
            chunks (List[str]): A list of text chunks to search within.
            embeddings (np.ndarray): Precomputed embeddings corresponding to the 'chunks' list.

        Returns:
            List[str]: The 'top_k' chunks most similar to the query.
        """
        query_embedding = self.embed_text(query)

        similarities = []

        if query_embedding.ndim > 1:
            query_embedding = query_embedding.flatten()

        for i, chunk_embedding in enumerate(embeddings):
            if chunk_embedding.ndim > 1:
                chunk_embedding = chunk_embedding.flatten()

            norm_query = np.linalg.norm(query_embedding)
            norm_chunk = np.linalg.norm(chunk_embedding)

            if norm_query == 0 or norm_chunk == 0:
                similarity = 0.0
            else:
                similarity = np.dot(query_embedding, chunk_embedding) / (norm_query * norm_chunk)
            similarities.append((similarity, i))

        similarities.sort(key=lambda x: x[0], reverse=True)
        top_indices = [idx for _, idx in similarities[:self.top_k]]

        return [chunks[i] for i in top_indices]

    def extract(self, content: str, schema: BaseModel, query: str = None) -> str:
        """
        Overrides the base AIExtractor's method to implement RAG-enhanced extraction.
        This function first chunks the input HTML content, then uses a query to find
        the most relevant chunks via embedding similarity, and finally sends these
        relevant chunks as context to the LLM for structured information extraction.

        Args:
            content (str): The raw HTML content from which to extract information.
            schema (BaseModel): A Pydantic model defining the desired output structure for the LLM.
            query (str, optional): An optional query string to guide the retrieval of relevant chunks.
                If not provided, a default query based on the schema is used.

        Returns:
            str: The structured JSON object as a string, as generated by the LLM.
        """
        start_time = time.time()

        if not query:
            query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"

        chunks = self._langchain_HHTS(content)
        print(f"Content successfully chunked into {len(chunks)} pieces.")

        combined_content_for_llm = ""
        if not chunks:
            print("Warning: No chunks were generated from the provided content. The entire original content will be sent to the LLM.")
            combined_content_for_llm = content
        else:
            chunk_embeddings = np.array([self.embed_text(chunk) for chunk in chunks])
            print(f"Generated embeddings for {len(chunks)} chunks.")

            similar_chunks = self.search_similar_chunks(query, chunks, chunk_embeddings)
            print(f"Retrieved {len(similar_chunks)} similar chunks based on the query.")

            combined_content_for_llm = "\n\n".join(similar_chunks)
            print(f"Combined content for LLM (truncated): '{combined_content_for_llm[:200]}...'")

        prompt = self.prompt_template.format(content=combined_content_for_llm, schema=schema.model_json_schema())
        print(f"Sending prompt to LLM (truncated): '{prompt[:500]}...'")
        llm_response = self.llm_client.call_api(prompt)

        execution_time = (time.time() - start_time) * 1000
        print(f"Extraction process completed in {execution_time:.2f} milliseconds.")
        print(f"LLM's final response: {llm_response}")
        print("=" * 78)

        return llm_response
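The reranker clients above all end with the same step: normalize the relevance scores, then keep only passages that clear a threshold. Below is a minimal, self-contained sketch of the NvidiaRerankerClient variant (sigmoid followed by max normalization); the scores are made up for illustration:

```python
import numpy as np

raw_scores = np.array([2.1, -0.3, 0.8])            # raw relevance scores from a reranker (illustrative)
p_scores = 1 / (1 + np.exp(-raw_scores))           # sigmoid -> pseudo-probabilities in (0, 1)
max_score = p_scores.max()
norm = p_scores / max_score if max_score != 0 else np.zeros_like(p_scores)
keep = [i for i, s in enumerate(norm) if s > 0.5]  # indices that clear the 0.5 threshold
print(norm, keep)                                  # approx. [1.0, 0.48, 0.77] and [0, 2]
```

One consequence of max normalization: the best-scoring passage always normalizes to exactly 1.0, so at least one passage survives any threshold below 1. Min-max normalization (used by HFRerankerClient) instead pins the worst passage at 0.0.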
web2json/contentextractors.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import json
|
4 |
+
import pdfkit
|
5 |
+
import requests
|
6 |
+
import warnings
|
7 |
+
import tempfile
|
8 |
+
# import textract
|
9 |
+
import html2text
|
10 |
+
import inscriptis
|
11 |
+
import trafilatura
|
12 |
+
from pathlib import Path
|
13 |
+
from markdownify import markdownify
|
14 |
+
from json_repair import repair_json
|
15 |
+
from bs4 import BeautifulSoup, Comment
|
16 |
+
from html_chunking import get_html_chunks
|
17 |
+
from urllib.error import URLError, HTTPError
|
18 |
+
from html_to_markdown import convert_to_markdown
|
19 |
+
from readabilipy import simple_json_from_html_string
|
20 |
+
from docling.document_converter import DocumentConverter
|
21 |
+
from dateparser_scripts.update_supported_languages_and_locales import to_string
|
22 |
+
|
23 |
+
|
24 |
+
def clean_html(html_content: str) -> str:
|
25 |
+
"""
|
26 |
+
Cleans up the given HTML content by:
|
27 |
+
- Removing <script> and <style> tags and their content.
|
28 |
+
- Removing HTML comments.
|
29 |
+
- Extracting and returning the visible text with normalized whitespace.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
html_content (str): The HTML content to clean.
|
33 |
+
|
34 |
+
Returns:
|
35 |
+
str: The cleaned, visible text from the HTML.
|
36 |
+
"""
|
37 |
+
# Parse the HTML content
|
38 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
39 |
+
|
40 |
+
# Remove script and style elements
|
41 |
+
# Remove unwanted tags
|
42 |
+
for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
|
43 |
+
"tfoot", "header", "footer", "link", "rel"]):
|
44 |
+
tag.decompose()
|
45 |
+
|
46 |
+
# Remove elements that do not contain any visible text
|
47 |
+
for element in soup.find_all():
|
48 |
+
# If the element has no text (after stripping whitespace), remove it
|
49 |
+
if not element.get_text(strip=True):
|
50 |
+
element.decompose()
|
51 |
+
|
52 |
+
# Remove HTML comments
|
53 |
+
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
54 |
+
comment.extract()
|
55 |
+
|
56 |
+
# Extract text and normalize whitespace
|
57 |
+
# text = soup.get_text(separator=" ", strip=True)
|
58 |
+
# clean_text = re.sub(r'\s+', ' ', text)
|
59 |
+
|
60 |
+
# return clean_text
|
61 |
+
return str(soup)
|
62 |
+
|
63 |
+
|
64 |
+
def print_content_extractors():
|
65 |
+
print(
|
66 |
+
[
|
67 |
+
"Default: the plain text of the HTML page",
|
68 |
+
"Inscriptis",
|
69 |
+
"Trafilatura",
|
70 |
+
]
|
71 |
+
)
|
72 |
+
|
73 |
+
|
class ContentExtractor:
    def get_text(self, html):
        return clean_html(html)

    # TODO: Clean this mess
    def url_to_html(self, url, clean=False):
        # Custom headers that mimic a browser request to reduce the chance of being blocked
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1"
        }

        try:
            response = requests.get(url, headers=headers, timeout=10)

            if response.status_code != 200:
                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
                return None

            html = response.text

            if clean:
                return self.get_text(html)

            return html

        except requests.exceptions.RequestException as e:
            # requests raises RequestException subclasses for connection, timeout, and HTTP errors
            print(f"Request error: {e}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None

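A minimal usage sketch, assuming the target URL is reachable (example.com is a placeholder):

    extractor = ContentExtractor()
    html = extractor.url_to_html("https://example.com", clean=False)
    if html is not None:
        text = extractor.get_text(html)  # same result as passing clean=True above
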
class Inscriptis(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
        }

        warnings.warn("\nBeware: pass only clean links with no trackers, or this may produce unexpected results.")

    def get_text(self, html):
        """Extract text from HTML using inscriptis."""
        return inscriptis.get_text(html)

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

class Docling(ContentExtractor):
    def __init__(self):
        super().__init__()

    # TODO: Converting via a temp file is a workaround; with the docling docs site down,
    # this is what works for now.
    def get_text(self, text_content):
        result = None
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
            tmpfile.write(text_content)
            tmpfile.flush()
            tmpfile_path = Path(tmpfile.name.replace("\\", "/"))
        try:
            converter = DocumentConverter()
            document = converter.convert(tmpfile_path).document
            # Export tables separately so they survive the markdown conversion
            tables = [table.export_to_markdown() for table in document.tables]

            result = document.export_to_markdown()
            for table in tables:
                result += "\n\n" + table
        finally:
            os.remove(tmpfile_path)
        return result

class ReadabiliPy(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html):
        content = simple_json_from_html_string(html, use_readability=True)
        json_object = json.dumps(content, indent=4)
        repaired = repair_json(json_object)
        return repaired

class Trafilatura(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }

        warnings.warn("\nTrafilatura Content Extractor: Beware, pass only clean links with no trackers, or it may produce unexpected results.")

        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG
        config = deepcopy(DEFAULT_CONFIG)
        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000'  # configurable; this value worked well in practice
        self.config = config

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)

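For illustration, the intended call pattern (placeholder URL):

    traf = Trafilatura()
    html = traf.url_to_html("https://example.com/article")
    markdown = traf.get_text(html, output_format="markdown")
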
class Markdownify(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        # Note: markdownify's strip option takes tag names; 'a' strips hyperlinks
        md = markdownify(alt, strip=['a', 'table', 'tr', 'td', 'header', 'footer'])

        # Drop any remaining markdown links/images
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Collapse extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md

class HTML2Text(ContentExtractor):
    def get_text(self, html):
        converter = html2text.HTML2Text()
        converter.ignore_tables = True
        converter.ignore_links = True
        converter.ignore_images = True
        converter.ignore_mailto_links = True
        return converter.handle(html)

class HTML_TO_Markdown(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = convert_to_markdown(alt, strip=['a', 'table', 'tr', 'td', 'header', 'footer'])

        # Drop any remaining markdown links/images
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Collapse extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()

        return md

class PDFkitDocling(ContentExtractor):
    def get_text(self, html):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other non-content tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        # Render the cleaned HTML to a PDF in the working directory, then parse it with docling
        pdf_path = 'test.pdf'
        pdfkit.from_string(content, pdf_path)

        converter = DocumentConverter()

        return converter.convert(pdf_path).document.export_to_markdown()

class TrafilatraCHUNKS(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        # Remove <a>, <link>, <img>, and other non-content tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        # Split the cleaned HTML into token-bounded chunks, extract each with trafilatura,
        # and drop chunks that could not be parsed
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"

        return combined_text

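Chunking bounds how much HTML trafilatura sees at once, which helps on very long pages where a single-pass extraction can truncate or fail. A minimal sketch (placeholder URL):

    chunked = TrafilatraCHUNKS()
    html = chunked.url_to_html("https://example.com/long-page")
    text = chunked.get_text(html, max_tokens=1000)
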
class TrafilaCHUNKSRobust(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        # Drop elements whose attributes suggest navigation (e.g. class="navbar")
        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"

        return combined_text

class TrafilaCHUNKSRobustV2(ContentExtractor):
    """Same as TrafilaCHUNKSRobust, but without the nav-attribute filter."""

    def __init__(self):
        super().__init__()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")

        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = str(soup)

        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]

        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"

        return combined_text

# Kept for reference; textract performed poorly on HTML input.
# class Textract(ContentExtractor):
#     def get_text(self, html):
#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
#             tmpfile.write(html)
#             tmpfile.flush()
#             tmpfile_path = Path(tmpfile.name.replace("\\", "/"))
#         try:
#             result = textract.process(tmpfile_path)
#         finally:
#             os.remove(tmpfile_path)
#         return result
web2json/pipeline.py ADDED @@ -0,0 +1,43 @@
from web2json.ai_extractor import *
from web2json.postprocessor import *
from web2json.preprocessor import *
from pydantic import BaseModel

class Pipeline:
    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel, hf=False) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.
            hf (bool): Backend flag forwarded to the extractor.

        Returns:
            dict: The final structured data after processing.
        """
        # Step 1: Preprocess the content
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        print('+' * 80)
        # Step 2: Extract structured information using AI
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema, hf=hf)
        print('+' * 80)
        # Step 3: Post-process the extracted data
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print('+' * 80)

        return final_output

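A hypothetical wiring of the three stages. The `Product` schema and the `ai_extractor` variable are illustrative stand-ins; any AIExtractor implementation from web2json/ai_extractor.py fits here:

    from pydantic import BaseModel

    class Product(BaseModel):  # illustrative schema
        name: str
        price: float

    pipeline = Pipeline(
        preprocessor=BasicPreprocessor(config={'keep_tags': True}),
        ai_extractor=ai_extractor,  # placeholder: any AIExtractor instance
        postprocessor=PostProcessor(),
    )
    result = pipeline.run("https://example.com/product", is_url=True, schema=Product)
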
web2json/postprocessor.py ADDED @@ -0,0 +1,27 @@
from json_repair import repair_json
import json

class PostProcessor:

    def process(self, response: str) -> dict:
        json_response = {}
        try:
            # Extract the JSON from the generated text, handling variations in output format
            json_string = response
            if "```json" in response:
                json_string = response.split("```json")[1].split("```")[0]
            elif "{" in response and "}" in response:
                # Grab everything between the first '{' and the last '}'
                start_index = response.find("{")
                end_index = response.rfind("}") + 1
                json_string = response[start_index:end_index]

            # repair_json fixes common LLM output issues before parsing
            json_response = json.loads(repair_json(json_string))
        except Exception as e:
            print(f"Error parsing JSON: {e}")
            print(f"Generated text: {response}")
            json_response = {}

        return json_response

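For illustration, how process handles a typical LLM reply wrapped in a code fence (input is made up; repair_json is expected to drop the trailing comma):

    pp = PostProcessor()
    raw = 'Here you go:\n```json\n{"name": "Widget", "price": 9.99,}\n```'
    print(pp.process(raw))  # -> {'name': 'Widget', 'price': 9.99}
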
web2json/preprocessor.py ADDED @@ -0,0 +1,222 @@
import re
import requests
from bs4 import BeautifulSoup, Comment
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from htmlrag import clean_html

class HTMLCleaner:
    DEFAULT_REMOVE_TAGS = [
        "script", "style"
    ]

    def __init__(self, config: dict = None):
        self.config = config or {}
        # Allow custom tags to remove on top of the defaults
        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(self.config.get("extra_remove_tags", []))

    def _clean_html(self, html_content: str) -> str:
        """
        Cleans up the given HTML content by:
        - Removing specified tags and their content.
        - Stripping HTML comments.
        - Optionally stripping out all attributes.
        - Optionally flattening hyperlinks.
        - Removing empty tags.
        - Extracting and returning cleaned HTML or visible text.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned HTML (if keep_tags=True) or normalized text.
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove unwanted tags entirely
        for tag_name in self.remove_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Strip attributes if requested
        if self.config.get("strip_attrs", False):
            for tag in soup.find_all(True):
                tag.attrs = {}

        # Flatten hyperlinks if requested
        if self.config.get("strip_links", False):
            for a in soup.find_all('a'):
                a.replace_with(a.get_text())

        # Remove empty tags (no text and no non-empty children)
        for tag in soup.find_all(True):
            if not tag.get_text(strip=True):
                tag.decompose()

        # Convert soup to an HTML string if preserving tags
        if self.config.get('keep_tags', False):
            html_str = str(soup)
            # Remove any empty lines
            html_str = re.sub(r'(?m)^[ \t]*\n', '', html_str)
            return html_str.strip()

        # Extract visible text
        text = soup.get_text(separator="\n", strip=True)
        # Remove empty lines
        lines = [line for line in text.splitlines() if line.strip()]
        clean_text = "\n".join(lines)
        # Collapse all whitespace (including the newlines just inserted) into single spaces
        clean_text = re.sub(r'\s+', ' ', clean_text)

        return clean_text.strip()

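A small sketch of config-driven cleaning on a made-up snippet:

    cleaner = HTMLCleaner({'strip_links': True, 'extra_remove_tags': ['footer']})
    print(cleaner._clean_html('<body><p>Hi <a href="/x">there</a></p><footer>legal</footer></body>'))
    # -> Hi there
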
class Preprocessor(ABC):
    """
    Abstract base class for preprocessors.
    Defines the interface for transforming raw inputs into structured data.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the preprocessor with optional configuration.

        Args:
            config: A dictionary of configuration settings.
                - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
        """
        self.config = config if config is not None else {'keep_tags': False}

    def _fetch_content(self, url: str) -> str:
        """
        Fetches the raw HTML content from a URL.

        Args:
            url: The URL to fetch content from.

        Returns:
            The raw HTML of the page.

        Raises:
            ValueError: If the URL cannot be fetched.
        """
        try:
            # Browser-like headers help avoid being blocked by some websites
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.6",
                "Cache-Control": "max-age=0",
                "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
                "Sec-Ch-Ua-Mobile": "?0",
                "Sec-Ch-Ua-Platform": "\"Windows\"",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "Upgrade-Insecure-Requests": "1",
            }

            # Make the HTTP GET request with a timeout
            response = requests.get(url, headers=headers, timeout=15)

            return response.text

        except requests.exceptions.RequestException as e:
            # Catch any network-related errors (DNS, connection, timeout, etc.)
            # and re-raise them as a more user-friendly ValueError.
            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")

    @abstractmethod
    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess.
            is_url: Whether `content` is a URL to fetch or raw HTML/text.

        Returns:
            A cleaned string ready for downstream tasks.
        """
        pass

class BasicPreprocessor(Preprocessor):
    """
    Base preprocessor with common functionality.
    Can be extended for specific preprocessing tasks.
    """
    # TODO: Might need to think of how to improve this later
    def _clean_html(self, html_content: str) -> str:
        """
        Cleans up the given HTML content by:
        - Removing <script> and <style> tags and their content.
        - Removing HTML comments.
        - Extracting and returning the visible text with normalized whitespace if keep_tags is False.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned HTML (if keep_tags is True) or the visible text.
        """
        # Parse the HTML content
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        if self.config.get('keep_tags', False):
            # If keep_tags is True, return the cleaned HTML markup
            return str(soup)

        # Extract text and normalize whitespace
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r'\s+', ' ', text)

        return clean_text

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess.
            is_url: Whether `content` is a URL to fetch or raw HTML/text.

        Returns:
            The cleaned string, ready for downstream extraction.
        """
        html_content = content
        if is_url:
            # Fetch content from the URL
            html_content = self._fetch_content(content)

        # Clean in two passes: HTMLCleaner first, then htmlrag's clean_html
        cleaner = HTMLCleaner({
            'keep_tags': self.config.get('keep_tags', False),
            'strip_attrs': True,
            'strip_links': True,
            'extra_remove_tags': ['header', 'footer']
        })
        clean = cleaner._clean_html(html_content=html_content)
        clean = clean_html(clean)
        return clean.strip()  # Strip leading/trailing whitespace

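Finally, a quick sketch of running the preprocessor on raw HTML with no network access; the exact output depends on htmlrag's clean_html:

    pre = BasicPreprocessor(config={'keep_tags': False})
    print(pre.preprocess("<html><body><h1>Title</h1><script>x()</script></body></html>", is_url=False))
    # Expected to print something close to: Title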