Agent-evaluations

Sleeping

App Files Files Community

WilliamRabuel committed 22 days ago

Commit

4ff7b57

verified ·

1 Parent(s): 145e1cf

Create tools.py

Browse files

Files changed (1) hide show

tools.py +482 -0

tools.py ADDED Viewed

	@@ -0,0 +1,482 @@

+import os
+import re
+import json
+import time
+import requests
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+import tempfile
+from io import BytesIO
+# Imports pour les outils
+from duckduckgo_search import DDGS
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+from fake_useragent import UserAgent
+# Imports pour la lecture de fichiers
+import PyPDF2
+from docx import Document
+import openpyxl
+import pandas as pd
+import markdown
+import magic
+from PIL import Image
+import numpy as np
+from smolagents import tool
+class AdvancedWebSearchTool:
+    """Outil de recherche web avancé avec DuckDuckGo"""
+    def __init__(self):
+        self.ddgs = DDGS()
+        self.ua = UserAgent()
+    @tool
+    def search_web(self, query: str, max_results: int = 10, region: str = "fr-fr") -> List[Dict[str, str]]:
+        """
+        Recherche avancée sur le web avec DuckDuckGo
+        Args:
+            query: Requête de recherche
+            max_results: Nombre maximum de résultats (défaut: 10)
+            region: Région de recherche (défaut: fr-fr)
+        Returns:
+            Liste de dictionnaires avec title, url, snippet
+        """
+        try:
+            print(f"🔍 Recherche: '{query}' (max: {max_results})")
+            results = []
+            search_results = self.ddgs.text(
+                keywords=query,
+                region=region,
+                max_results=max_results,
+                timelimit="m"  # Résultats récents
+            )
+            for result in search_results:
+                results.append({
+                    "title": result.get("title", ""),
+                    "url": result.get("href", ""),
+                    "snippet": result.get("body", "")
+                })
+            print(f"✅ Trouvé {len(results)} résultats")
+            return results
+        except Exception as e:
+            print(f"❌ Erreur lors de la recherche: {e}")
+            return [{"error": str(e)}]
+class AdvancedWebScrapingTool:
+    """Outil de scraping web avancé avec Selenium et BeautifulSoup"""
+    def __init__(self):
+        self.ua = UserAgent()
+        self._driver = None
+    def _get_driver(self):
+        """Initialise le driver Selenium si nécessaire"""
+        if self._driver is None:
+            options = Options()
+            options.add_argument("--headless")
+            options.add_argument("--no-sandbox")
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument(f"--user-agent={self.ua.random}")
+            options.add_argument("--disable-gpu")
+            options.add_argument("--window-size=1920,1080")
+            try:
+                self._driver = webdriver.Chrome(
+                    options=options
+                )
+            except Exception as e:
+                print(f"❌ Erreur driver Selenium: {e}")
+                return None
+        return self._driver
+    @tool
+    def scrape_website(self, url: str, extract_text: bool = True, extract_links: bool = False) -> Dict[str, Any]:
+        """
+        Scrape un site web et extrait le contenu
+        Args:
+            url: URL à scraper
+            extract_text: Extraire le texte principal
+            extract_links: Extraire les liens
+        Returns:
+            Dictionnaire avec le contenu extrait
+        """
+        try:
+            print(f"🌐 Scraping: {url}")
+            # Tentative avec requests d'abord (plus rapide)
+            headers = {
+                'User-Agent': self.ua.random,
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                'Accept-Language': 'fr-FR,fr;q=0.5',
+                'Accept-Encoding': 'gzip, deflate',
+                'Connection': 'keep-alive',
+            }
+            response = requests.get(url, headers=headers, timeout=30)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            result = {
+                "url": url,
+                "title": "",
+                "text": "",
+                "links": [],
+                "status": "success"
+            }
+            # Titre
+            title_tag = soup.find('title')
+            if title_tag:
+                result["title"] = title_tag.get_text().strip()
+            # Texte principal
+            if extract_text:
+                # Supprime les scripts et styles
+                for script in soup(["script", "style", "nav", "footer", "header"]):
+                    script.decompose()
+                # Extrait le texte principal
+                main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article'))
+                if main_content:
+                    text = main_content.get_text(separator=' ', strip=True)
+                else:
+                    text = soup.get_text(separator=' ', strip=True)
+                # Nettoie le texte
+                text = re.sub(r'\s+', ' ', text)
+                result["text"] = text[:10000]  # Limite à 10k caractères
+            # Liens
+            if extract_links:
+                links = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.startswith('http'):
+                        links.append({
+                            "url": href,
+                            "text": link.get_text().strip()
+                        })
+                result["links"] = links[:50]  # Limite à 50 liens
+            print(f"✅ Scraping réussi: {len(result['text'])} caractères")
+            return result
+        except requests.exceptions.RequestException as e:
+            # Tentative avec Selenium si requests échoue
+            return self._scrape_with_selenium(url, extract_text, extract_links)
+        except Exception as e:
+            print(f"❌ Erreur scraping: {e}")
+            return {"error": str(e), "url": url}
+    def _scrape_with_selenium(self, url: str, extract_text: bool, extract_links: bool) -> Dict[str, Any]:
+        """Scraping avec Selenium en fallback"""
+        try:
+            print("🤖 Tentative avec Selenium...")
+            driver = self._get_driver()
+            if not driver:
+                return {"error": "Impossible d'initialiser le driver", "url": url}
+            driver.get(url)
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            result = {
+                "url": url,
+                "title": driver.title,
+                "text": "",
+                "links": [],
+                "status": "success_selenium"
+            }
+            if extract_text:
+                # Supprime les éléments indésirables
+                driver.execute_script("""
+                    var elements = document.querySelectorAll('script, style, nav, footer, header');
+                    for (var i = 0; i < elements.length; i++) {
+                        elements[i].remove();
+                    }
+                """)
+                text = driver.find_element(By.TAG_NAME, "body").text
+                result["text"] = text[:10000]
+            if extract_links:
+                links = []
+                for link in driver.find_elements(By.TAG_NAME, "a"):
+                    href = link.get_attribute("href")
+                    if href and href.startswith("http"):
+                        links.append({
+                            "url": href,
+                            "text": link.text.strip()
+                        })
+                result["links"] = links[:50]
+            return result
+        except Exception as e:
+            print(f"❌ Erreur Selenium: {e}")
+            return {"error": str(e), "url": url}
+    def __del__(self):
+        """Nettoie le driver à la destruction"""
+        if self._driver:
+            try:
+                self._driver.quit()
+            except:
+                pass
+class FileReaderTool:
+    """Outil de lecture de fichiers multi-format"""
+    @tool
+    def read_file(self, file_path: str) -> Dict[str, Any]:
+        """
+        Lit un fichier et extrait son contenu selon le format
+        Args:
+            file_path: Chemin vers le fichier
+        Returns:
+            Dictionnaire avec le contenu du fichier
+        """
+        try:
+            print(f"📄 Lecture du fichier: {file_path}")
+            if not os.path.exists(file_path):
+                return {"error": f"Fichier non trouvé: {file_path}"}
+            # Détection du type de fichier
+            file_ext = Path(file_path).suffix.lower()
+            result = {
+                "file_path": file_path,
+                "file_type": file_ext,
+                "content": "",
+                "metadata": {},
+                "status": "success"
+            }
+            # PDF
+            if file_ext == '.pdf':
+                result.update(self._read_pdf(file_path))
+            # Word Documents
+            elif file_ext in ['.docx', '.doc']:
+                result.update(self._read_docx(file_path))
+            # Excel
+            elif file_ext in ['.xlsx', '.xls']:
+                result.update(self._read_excel(file_path))
+            # CSV
+            elif file_ext == '.csv':
+                result.update(self._read_csv(file_path))
+            # JSON
+            elif file_ext == '.json':
+                result.update(self._read_json(file_path))
+            # Markdown
+            elif file_ext in ['.md', '.markdown']:
+                result.update(self._read_markdown(file_path))
+            # Texte simple
+            elif file_ext in ['.txt', '.log', '.py', '.js', '.html', '.css']:
+                result.update(self._read_text(file_path))
+            # Images
+            elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
+                result.update(self._read_image(file_path))
+            else:
+                # Tentative de lecture comme texte
+                result.update(self._read_text(file_path))
+            print(f"✅ Fichier lu avec succès: {len(str(result['content']))} caractères")
+            return result
+        except Exception as e:
+            print(f"❌ Erreur lecture fichier: {e}")
+            return {"error": str(e), "file_path": file_path}
+    def _read_pdf(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier PDF"""
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                text = ""
+                for page in pdf_reader.pages:
+                    text += page.extract_text() + "\n"
+                return {
+                    "content": text.strip(),
+                    "metadata": {
+                        "pages": len(pdf_reader.pages),
+                        "title": pdf_reader.metadata.get('/Title', '') if pdf_reader.metadata else ''
+                    }
+                }
+        except Exception as e:
+            return {"error": f"Erreur PDF: {e}"}
+    def _read_docx(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier Word"""
+        try:
+            doc = Document(file_path)
+            text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+            return {
+                "content": text,
+                "metadata": {
+                    "paragraphs": len(doc.paragraphs),
+                    "core_properties": str(doc.core_properties.title) if doc.core_properties.title else ""
+                }
+            }
+        except Exception as e:
+            return {"error": f"Erreur DOCX: {e}"}
+    def _read_excel(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier Excel"""
+        try:
+            df = pd.read_excel(file_path, sheet_name=None)
+            content = {}
+            for sheet_name, sheet_df in df.items():
+                content[sheet_name] = {
+                    "data": sheet_df.to_dict('records')[:100],  # Limite à 100 lignes
+                    "shape": sheet_df.shape,
+                    "columns": list(sheet_df.columns)
+                }
+            return {
+                "content": content,
+                "metadata": {
+                    "sheets": list(df.keys()),
+                    "total_sheets": len(df)
+                }
+            }
+        except Exception as e:
+            return {"error": f"Erreur Excel: {e}"}
+    def _read_csv(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier CSV"""
+        try:
+            df = pd.read_csv(file_path)
+            return {
+                "content": {
+                    "data": df.head(100).to_dict('records'),  # Premières 100 lignes
+                    "shape": df.shape,
+                    "columns": list(df.columns),
+                    "dtypes": df.dtypes.to_dict()
+                },
+                "metadata": {
+                    "rows": len(df),
+                    "columns": len(df.columns)
+                }
+            }
+        except Exception as e:
+            return {"error": f"Erreur CSV: {e}"}
+    def _read_json(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier JSON"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+                return {
+                    "content": data,
+                    "metadata": {
+                        "type": type(data).__name__,
+                        "size": len(str(data))
+                    }
+                }
+        except Exception as e:
+            return {"error": f"Erreur JSON: {e}"}
+    def _read_markdown(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier Markdown"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+                html = markdown.markdown(content)
+                return {
+                    "content": content,
+                    "metadata": {
+                        "html_version": html,
+                        "lines": len(content.split('\n'))
+                    }
+                }
+        except Exception as e:
+            return {"error": f"Erreur Markdown: {e}"}
+    def _read_text(self, file_path: str) -> Dict[str, Any]:
+        """Lit un fichier texte"""
+        try:
+            # Détection de l'encodage
+            encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+            for encoding in encodings:
+                try:
+                    with open(file_path, 'r', encoding=encoding) as file:
+                        content = file.read()
+                        return {
+                            "content": content,
+                            "metadata": {
+                                "encoding": encoding,
+                                "lines": len(content.split('\n')),
+                                "characters": len(content)
+                            }
+                        }
+                except UnicodeDecodeError:
+                    continue
+            return {"error": "Impossible de décoder le fichier"}
+        except Exception as e:
+            return {"error": f"Erreur texte: {e}"}
+    def _read_image(self, file_path: str) -> Dict[str, Any]:
+        """Lit une image et extrait les métadonnées"""
+        try:
+            with Image.open(file_path) as img:
+                return {
+                    "content": f"Image {img.format} - Taille: {img.size[0]}x{img.size[1]}",
+                    "metadata": {
+                        "format": img.format,
+                        "size": img.size,
+                        "mode": img.mode,
+                        "path": file_path
+                    }
+                }
+        except Exception as e:
+            return {"error": f"Erreur image: {e}"}
+# Initialisation globale des outils
+web_search_tool = AdvancedWebSearchTool()
+web_scraping_tool = AdvancedWebScrapingTool()
+file_reader_tool = FileReaderTool()
+# Export des outils pour smolagents
+search_web = web_search_tool.search_web
+scrape_website = web_scraping_tool.scrape_website
+read_file = file_reader_tool.read_file