Update app.py
app.py CHANGED
@@ -14,30 +14,6 @@ import logging
 import queue
 from huggingface_hub import HfApi
 
-# Create a logging filter to suppress socket warnings
-class SocketWarningFilter(logging.Filter):
-    def filter(self, record):
-        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
-
-# Create a queue for log messages
-log_queue = queue.Queue()
-
-# Custom log handler that puts messages in the queue
-class QueueHandler(logging.Handler):
-    def emit(self, record):
-        log_entry = self.format(record)
-        log_queue.put(log_entry)
-
-# Set up logging with the custom handler
-logger = logging.getLogger()
-queue_handler = QueueHandler()
-queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
-queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
-logger.addHandler(queue_handler)
-logger.setLevel(logging.INFO)
-
-# Also add the filter to the root logger to catch all socket warnings
-logging.getLogger().addFilter(SocketWarningFilter())
 
 app = Flask(__name__)
 # Enable CORS with specific settings
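The block removed above is the standard queue-handler pattern: a logging.Handler subclass pushes formatted records onto a queue.Queue so another consumer (here, the /logs/stream endpoint deleted further down) can drain them. A minimal, self-contained sketch of that pattern, assuming only the standard library (names are illustrative, not part of app.py after this commit):

import logging
import queue

log_queue = queue.Queue()

class QueueHandler(logging.Handler):
    # Push each formatted record onto the queue instead of writing it out.
    def emit(self, record):
        log_queue.put(self.format(record))

handler = QueueHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

logger = logging.getLogger('demo')
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info('hello')
print(log_queue.get_nowait())  # e.g. "2024-01-01 12:00:00,000 - INFO - hello"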
@@ -49,81 +25,6 @@ CORS(app, resources={
     }
 })
 
-def search_images(query, num_images=5):
-    # Headers to mimic a browser request
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Google Images URL
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
-
-    try:
-        # Get the HTML content
-        response = requests.get(url, headers=headers, timeout=30)
-        response.raise_for_status()
-
-        # Find all image URLs using regex
-        image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', response.text)
-
-        # Remove duplicates while preserving order
-        image_urls = list(dict.fromkeys(image_urls))
-
-        # Store results
-        results = []
-        downloaded = 0
-
-        for img_url in image_urls:
-            if downloaded >= num_images:
-                break
-
-            try:
-                # Skip small thumbnails and icons
-                if 'gstatic.com' in img_url or 'google.com' in img_url:
-                    continue
-
-                # Download image
-                img_response = requests.get(img_url, headers=headers, timeout=10)
-                img_response.raise_for_status()
-
-                # Check if the response is actually an image
-                content_type = img_response.headers.get('Content-Type', '')
-                if not content_type.startswith('image/'):
-                    continue
-
-                # Convert image to base64
-                image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                # Add to results
-                results.append({
-                    'image_url': img_url,
-                    'base64_data': f"data:{content_type};base64,{image_base64}"
-                })
-
-                downloaded += 1
-
-                # Add a random delay between downloads
-                time.sleep(random.uniform(0.5, 1))
-
-            except Exception as e:
-                logger.error(f"Error downloading image: {str(e)}")
-                continue
-
-        return results
-
-    except Exception as e:
-        logger.error(f"An error occurred: {str(e)}")
-        return []
-
-
 
 HF_TOKEN = os.getenv("HF_TOKEN")  # Make sure you set the HF_TOKEN in your environment
 
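The search_images() function removed above returned each hit as both the source URL and an inline data URI. That encoding step is plain base64; a small sketch of the same construction, with a hypothetical byte string standing in for img_response.content:

import base64

# Hypothetical stand-in for downloaded image bytes (the 8-byte PNG signature).
image_bytes = b'\x89PNG\r\n\x1a\n'
content_type = 'image/png'

# Same construction as the removed code: raw bytes -> base64 -> data URI.
image_base64 = base64.b64encode(image_bytes).decode('utf-8')
data_uri = f"data:{content_type};base64,{image_base64}"
print(data_uri)  # data:image/png;base64,iVBORw0KGgo=

A browser can render such a URI directly as an image source, which is why the endpoint shipped both forms.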
@@ -189,223 +90,6 @@ def get_live_space_status():
     return Response(stream_with_context(generate()), mimetype='text/event-stream')
 
 
-
-
-@app.route('/search_images', methods=['GET'])
-def api_search_images():
-    try:
-        # Get query parameters
-        query = request.args.get('query', '')
-        num_images = int(request.args.get('num_images', 5))
-
-        if not query:
-            return jsonify({'error': 'Query parameter is required'}), 400
-
-        if num_images < 1 or num_images > 20:
-            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400
-
-        # Search for images
-        results = search_images(query, num_images)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
-        })
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in search_images: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-def scrape_site_content(query, num_sites=5):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2  # Number of retries per URL
-    timeout = 5  # Reduced timeout to 5 seconds
-
-    try:
-        # Get more URLs than needed to account for failures
-        search_results = list(search(query, num_results=num_sites * 2))
-
-        # Process each found URL
-        for url in search_results:
-            if scraped >= num_sites:
-                break
-
-            success = False
-            for attempt in range(retries):
-                try:
-                    # Get the HTML content
-                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
-                    logger.info(f"Scraping URL: {url}")
-                    response = requests.get(
-                        url,
-                        headers=headers,
-                        timeout=timeout,
-                        verify=False  # Skip SSL verification
-                    )
-                    response.raise_for_status()
-
-                    # Verify it's HTML content
-                    content_type = response.headers.get('Content-Type', '').lower()
-                    if 'text/html' not in content_type:
-                        logger.info(f"Skipping {url} - not HTML content")
-                        break
-
-                    # Parse the HTML content
-                    soup = BeautifulSoup(response.text, 'html.parser')
-
-                    # Remove script and style elements
-                    for script in soup(["script", "style"]):
-                        script.decompose()
-
-                    # Extract text content (limit to first 10000 characters)
-                    text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                    # Skip if not enough content
-                    if len(text_content.split()) < 100:  # Skip if less than 100 words
-                        logger.info(f"Skipping {url} - not enough content")
-                        break
-
-                    # Extract all links (limit to first 10)
-                    links = []
-                    for link in soup.find_all('a', href=True)[:10]:
-                        href = link['href']
-                        if href.startswith('http'):
-                            links.append({
-                                'text': link.get_text(strip=True),
-                                'url': href
-                            })
-
-                    # Extract meta information
-                    title = soup.title.string if soup.title else ''
-                    meta_description = ''
-                    meta_keywords = ''
-
-                    meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                    if meta_desc_tag:
-                        meta_description = meta_desc_tag.get('content', '')
-
-                    meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                    if meta_keywords_tag:
-                        meta_keywords = meta_keywords_tag.get('content', '')
-
-                    results.append({
-                        'url': url,
-                        'title': title,
-                        'meta_description': meta_description,
-                        'meta_keywords': meta_keywords,
-                        'text_content': text_content,
-                        'links': links
-                    })
-
-                    scraped += 1
-                    success = True
-                    # Add a random delay between scrapes
-                    time.sleep(random.uniform(0.5, 1))
-                    break  # Break retry loop on success
-
-                except requests.Timeout:
-                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} timeout attempts")
-                except requests.RequestException as e:
-                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                    if attempt == retries - 1:  # Last attempt
-                        print(f"Skipping {url} after {retries} failed attempts")
-
-                # Add a longer delay between retries
-                if not success and attempt < retries - 1:
-                    time.sleep(random.uniform(1, 2))
-
-            # If we haven't found enough valid content and have more URLs, continue
-            if scraped < num_sites and len(results) < len(search_results):
-                continue
-
-        return results
-
-    except Exception as e:
-        print(f"Error in search/scraping process: {str(e)}")
-        # Return whatever results we've managed to gather
-        return results
-
-
-@app.route('/scrape_sites', methods=['GET'])
-def api_scrape_sites():
-    try:
-        # Get query parameters
-        query = request.args.get('query', '')
-        num_sites = int(request.args.get('num_sites', 10))
-
-        if not query:
-            return jsonify({'error': 'Query parameter is required'}), 400
-
-        if num_sites < 1 or num_sites > 20:
-            return jsonify({'error': 'Number of sites must be between 1 and 20'}), 400
-
-        # Scrape the websites
-        results = scrape_site_content(query, num_sites)
-
-        response = jsonify({
-            'success': True,
-            'query': query,
-            'results': results
-        })
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-    except Exception as e:
-        logger.error(f"Error in api_scrape_sites: {str(e)}")
-        response = jsonify({
-            'success': False,
-            'error': str(e)
-        }), 500
-
-        # Add CORS headers
-        response.headers['Access-Control-Allow-Origin'] = '*'
-        return response
-
-@app.route('/logs/stream')
-def stream_logs():
-    def generate():
-        while True:
-            try:
-                # Get log message from queue, timeout after 1 second
-                log_message = log_queue.get(timeout=1)
-                yield f"data: {log_message}\n\n"
-            except queue.Empty:
-                # Send a heartbeat to keep the connection alive
-                yield "data: heartbeat\n\n"
-            except GeneratorExit:
-                break
-
-    response = Response(stream_with_context(generate()), mimetype='text/event-stream')
-    response.headers['Cache-Control'] = 'no-cache'
-    response.headers['Connection'] = 'keep-alive'
-    return response
-
 if __name__ == '__main__':
     logger.info("Starting Flask API server...")
     app.run(host='0.0.0.0', port=5001, debug=True)
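The deleted /logs/stream route spoke Server-Sent Events: each queued record went out as a "data:" frame, with a "data: heartbeat" frame roughly once a second while the queue was empty. A minimal client sketch for an endpoint of that shape; the URL assumes the app.run(host='0.0.0.0', port=5001) line above and no longer resolves after this commit:

import requests

# Stream the SSE endpoint; 5 s connect timeout, no read timeout.
url = 'http://localhost:5001/logs/stream'
with requests.get(url, stream=True, timeout=(5, None)) as resp:
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith('data: '):
            continue  # skip blank SSE frame separators
        payload = raw[len('data: '):]
        if payload == 'heartbeat':
            continue  # server keep-alive, not a log line
        print(payload)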