import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';

/**
 * Extracts the text of a PDF document.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The `text` field of the result produced by `pdf-parse`.
 */
export const extractTextFromPDF = async (buffer: Buffer): Promise<string> => {
  const data = await pdfParse(buffer);
  return data.text;
};

/**
 * Loads a URL in a puppeteer-launched browser and returns the `innerText` of
 * the page body.
 *
 * The browser is closed in a `finally` block so a failed navigation or
 * evaluation does not leave the launched browser running.
 */
const renderTextWithBrowser = async (targetUrl: string): Promise<string> => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // 'networkidle0' waits for the network to be idle before the navigation
    // is considered finished.
    await page.goto(targetUrl, { waitUntil: 'networkidle0' });
    return await page.evaluate(() => document.body.innerText);
  } finally {
    await browser.close();
  }
};

/**
 * Fetches `targetUrl` and returns its content as trimmed plain text.
 *
 * Strategy, in order:
 * - HTTP status >= 400: retry by loading the URL with puppeteer and reading
 *   the body's `innerText`.
 * - `application/pdf`: parse the response bytes with {@link extractTextFromPDF}.
 * - `text/html`: parse with JSDOM, remove `<script>` and `<style>` elements,
 *   and take the body's `textContent`.
 * - anything else: use the response body text as-is.
 *
 * @param targetUrl - URL to retrieve.
 * @returns The extracted text with leading/trailing whitespace removed.
 * @throws Propagates errors raised by the fetch, the browser, or the parsers.
 */
export const handleContentText = async (targetUrl: string): Promise<string> => {
  const response = await fetch(targetUrl);
  // Lowercased so the media type comparison below is case-insensitive.
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();

  let content: string;
  if (response.status >= 400) {
    // If status is 400 or greater, try using puppeteer.
    content = await renderTextWithBrowser(targetUrl);
  } else if (contentType.includes('application/pdf')) {
    // Convert the ArrayBuffer into the Buffer that extractTextFromPDF declares.
    const pdfBytes = Buffer.from(await response.arrayBuffer());
    content = await extractTextFromPDF(pdfBytes);
  } else if (contentType.includes('text/html')) {
    const dom = new JSDOM(await response.text());
    const { document } = dom.window;
    // Drop script/style elements so their source does not end up in the text.
    document.querySelectorAll('script, style').forEach((element) => element.remove());
    content = document.body.textContent ?? '';
  } else {
    content = await response.text();
  }

  return content.trim();
};