matt HOFFNER
refactor
a73e8b4
raw
history blame
No virus
1.53 kB
import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';
/**
 * Extracts the text content of a PDF document.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The `text` field of the result produced by `pdf-parse`.
 */
export const extractTextFromPDF = async (buffer: Buffer): Promise<string> => {
  const { text } = await pdfParse(buffer);
  return text;
};
/**
 * Loads `targetUrl` in a headless browser and returns the rendered page text.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a browser process running.
 */
const renderTextWithBrowser = async (targetUrl: string): Promise<string> => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Wait for the network to be idle before considering navigation finished.
    await page.goto(targetUrl, { waitUntil: 'networkidle0' });
    return await page.evaluate(() => document.body.innerText);
  } finally {
    await browser.close();
  }
};

/**
 * Returns the text of an HTML document's body with `<script>` and `<style>`
 * elements removed first.
 */
const extractTextFromHtml = (html: string): string => {
  const { document } = new JSDOM(html).window;
  document
    .querySelectorAll('script, style')
    .forEach((element) => element.remove());
  return document.body.textContent ?? '';
};

/**
 * Fetches `targetUrl` and returns its content as trimmed plain text.
 *
 * Strategy, chosen from the HTTP response:
 * - status >= 400: retry by rendering the page with puppeteer;
 * - `application/pdf`: extract the text with `extractTextFromPDF`;
 * - `text/html`: strip scripts/styles and return the body text;
 * - anything else: return the response body as text.
 *
 * @param targetUrl - URL of the resource to read.
 * @returns The extracted text with leading/trailing whitespace removed.
 */
export const handleContentText = async (targetUrl: string): Promise<string> => {
  const response = await fetch(targetUrl);
  // Media types are case-insensitive, so normalise before matching.
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();

  let content: string;
  if (response.status >= 400) {
    // If status is 400 or greater, try using puppeteer.
    content = await renderTextWithBrowser(targetUrl);
  } else if (contentType.includes('application/pdf')) {
    // arrayBuffer() yields an ArrayBuffer; the PDF helper expects a Buffer.
    const pdfBytes = Buffer.from(await response.arrayBuffer());
    content = await extractTextFromPDF(pdfBytes);
  } else if (contentType.includes('text/html')) {
    content = extractTextFromHtml(await response.text());
  } else {
    content = await response.text();
  }
  return content.trim();
};