|
import fetch from 'node-fetch'; |
|
import { JSDOM } from 'jsdom'; |
|
import pdfParse from 'pdf-parse'; |
|
import puppeteer from 'puppeteer'; |
|
|
|
|
|
/**
 * Extracts the text of a PDF document held in memory.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The `text` field of the result produced by `pdfParse`.
 */
export const extractTextFromPDF = async (buffer: Buffer): Promise<string> => {
  const { text } = await pdfParse(buffer);
  return text;
};
|
|
|
/**
 * Fetches `targetUrl` and returns its textual content, trimmed.
 *
 * The extraction strategy depends on the HTTP response:
 * - status >= 400: the URL is loaded in a headless browser (Puppeteer) and the
 *   rendered `document.body.innerText` is used. NOTE(review): presumably this
 *   is a fallback for sites that reject non-browser clients — confirm.
 * - `application/pdf`: the body is handed to `extractTextFromPDF`.
 * - `text/html`: the markup is parsed with JSDOM, `<script>` and `<style>`
 *   elements are removed, and the body's `textContent` is used.
 * - anything else: the raw response body is used as text.
 *
 * @param targetUrl - URL of the resource to read.
 * @returns The extracted text with leading/trailing whitespace removed.
 * @throws Propagates any error raised by `fetch`, Puppeteer, JSDOM or the PDF
 *   parser; the browser is always closed before a Puppeteer error propagates.
 */
export const handleContentText = async (targetUrl: string): Promise<string> => {
  const response = await fetch(targetUrl);
  // Media types are case-insensitive, so normalise before matching.
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();

  let content: string;

  if (response.status >= 400) {
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      // 'networkidle0' waits until there are no in-flight network requests.
      await page.goto(targetUrl, { waitUntil: 'networkidle0' });
      content = await page.evaluate(() => document.body.innerText);
    } finally {
      // Close even when navigation/evaluation throws, otherwise the browser
      // process is leaked.
      await browser.close();
    }
  } else if (contentType.includes('application/pdf')) {
    // The parser is typed to take a Node Buffer; wrap the ArrayBuffer instead
    // of casting it.
    const pdfBytes = Buffer.from(await response.arrayBuffer());
    content = await extractTextFromPDF(pdfBytes);
  } else if (contentType.includes('text/html')) {
    const dom = new JSDOM(await response.text());
    const doc = dom.window.document;
    // Drop non-visible code/styling so it does not pollute the text.
    doc.querySelectorAll('script, style').forEach((element) => element.remove());
    content = doc.body.textContent ?? '';
  } else {
    content = await response.text();
  }

  // Single exit point so every branch (including the browser fallback) is trimmed.
  return content.trim();
};
|
|