File size: 1,530 Bytes
a73e8b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import fetch from 'node-fetch';
import { JSDOM } from 'jsdom';
import pdfParse from 'pdf-parse';
import puppeteer from 'puppeteer';


/**
 * Extracts the text of a PDF document.
 *
 * @param buffer - The PDF file contents.
 * @returns The `text` property of the result resolved by `pdfParse`.
 */
export const extractTextFromPDF = async (buffer: Buffer): Promise<string> => {
    const { text } = await pdfParse(buffer);
    return text;
};

/**
 * Loads a URL in a Puppeteer-launched browser and returns the page's
 * `document.body.innerText`.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a browser process running.
 *
 * @param targetUrl - The URL to open.
 * @returns The inner text of the page body, untrimmed.
 */
const renderPageText = async (targetUrl: string): Promise<string> => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        // 'networkidle0' waits for the network to be idle before considering
        // the navigation to be finished.
        await page.goto(targetUrl, { waitUntil: 'networkidle0' });
        return await page.evaluate(() => document.body.innerText);
    } finally {
        await browser.close();
    }
};

/**
 * Fetches a URL and returns its content as trimmed plain text.
 *
 * The strategy depends on the HTTP response:
 * - status >= 400: the plain request is abandoned and the page is loaded in
 *   a Puppeteer browser instead, returning the body's inner text;
 * - `application/pdf`: the body is parsed with {@link extractTextFromPDF};
 * - `text/html`: `<script>` and `<style>` elements are removed and the
 *   body's text content is returned;
 * - anything else: the raw response text is returned.
 *
 * @param targetUrl - The URL to retrieve.
 * @returns The extracted text with leading and trailing whitespace removed.
 */
export const handleContentText = async (targetUrl: string): Promise<string> => {
    const response = await fetch(targetUrl);
    // Media type names are case-insensitive, so normalise before matching.
    const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
    let content: string;

    if (response.status >= 400) {
        // If status is 400 or greater, try using puppeteer.
        content = await renderPageText(targetUrl);
    } else if (contentType.includes('application/pdf')) {
        // arrayBuffer() yields an ArrayBuffer; wrap it in a real Buffer rather
        // than casting, since extractTextFromPDF is typed to take a Buffer.
        const pdfBytes = Buffer.from(await response.arrayBuffer());
        content = await extractTextFromPDF(pdfBytes);
    } else if (contentType.includes('text/html')) {
        const dom = new JSDOM(await response.text());
        const { document } = dom.window;
        // Drop script and style elements so their source does not end up in
        // the extracted text.
        document.querySelectorAll('script, style').forEach((element) => element.remove());
        content = document.body.textContent ?? '';
    } else {
        content = await response.text();
    }

    return content.trim();
};