Text_Summarzier / scraper.py
ksvmuralidhar's picture
Update scraper.py
152aad4 verified
raw
history blame
1.26 kB
from selenium import webdriver
from selenium.webdriver.common.by import By
import streamlit as st
from selenium.webdriver import FirefoxOptions
import re
import logging
# A line is kept only if it starts with a word character and ends with a
# character that is neither a word character, a closing parenthesis, nor
# whitespace (i.e. it ends in punctuation such as "." or "?").
_SENTENCE_PATTERN = re.compile(r"^\w.+[^\w\)\s]$")

# Fewer kept lines than this is treated as "nothing worth scraping".
_MIN_SENTENCES = 3

# Passed to driver.set_page_load_timeout() before the page is requested.
_PAGE_LOAD_TIMEOUT = 60


def _select_sentences(text: str, n_words: int) -> list:
    """Return the stripped lines of *text* that look like full sentences.

    A line qualifies when it has at least *n_words* whitespace-separated
    tokens and matches ``_SENTENCE_PATTERN``.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    return [
        line
        for line in stripped_lines
        if len(line.split()) >= n_words and _SENTENCE_PATTERN.search(line)
    ]


def scrape_text(url: str, n_words: int = 15) -> str:
    """Load *url* in headless Firefox and return its sentence-like lines.

    The text of the page's ``<body>`` element is split on newlines and each
    line is filtered by ``_select_sentences``. The surviving lines are
    joined with ``" \\n\\n\\n"``.

    Args:
        url: Address of the page to load.
        n_words: Minimum number of whitespace-separated words a line must
            contain to be kept.

    Returns:
        The kept lines joined into a single string.

    Raises:
        Exception: With message "Found nothing to scrape." when fewer than
            three lines survive the filter.
        Any exception raised by Selenium while starting the browser,
        loading the page, or locating the body element is propagated.
    """
    driver = None
    logging.warning("Initiated Scraping")
    try:
        opts = FirefoxOptions()
        opts.add_argument("--headless")
        driver = webdriver.Firefox(options=opts)
        driver.set_page_load_timeout(_PAGE_LOAD_TIMEOUT)
        driver.get(url)
        # Read the text while the session is still alive; everything after
        # this point works on a plain string.
        body_text = driver.find_element(By.TAG_NAME, "body").text
    finally:
        # quit() (not close()) ends the whole browser session, and doing it
        # exactly once here avoids touching an already-terminated session
        # from an error handler, which would hide the original exception.
        if driver is not None:
            try:
                driver.quit()
                logging.warning("Closed Webdriver")
            except Exception:
                # Cleanup is best-effort: report it, but do not let it
                # replace the result or the real error.
                logging.exception("Failed to close Webdriver")

    sentence_list = _select_sentences(body_text, n_words)
    if len(sentence_list) < _MIN_SENTENCES:
        raise Exception("Found nothing to scrape.")
    logging.warning("Successfully scraped text")
    return " \n\n\n".join(sentence_list)