-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapewebsite.py
More file actions
40 lines (38 loc) · 1.4 KB
/
scrapewebsite.py
File metadata and controls
40 lines (38 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def get_links(url):
    """Scrape *url* with headless Chrome and return links to document files.

    A link counts as a document file when its href ends in .pdf, .doc(x),
    .ppt(x), or .xls(x), case-insensitively.

    Parameters
    ----------
    url : str
        The page to load in the browser.

    Returns
    -------
    list[str]
        The hrefs of all matching anchor tags, in page order.
    """
    # Configure Chrome to run headless. NOTE: `options.headless = True` was
    # deprecated and then removed in Selenium 4 — pass the CLI flag instead.
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Let element lookups wait up to 10 s for the page to finish loading.
        driver.implicitly_wait(10)
        # `find_elements_by_tag_name` was removed in Selenium 4; use the
        # By-locator API instead.
        anchors = driver.find_elements(By.TAG_NAME, 'a')
        # Compile once; the pattern is applied to every anchor on the page.
        doc_pattern = re.compile(
            r'\.(pdf|docx|doc|pptx|ppt|xlsx|xls)$', re.IGNORECASE)
        doc_links = []
        for anchor in anchors:
            href = anchor.get_attribute('href')
            # Skip anchors with no href; keep only document-file extensions.
            if href and doc_pattern.search(href):
                doc_links.append(href)
        return doc_links
    finally:
        # Always shut the browser down, even if the page load fails.
        driver.quit()
if __name__ == '__main__':
    # Prompt for a target page, scrape it, then report every document link.
    target_url = input('Enter the target URL: ')
    found_links = get_links(target_url)
    print('Links to document files:')
    for doc_link in found_links:
        print(doc_link)