-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapewebsite.py
More file actions
40 lines (38 loc) · 1.4 KB
/
scrapewebsite.py
File metadata and controls
40 lines (38 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def get_links(url):
    """Scrape *url* with headless Chrome and return links to document files.

    A link counts as a document file when its href ends in .pdf, .doc(x),
    .ppt(x), or .xls(x), case-insensitively.

    Parameters
    ----------
    url : str
        The page to load in the browser.

    Returns
    -------
    list[str]
        The hrefs of all matching anchor tags, in page order.
    """
    # Configure Chrome to run headless. NOTE: `options.headless = True` was
    # deprecated and then removed in Selenium 4 — pass the CLI flag instead.
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Let element lookups wait up to 10 s for the page to finish loading.
        driver.implicitly_wait(10)
        # `find_elements_by_tag_name` was removed in Selenium 4; use the
        # By-locator API instead.
        anchors = driver.find_elements(By.TAG_NAME, 'a')
        # Compile once; the pattern is applied to every anchor on the page.
        doc_pattern = re.compile(
            r'\.(pdf|docx|doc|pptx|ppt|xlsx|xls)$', re.IGNORECASE)
        doc_links = []
        for anchor in anchors:
            href = anchor.get_attribute('href')
            # Skip anchors with no href; keep only document-file extensions.
            if href and doc_pattern.search(href):
                doc_links.append(href)
        return doc_links
    finally:
        # Always shut the browser down, even if the page load fails.
        driver.quit()
if __name__ == '__main__':
    # Prompt for a target page, scrape it, then report every document link.
    target_url = input('Enter the target URL: ')
    found_links = get_links(target_url)
    print('Links to document files:')
    for doc_link in found_links:
        print(doc_link)