-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path p1.py
More file actions
86 lines (72 loc) · 3.16 KB
/
p1.py
File metadata and controls
86 lines (72 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# ----------------------------------------------------------------------------
# Scrape product-detail links for "Tv" search results on daraz.com.bd and
# stream them into TV_links.csv, one link per row, deduplicated.
# ----------------------------------------------------------------------------

# Path to the ChromeDriver binary — update to match the local machine.
chrome_driver_path = r"D:/5th trimester/Data Wrangling/Wrangling Project/chromedriver-win64/chromedriver-win64/chromedriver.exe"  # ← Update path if needed

# Initialize driver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Optional: run in background
# Reduce the chance of the site flagging the session as automated.
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# Search-results page to start scraping from.
base_url = "https://www.daraz.com.bd/catalog/?spm=a2a0e.tm80335411.search.d_go&q=Tv"

# Set (not list) so the duplicate check on every scraped link is O(1);
# links are written to the CSV the first time they are seen, so the file
# order is identical to the original list-based version.
product_links = set()
page = 1
max_retries = 3
retries = 0

# try/finally guarantees the browser process is released even if an
# unexpected exception escapes the scraping loop (the original script
# leaked the Chrome process in that case).
try:
    driver.get(base_url)

    # Dismiss the cookie popup if it appears within 5 seconds.
    # WebDriverWait raises TimeoutException when the button never shows up;
    # catching only that (instead of a bare except) no longer swallows
    # KeyboardInterrupt or real WebDriver errors.
    try:
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[text()='I understand']"))
        ).click()
        print("✅ Cookie popup dismissed.")
    except TimeoutException:
        print("ℹ️ No cookie popup.")

    # Open CSV file once and write links as they arrive, so a mid-run crash
    # loses nothing already collected.
    with open("TV_links.csv", "w", newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Product Link"])
        while True:
            print(f"\n🔄 Scraping page {page}...")
            try:
                # Wait for the product-title anchors to render on this page.
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.RfADt a"))
                )
                products = driver.find_elements(By.CSS_SELECTOR, "div.RfADt a")
                new_links = 0
                for p in products:
                    href = p.get_attribute("href")
                    if href:
                        # The site sometimes emits protocol-relative URLs.
                        full_link = "https:" + href if href.startswith("//") else href
                        if full_link not in product_links:
                            product_links.add(full_link)
                            writer.writerow([full_link])
                            new_links += 1
                print(f"✅ Collected {len(product_links)} product links so far. ({new_links} new on this page)")
                # Check if "Next Page" is disabled — that means we're done.
                next_li = driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
                if next_li.get_attribute("aria-disabled") == "true":
                    print("🚫 No more pages.")
                    break
                # Click via JavaScript to avoid overlay/interception issues.
                next_button = next_li.find_element(By.TAG_NAME, "button")
                driver.execute_script("arguments[0].click();", next_button)
                page += 1
                retries = 0  # successful page → reset the retry budget
                time.sleep(random.uniform(2.5, 5.5))  # human-like pacing
            except TimeoutException:
                print("⚠️ Timeout or error occurred.")
                retries += 1
                if retries >= max_retries:
                    print("❌ Max retries reached. Stopping.")
                    break
                else:
                    print(f"🔁 Retrying page {page} ({retries}/{max_retries})...")
                    time.sleep(random.uniform(5, 8))

    print(f"\n🎉 Done! Total product links collected: {len(product_links)}")
finally:
    driver.quit()