-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path p1.py
More file actions
86 lines (72 loc) · 3.16 KB
/
p1.py
File metadata and controls
86 lines (72 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import time
import random
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# ----------------------------------------------------------------------------
# Scrape product-detail links for "Tv" search results on daraz.com.bd and
# stream them into TV_links.csv, one link per row, deduplicated.
# ----------------------------------------------------------------------------

# Path to the ChromeDriver binary — update to match the local machine.
chrome_driver_path = r"D:/5th trimester/Data Wrangling/Wrangling Project/chromedriver-win64/chromedriver-win64/chromedriver.exe"  # ← Update path if needed

# Initialize driver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Optional: run in background
# Reduce the chance of the site flagging the session as automated.
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# Search-results page to start scraping from.
base_url = "https://www.daraz.com.bd/catalog/?spm=a2a0e.tm80335411.search.d_go&q=Tv"

# Set (not list) so the duplicate check on every scraped link is O(1);
# links are written to the CSV the first time they are seen, so the file
# order is identical to the original list-based version.
product_links = set()
page = 1
max_retries = 3
retries = 0

# try/finally guarantees the browser process is released even if an
# unexpected exception escapes the scraping loop (the original script
# leaked the Chrome process in that case).
try:
    driver.get(base_url)

    # Dismiss the cookie popup if it appears within 5 seconds.
    # WebDriverWait raises TimeoutException when the button never shows up;
    # catching only that (instead of a bare except) no longer swallows
    # KeyboardInterrupt or real WebDriver errors.
    try:
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[text()='I understand']"))
        ).click()
        print("✅ Cookie popup dismissed.")
    except TimeoutException:
        print("ℹ️ No cookie popup.")

    # Open CSV file once and write links as they arrive, so a mid-run crash
    # loses nothing already collected.
    with open("TV_links.csv", "w", newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Product Link"])
        while True:
            print(f"\n🔄 Scraping page {page}...")
            try:
                # Wait for the product-title anchors to render on this page.
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.RfADt a"))
                )
                products = driver.find_elements(By.CSS_SELECTOR, "div.RfADt a")
                new_links = 0
                for p in products:
                    href = p.get_attribute("href")
                    if href:
                        # The site sometimes emits protocol-relative URLs.
                        full_link = "https:" + href if href.startswith("//") else href
                        if full_link not in product_links:
                            product_links.add(full_link)
                            writer.writerow([full_link])
                            new_links += 1
                print(f"✅ Collected {len(product_links)} product links so far. ({new_links} new on this page)")
                # Check if "Next Page" is disabled — that means we're done.
                next_li = driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
                if next_li.get_attribute("aria-disabled") == "true":
                    print("🚫 No more pages.")
                    break
                # Click via JavaScript to avoid overlay/interception issues.
                next_button = next_li.find_element(By.TAG_NAME, "button")
                driver.execute_script("arguments[0].click();", next_button)
                page += 1
                retries = 0  # successful page → reset the retry budget
                time.sleep(random.uniform(2.5, 5.5))  # human-like pacing
            except TimeoutException:
                print("⚠️ Timeout or error occurred.")
                retries += 1
                if retries >= max_retries:
                    print("❌ Max retries reached. Stopping.")
                    break
                else:
                    print(f"🔁 Retrying page {page} ({retries}/{max_retries})...")
                    time.sleep(random.uniform(5, 8))

    print(f"\n🎉 Done! Total product links collected: {len(product_links)}")
finally:
    driver.quit()