-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping.py
More file actions
89 lines (73 loc) · 2.9 KB
/
scraping.py
File metadata and controls
89 lines (73 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
# ---------- CONFIG ----------
# Daraz search results for "Smartphones", sorted by price descending, page 1.
URL = "https://www.daraz.com.bd/catalog/?from=hp_categories&page=1&q=Smartphones&sort=pricedesc"
OUTPUT_FILE = "daraz_smartphones_page1.csv"

# ---------- SETUP ----------
# Start Chrome maximized and tolerant of TLS certificate problems.
opts = Options()
for flag in ("--start-maximized", "--ignore-certificate-errors"):
    opts.add_argument(flag)
driver = webdriver.Chrome(options=opts)
# ---------- OPEN PAGE ----------
driver.get(URL)

# ---------- WAIT & SCROLL ----------
# Block (up to 30 s) until the products grid is present in the DOM.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-qa-locator='general-products']"))
)

# Repeatedly scroll to the bottom; when the document height stops growing,
# all lazy-loaded products have been fetched.
prev_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the lazy loader time to append more cards
    if (curr_height := driver.execute_script("return document.body.scrollHeight")) == prev_height:
        break
    prev_height = curr_height
# ---------- SELECT PRODUCTS ----------
# Each product card on the results page is wrapped in an <a> inside the grid.
products_container = driver.find_element(By.CSS_SELECTOR, "div[data-qa-locator='general-products']")
products = products_container.find_elements(By.TAG_NAME, "a")
all_products = []
for product in products:
    # Name: prefer the anchor's title attribute; fall back to the first
    # visible text line of the card.
    # (Bare `except:` replaced with `except Exception:` so Ctrl-C and
    # interpreter-exit signals are not swallowed.)
    try:
        name = product.get_attribute("title")
        if not name:
            name = product.text.split('\n')[0]
    except Exception:  # stale/detached element — treat as missing
        name = ""
    # Price: inside a <span class="ooOxS"> (site-generated class name —
    # TODO confirm it still matches the live markup).
    try:
        price = product.find_element(By.CSS_SELECTOR, "span.ooOxS").text
    except Exception:
        price = ""
    # Image: lazily loaded, so the URL may still be in data-src before it
    # is promoted to src — check both.
    try:
        img_elem = product.find_element(By.TAG_NAME, "img")
        image = img_elem.get_attribute("src") or img_elem.get_attribute("data-src") or ""
    except Exception:
        image = ""
    # NOTE: the per-iteration time.sleep(2) calls were removed — the page is
    # fully loaded after the scroll phase, so they only added ~4 s per product.
    if name:  # skip anchors that are not product cards (no title/text)
        all_products.append({
            "name": name,
            "price": price,
            "image": image
        })
print(f"Total products found on page 1: {len(all_products)}")
# ---------- SAVE TO CSV ----------
# Write every scraped row; if nothing was scraped, fall back to the known
# field names so the CSV still gets a well-formed header.
fieldnames = list(all_products[0]) if all_products else ["name", "price", "image"]
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as fh:
    writer = csv.DictWriter(fh, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_products)
print(f"Saved to {OUTPUT_FILE}")

# ---------- CLEAN UP ----------
driver.quit()