-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraping.py
More file actions
89 lines (73 loc) · 2.9 KB
/
scraping.py
File metadata and controls
89 lines (73 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
# ---------- CONFIG ----------
# Daraz search results for "Smartphones", sorted by price descending, page 1.
URL = "https://www.daraz.com.bd/catalog/?from=hp_categories&page=1&q=Smartphones&sort=pricedesc"
OUTPUT_FILE = "daraz_smartphones_page1.csv"

# ---------- SETUP ----------
# Start Chrome maximized and tolerant of TLS certificate problems.
opts = Options()
for flag in ("--start-maximized", "--ignore-certificate-errors"):
    opts.add_argument(flag)
driver = webdriver.Chrome(options=opts)
# ---------- OPEN PAGE ----------
driver.get(URL)

# ---------- WAIT & SCROLL ----------
# Block (up to 30 s) until the products grid is present in the DOM.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-qa-locator='general-products']"))
)

# Repeatedly scroll to the bottom; when the document height stops growing,
# all lazy-loaded products have been fetched.
prev_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the lazy loader time to append more cards
    if (curr_height := driver.execute_script("return document.body.scrollHeight")) == prev_height:
        break
    prev_height = curr_height
# ---------- SELECT PRODUCTS ----------
# Each product card on the results page is wrapped in an <a> inside the grid.
products_container = driver.find_element(By.CSS_SELECTOR, "div[data-qa-locator='general-products']")
products = products_container.find_elements(By.TAG_NAME, "a")
all_products = []
for product in products:
    # Name: prefer the anchor's title attribute; fall back to the first
    # visible text line of the card.
    # (Bare `except:` replaced with `except Exception:` so Ctrl-C and
    # interpreter-exit signals are not swallowed.)
    try:
        name = product.get_attribute("title")
        if not name:
            name = product.text.split('\n')[0]
    except Exception:  # stale/detached element — treat as missing
        name = ""
    # Price: inside a <span class="ooOxS"> (site-generated class name —
    # TODO confirm it still matches the live markup).
    try:
        price = product.find_element(By.CSS_SELECTOR, "span.ooOxS").text
    except Exception:
        price = ""
    # Image: lazily loaded, so the URL may still be in data-src before it
    # is promoted to src — check both.
    try:
        img_elem = product.find_element(By.TAG_NAME, "img")
        image = img_elem.get_attribute("src") or img_elem.get_attribute("data-src") or ""
    except Exception:
        image = ""
    # NOTE: the per-iteration time.sleep(2) calls were removed — the page is
    # fully loaded after the scroll phase, so they only added ~4 s per product.
    if name:  # skip anchors that are not product cards (no title/text)
        all_products.append({
            "name": name,
            "price": price,
            "image": image
        })
print(f"Total products found on page 1: {len(all_products)}")
# ---------- SAVE TO CSV ----------
# Write every scraped row; if nothing was scraped, fall back to the known
# field names so the CSV still gets a well-formed header.
fieldnames = list(all_products[0]) if all_products else ["name", "price", "image"]
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as fh:
    writer = csv.DictWriter(fh, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_products)
print(f"Saved to {OUTPUT_FILE}")

# ---------- CLEAN UP ----------
driver.quit()