-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrap.py
More file actions
63 lines (49 loc) · 1.63 KB
/
scrap.py
File metadata and controls
63 lines (49 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
import requests
from lxml import html
import unicodecsv as csv
# Landing page of the site; it is parsed for the list of area codes.
INITIAL_URL = 'http://www.creprice.cn/'

# Prefix of the per-area pages.  A complete page address looks like
# http://www.creprice.cn/market/distrank/city/zy.html?flag=1
DATAURL = 'http://www.creprice.cn/market/distrank/city/'

# Area code -> area text.  Starts out empty and is rebound further down,
# once the landing page has been scraped.
areas = {}
def getAreasList(url, timeout=30):
    """Collect the area codes advertised on the page at *url*.

    Every ``<span>`` element is inspected.  For each one, the first ``code``
    attribute found on the span or any of its descendants becomes the key,
    and the first text node found on the span or its descendants becomes
    the value.  When the same code appears more than once, the first
    occurrence wins.

    Args:
        url: Address of the page to download and parse.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict mapping each area code to the text found with it.

    Raises:
        requests.exceptions.RequestException: the download failed or did
            not complete within *timeout*.
    """
    # requests applies no timeout by default; without one a stalled server
    # would block the scraper indefinitely.
    resp = requests.get(url, timeout=timeout)
    tree = html.fromstring(resp.content)
    results = dict()
    for tag in tree.xpath('//*[self::span]'):
        code_id = tag.xpath('.//@code')
        area = tag.xpath('.//text()')
        # A span with a code but no text has nothing to use as a value;
        # skip it instead of failing with IndexError on area[0].
        if not code_id or not area:
            continue
        # setdefault keeps the first value seen for a repeated code.
        results.setdefault(code_id[0], area[0])
    return results
def getAreaDataRange(area, timeout=30):
    """List the month-specific links published on one area's page.

    The page at ``DATAURL + area + '.html?flag=1'`` is downloaded, and for
    every ``<a>`` that is a direct child of an ``<li>`` the first ``href``
    found on it (or its descendants) is kept when it contains the
    substring ``"month"``.  Links are returned in document order.

    Args:
        area: Area code used to build the page address.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list of href strings exactly as they appear in the page.
        NOTE(review): the driver code hands these straight to
        ``requests.get`` via ``getAreaData``, which presumes the site emits
        absolute URLs - confirm.

    Raises:
        requests.exceptions.RequestException: the download failed or did
            not complete within *timeout*.
    """
    url = DATAURL + area + '.html?flag=1'
    # requests applies no timeout by default; without one a stalled server
    # would block the scraper indefinitely.
    resp = requests.get(url, timeout=timeout)
    tree = html.fromstring(resp.content)
    hrefs = (tag.xpath('.//@href') for tag in tree.xpath('//li/a'))
    return [found[0] for found in hrefs if found and "month" in found[0]]
def getAreaData(area, url, timeout=30):
    """Scrape the table rows of one monthly page.

    The year and month are taken from the ``month=<year>-<month>`` part of
    *url*.  Each ``<tr>`` under a ``<tbody>`` is flattened into its list of
    text nodes (whitespace-only nodes included, each stripped), and the
    nodes at positions 1, 3, 5 and 9 are read from that list.

    Args:
        area: Value stored as the second field of every record.
        url: Page address; must contain ``month=<year>-<month>``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict mapping a row's position among the ``//tbody/tr`` matches to
        the tuple ``(date, area, cells[3], cells[5], cells[9])``, where
        ``date`` is ``cells[1] + '/' + month + '/' + year``.  Rows with
        fewer than ten text nodes are left out, so keys may have gaps.

    Raises:
        IndexError: *url* contains no ``month=``.
        ValueError: the ``month=`` value contains no ``-``.
        requests.exceptions.RequestException: the download failed or did
            not complete within *timeout*.
    """
    # Parse the URL before touching the network so a malformed address
    # fails immediately.  Anything after the month value (a further query
    # parameter) is cut off so it cannot leak into the date string.
    month_value = url.split('month=', 1)[1].split('&', 1)[0]
    year, month = month_value.split('-', 1)

    # requests applies no timeout by default; without one a stalled server
    # would block the scraper indefinitely.
    resp = requests.get(url, timeout=timeout)
    tree = html.fromstring(resp.content)
    results = dict()
    for i, tag in enumerate(tree.xpath('//tbody/tr')):
        # Do not drop empty strings here: the fixed positions read below
        # count the whitespace-only text nodes as well.
        cells = [text.strip() for text in tag.xpath('.//text()')]
        if len(cells) < 10:
            # The row cannot supply position 9; skip it rather than abort
            # the whole scrape with IndexError.
            continue
        date = cells[1] + '/' + month + '/' + year
        results[i] = (date, area, cells[3], cells[5], cells[9])
    return results
# Driver: discover every area on the landing page, then write one CSV row
# per scraped table row of every monthly page to report.csv.
areas = getAreasList(INITIAL_URL)
# Binary mode because unicodecsv encodes the rows itself and writes bytes.
# The with-block guarantees the file is flushed and closed even when a
# download or parse step raises part-way through.
with open('report.csv', 'wb') as f:
    writer = csv.writer(f)
    # items()/values() work on both Python 2 and 3, unlike
    # iteritems()/itervalues().
    for key, value in areas.items():
        print('Processing ' + key + ' ' + value)
        for url in getAreaDataRange(key):
            print('Processing url: ' + url)
            for line in getAreaData(value, url).values():
                writer.writerow(line)