-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapper.py
More file actions
61 lines (54 loc) · 2.15 KB
/
scrapper.py
File metadata and controls
61 lines (54 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Scrapes Reddit submissions from the Pushshift API for a list of subreddits,
# paging backwards in time, and writes each subreddit's posts
# (title, url, first 300 chars of selftext) to "<index>_raw_data.csv"
# once `threshold` posts have been collected.
#
# Example query:
# https://api.pushshift.io/reddit/search/submission/?subreddit=learnpython&sort=desc&sort_type=created_utc&before=1523934121&size=1000
import requests
import pandas as pd

subreddit_list = [
    "nfl", "AskReddit", "worldnews", "politics", "gaming", "science",
    "interestingasfuck", "showerthoughts", "freefolk", "todayilearned",
    "anime", "nba", "The_Donald", "teenagers", "soccer", "jokes",
    "EarthPorn", "askreddit", "movies", "mma",
]
base = "https://api.pushshift.io/reddit/search/submission/?subreddit="

# Sentinel meaning "no page fetched yet for the current subreddit"; any real
# created_utc timestamp is far smaller, so min() always replaces it.
NO_EARLIEST = 1000000000000000

threshold = 50000    # posts to collect per subreddit before writing its CSV
MIN_TITLE_LEN = 15   # posts with titles this short (or shorter) are skipped


def _build_url(subreddit, earliest_utc):
    """Build the Pushshift query URL for one page of `subreddit`.

    After the first page we page backwards by adding `before=<oldest seen
    created_utc>`; the first page has no `before` clause.
    """
    trail = "&sort=desc&sort_type=created_utc&size=1000&score=>50"
    if earliest_utc != NO_EARLIEST:
        trail = "&before=" + str(earliest_utc) + trail
    return base + subreddit + trail


def _extract_point(post):
    """Return [title, url, selftext-snippet] for one Pushshift post dict.

    Missing 'url'/'selftext' fields become empty strings; selftext is
    truncated to 300 chars with newlines stripped.
    """
    url = post.get('url', "")
    selftext = post.get('selftext', "")[0:300].replace("\n", "")
    return [post['title'], url, selftext]


def scrape(start=19):
    """Scrape subreddits from `subreddit_list[start:]`, saving CSVs as it goes.

    NOTE(review): the original script hard-coded i = 19, so by default only
    the last subreddit ("mma") is scraped; pass start=0 to process all.
    """
    i = start
    sub_data = []               # accumulated [title, url, selftext] rows
    earliest_utc = NO_EARLIEST  # oldest timestamp seen for current subreddit
    while i < len(subreddit_list):
        subreddit = subreddit_list[i]
        print(subreddit)
        r = requests.get(_build_url(subreddit, earliest_utc))
        posts = r.json()
        print(r.status_code)
        data = posts['data']
        if not data:
            # Subreddit exhausted before reaching threshold: report how many
            # posts were collected, reset state, move to the next subreddit.
            print(len(sub_data))
            i += 1
            sub_data = []
            earliest_utc = NO_EARLIEST
            continue
        print(earliest_utc)  # oldest timestamp before consuming this page
        for post in data:
            earliest_utc = min(earliest_utc, int(post['created_utc']))
            if len(post['title']) > MIN_TITLE_LEN:
                sub_data.append(_extract_point(post))
        if len(sub_data) >= threshold:
            # Enough posts: dump exactly `threshold` rows and advance.
            df = pd.DataFrame(sub_data[0:threshold])
            df.to_csv(str(i) + "_raw_data.csv")
            sub_data = []
            earliest_utc = NO_EARLIEST
            i += 1


if __name__ == "__main__":
    scrape()