scrap.py
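"""Scrape user reviews from a paginated review listing and dump them to JSON.

Sketch of the script's intent, inferred from the code below: the CSS classes
(user_review_holder, review_header, stars-rating, ...), the URL layout
(.../<company>-reviews/page/<n>) and the output directory /scrap_data/ are
taken as-is from the source and assume a specific target site's markup.
"""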
import json
import sys
import time

import requests
from bs4 import BeautifulSoup
def extract_doc(rev, company):
    """Turn one review <article> element into a flat dict."""
    doc = {}
    doc["company"] = company
    # Review id
    doc["review_id"] = rev['data-review']
    # Review date (the data-date attribute holds an epoch timestamp)
    doc["date"] = time.strftime('%d/%m/%Y', time.localtime(int(rev['data-date'])))
    # Review language
    doc["language"] = rev['data-language']
    # Review header
    doc["title"] = rev.find("h3", class_="review_header").text.strip()
    # Retrieve content
    content_raw = rev.find("div", class_="review-summary")\
                     .find("div", class_="toggle-content")
    # Clean up tags: drop links and spans, keep only the plain text
    for tag in content_raw.find_all('a'):
        tag.replace_with('')
    for tag in content_raw.find_all('span'):
        tag.replace_with('')
    doc["content"] = content_raw.text.replace("\r", "")\
                                      .replace("\t", "")\
                                      .replace("\n", "")\
                                      .strip()
    # Retrieve overall score
    doc["overall_rating"] = float(rev.find("div", class_="review-rating")
                                     .find("span", class_="review-score").text)
    # Retrieve detailed scores
    ratings = rev.find("ul", class_="user_review_rating_list")\
                 .find_all("div", class_="stars-rating")
    # Rating details
    doc["reliability_rating"] = float(ratings[0]['data-rating'])
    doc["pricing_rating"] = float(ratings[1]['data-rating'])
    doc["user_friendly_rating"] = float(ratings[2]['data-rating'])
    doc["support_rating"] = float(ratings[3]['data-rating'])
    doc["features_rating"] = float(ratings[4]['data-rating'])
    return doc
# Write all collected reviews to a single JSON file
def write_json(doc_array, company):
    # Assumes /scrap_data/ exists and is writable (e.g. a mounted volume)
    with open("/scrap_data/" + company + '.json', 'w') as json_file:
        json.dump(doc_array, json_file)
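# Each entry of the dumped JSON array has the shape produced by extract_doc,
# roughly (values below are illustrative only):
#
#   {
#     "company": "acme",
#     "review_id": "12345",
#     "date": "31/12/2020",
#     "language": "en",
#     "title": "Great service",
#     "content": "...",
#     "overall_rating": 4.5,
#     "reliability_rating": 5.0,
#     "pricing_rating": 4.0,
#     "user_friendly_rating": 4.0,
#     "support_rating": 5.0,
#     "features_rating": 4.0
#   }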
def main():
    # Expect exactly one command line argument: the company's review listing URL
    if len(sys.argv[1:]) != 1:
        print("Usage : python3 scrap.py url")
        return 1
    url = sys.argv[1]
    # Array that will contain all reviews
    doc_array = []
    # Paginated listing: pages live under <url>page/<n>
    base_url = url + "page/"
    current_page = 1
    page = requests.get(base_url + str(current_page))
    # Test page existence
    if page.status_code == 404:
        return 1
    # Retrieve the number of pages for that company so we can loop over them
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        num_links = int(soup.find_all("a", class_='page-numbers')[-1].text)
    except (IndexError, ValueError):
        # No pagination links found: assume a single page of reviews
        num_links = 1
    # Retrieve company name from the URL (assumes .../<company>-reviews/ is the fifth URL segment)
    company = base_url.split("/")[4].replace("-reviews", "")
    # Loop over the pages
    for current_page in range(1, num_links + 1):
        # TODO: replace with logging
        print(str(current_page) + "/" + str(num_links))
        # Parse page content for reviews
        page = requests.get(base_url + str(current_page))
        soup = BeautifulSoup(page.content, 'html.parser')
        review_contents = soup.find_all("article", class_="user_review_holder")
        for rev in review_contents:
            # Extract each review as a dict
            doc = extract_doc(rev, company)
            doc_array.append(doc)
    # Write everything down to file
    write_json(doc_array, company)


if __name__ == "__main__":
    main()
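# Example invocation (the URL is hypothetical; the real listing URL depends on
# the target site's structure, e.g. .../<something>/<company>-reviews/):
#
#   python3 scrap.py https://www.example.com/hosting-company/acme-reviews/
#
# This would crawl every .../acme-reviews/page/<n> page and write the collected
# reviews to /scrap_data/acme.json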