-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstat_scraper.py
More file actions
145 lines (131 loc) · 5.44 KB
/
stat_scraper.py
File metadata and controls
145 lines (131 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from sys import stdout
import logging
from datetime import datetime
import time
from openpyxl import load_workbook
import undetected_chromedriver as uc
from selenium_stealth import stealth
import os
from os import path
from scraper_meta import ScraperBase
import traceback
from configparser import ConfigParser
# Default settings. The __main__ block reassigns each of them from
# masterconfig.ini.
# Path assigned to the Chrome options' binary_location.
bin_dir = 'chrome-bin/chrome.exe'
# Passed to uc.Chrome as version_main.
version_number = 102
# When true, the 'zenmate-cookies' profile directory is used and the
# plugin/zenmate extension is loaded into the browser.
use_proxy = True
# Selects the console log level (DEBUG when true, INFO otherwise).
debug_mode = False
# Workbook handed to read_links(); its column A holds the links.
link_list = 'link_list.xlsx'
def check_create_dir(dirname):
    '''
    Checks if directory exists and if it doesn't creates a new directory,
    together with any missing parent directories.

    Nothing is done when the path already exists (whatever it points to).

    :param dirname: Path to directory
    '''
    if not path.exists(dirname):
        # makedirs copes with single and nested paths alike, whichever path
        # separator is used; exist_ok covers the case where the directory is
        # created by someone else between the check above and this call.
        os.makedirs(dirname, exist_ok=True)
def read_links(inputfile):
    '''
    Reads the links stored in column A of the workbook's active sheet.

    Empty cells (no value, or only whitespace) are skipped so that callers
    never receive None or blank entries in place of a link.

    :param inputfile: Path to the workbook to read
    :return: List with the values of the non-empty cells of column A
    '''
    wb = load_workbook(inputfile)
    ws = wb.active
    links = []
    for cell in ws['A']:
        value = cell.value
        # An untouched cell has value None; a blank string is no link either.
        if value is None or not str(value).strip():
            continue
        links.append(value)
    return links
# Script entry point: sets up logging, loads masterconfig.ini, starts an
# undetected Chrome instance and runs the matching ScraperBase subclass for
# every link read from the link workbook.
if __name__ == '__main__':
    print('StatScraper')

    # Init logging
    rootLogger = logging.getLogger()
    consoleHandler = logging.StreamHandler(stdout)
    check_create_dir('logs')
    log_timestamp = datetime.now()
    fileHandler = logging.FileHandler(
        path.join('logs', 'StatScraper{0}.log'.format(log_timestamp.strftime('%d-%m-%y-%H-%M-%S'))))
    fileHandler.setFormatter(logging.Formatter('%(asctime)s:-[%(name)s] - %(levelname)s - %(message)s'))
    rootLogger.addHandler(consoleHandler)
    rootLogger.addHandler(fileHandler)
    rootLogger.setLevel(logging.DEBUG)
    # Only let errors through from these third-party loggers.
    for noisy_logger in (
            'seleniumwire.handler',
            'selenium.webdriver.remote.remote_connection',
            'seleniumwire.server',
            'hpack.hpack',
            'hpack.table',
            'seleniumwire.storage',
    ):
        logging.getLogger(noisy_logger).setLevel(logging.ERROR)
    # Provisional console level based on the built-in default; it is applied
    # again below once the configured debug_mode is known.
    consoleHandler.setLevel(logging.DEBUG if debug_mode else logging.INFO)
    fileHandler.setLevel(logging.DEBUG)
    consoleHandler.setFormatter(logging.Formatter('[%(name)s] - %(levelname)s - %(message)s'))

    rootLogger.info('Reading config')
    config = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse, so fail with a clear message instead of an
    # opaque KeyError on the first lookup below.
    if not config.read('masterconfig.ini'):
        rootLogger.error('Could not read masterconfig.ini')
        raise FileNotFoundError('masterconfig.ini')
    link_list = config['links']['link_file']
    bin_dir = config['browser']['bin_dir']
    version_number = int(config['browser']['version'])
    use_proxy = config.getboolean('script_options', 'use_proxy')
    debug_mode = config.getboolean('script_options', 'debug_mode')
    # The console level has to follow the configured debug_mode, which is
    # only known at this point.
    consoleHandler.setLevel(logging.DEBUG if debug_mode else logging.INFO)

    rootLogger.debug('Version: 19-10-22(1)')
    rootLogger.debug('Starting config report')
    rootLogger.debug('link_list: %s', link_list)
    rootLogger.debug('bin_dir: %s', bin_dir)
    rootLogger.debug('version_number: %s', version_number)
    rootLogger.debug('use_proxy: %s', use_proxy)
    rootLogger.debug('debug_mode: %s', debug_mode)

    rootLogger.debug('Preparing dump file')
    check_create_dir('dump')
    out_path = os.path.join('dump', 'ScrapedStats{}.xlsx'.format(log_timestamp.strftime('%d-%m-%y-%H-%M-%S')))

    # Separate browser profile directories for the proxy and non-proxy runs.
    cookie_file = 'zenmate-cookies' if use_proxy else 'chrome-data'

    rootLogger.info('Initiating driver')
    options = uc.ChromeOptions()
    options.user_data_dir = cookie_file
    options.binary_location = bin_dir
    # NOTE(review): this hands three flags to Chrome as ONE argument; they
    # probably need to be added separately to take effect - confirm before
    # changing, since it alters how the browser is started.
    options.add_argument('--no-first-run --no-service-autorun --password-store=basic')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--disk-cache-size=1073741824')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--dns-prefetch-disable')
    options.add_argument('--hide-scrollbars')
    options.add_argument("--disable-infobars")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-browser-side-navigation')
    options.add_argument('--log-level=0')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--disable-plugins-discovery")
    options.add_argument("--start-maximized")
    if use_proxy:
        options.add_argument(
            f"--load-extension={os.path.join(os.path.dirname(os.path.abspath(__file__)), 'plugin', 'zenmate')}")
    driver = uc.Chrome(headless=False, options=options, version_main=version_number)

    # From here on the browser is running: always shut it down, even when
    # something outside the per-link error handling fails.
    try:
        rootLogger.debug('Setting up stealth')
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )

        for link_ele in read_links(link_list):
            rootLogger.info('Scanning: %s', link_ele)
            try:
                # Fixed pause before every link.
                time.sleep(10)
                # Pick the first scraper whose keyword occurs in the link.
                scrape_obj = None
                for scrape_ele in ScraperBase.__subclasses__():
                    if scrape_ele.get_keyword() in link_ele:
                        rootLogger.info('%s website detected', scrape_ele.get_keyword())
                        scrape_obj = scrape_ele(link_ele, out_path, driver)
                        break
                if scrape_obj is not None:
                    scrape_obj.run_browser()
                else:
                    raise Exception('Website not supported')
            except Exception as exc:
                # A failing link must not stop the run: report it and go on.
                # Details are logged at DEBUG level only.
                rootLogger.error('Error with execution. Omitting link')
                rootLogger.debug('Details: %s', exc)
                rootLogger.debug('Traceback: %s', traceback.format_exc())
    finally:
        # quit() closes every window and ends the driver session.
        driver.quit()
    rootLogger.info('Goodbye')