-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTestTask.py
More file actions
79 lines (72 loc) · 2.33 KB
/
TestTask.py
File metadata and controls
79 lines (72 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
import sys
import validators
import requests
from bs4 import BeautifulSoup
import os
def format(text):
    """Re-wrap *text* into lines of roughly 80 characters.

    All pre-existing newlines are stripped first; then the text is scanned
    space by space, and whenever the current line would exceed 80 columns
    the most recent space is turned into a newline.  The string length is
    never changed (one space becomes one '\\n' per break).
    """
    text = text.replace('\n', '')   # flatten: remove existing line breaks
    total = len(text)
    line_start = 0    # index where the current output line begins
    prev_space = 0    # last space seen before the scan position
    pos = 0
    while pos < total:
        pos = text.find(' ', pos)
        if pos == -1:
            break
        if pos - line_start > 80:
            # This word would overflow the line: break at the previous space.
            text = text[:prev_space] + '\n' + text[prev_space + 1:]
            line_start = prev_space + 1
        prev_space = pos
        pos += 1
    return text
def parse(text):
    """Convert an HTML document into plain, 80-column-wrapped text.

    Each <a> element is replaced by its visible text followed by its
    target in brackets: "label [href]".  The result is the text of every
    <h1> heading, then every <p> paragraph (wrapped via format()), each
    followed by a blank line.

    Fixes over the original: uses the modern bs4 API names
    (get_text / replace_with instead of the legacy camelCase aliases)
    and no longer shadows the *text* parameter inside the loop.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Replace links with "label [href]" so the URL survives in plain text.
    for link in soup.select('a'):
        label = link.get_text()
        try:
            label += ' [' + link['href'] + ']'
        except KeyError:
            # Anchor without an href (e.g. a named anchor): keep it as-is;
            # its text still appears via the surrounding paragraph.
            continue
        link.replace_with(label)
    res = ''
    for heading in soup.find_all('h1'):      # article title(s)
        res += heading.get_text()
        res += '\n\n'
    for paragraph in soup.find_all('p'):     # article body
        res += format(paragraph.get_text())
        res += '\n\n'
    return res
def saveFile(url, text):
    """Save *text* in a directory tree mirroring the URL's path.

    For "http://host/a/b/page.html" the content is written to
    <cwd>/host/a/b/page.txt.  *text* may be str (written as UTF-8)
    or pre-encoded bytes (written verbatim).

    Fixes over the original: portable separator (os.sep instead of a
    hard-coded backslash), makedirs(exist_ok=True) instead of catching
    WindowsError (a NameError on non-Windows platforms), bytes-aware
    opening (the caller passes UTF-8 bytes, which would crash a
    text-mode file), try/finally so the working directory is restored
    even if the write fails, and index-based path splitting so a
    directory component equal to the filename no longer truncates the
    path early.
    """
    homedir = os.getcwd()
    parts = url.split('/')
    last = len(parts) - 1
    if parts[last] == '':        # URL ends with '/': drop the empty tail
        last -= 1
    # parts[0] is the scheme, parts[1] is '' (between the '//' slashes);
    # the host and intermediate path parts become directories, and the
    # final part (extension replaced by .txt) becomes the file name.
    addr = ''
    for part in parts[2:last]:
        addr += os.sep + part
    filename = parts[last].split('.')[0] + '.txt'
    target = homedir + addr
    os.makedirs(target, exist_ok=True)
    os.chdir(target)
    print("save: " + addr + os.sep + filename + '\n')
    try:
        if isinstance(text, bytes):
            # Caller may pass pre-encoded UTF-8 bytes.
            with open(filename, 'wb') as outFile:
                outFile.write(text)
        else:
            with open(filename, 'w', encoding='utf-8') as outFile:
                outFile.write(text)
    finally:
        os.chdir(homedir)        # always restore the working directory
# Entry point: every command-line argument is treated as a URL to fetch,
# convert to plain wrapped text, and save to disk.
for url_arg in sys.argv[1:]:
    if not validators.url(url_arg):
        # Guard clause: skip anything that is not a well-formed URL.
        print(url_arg + ' is not valid url')
        continue
    print("get")
    page = requests.get(url_arg)        # download the page
    print("parse")
    plain = parse(page.text)            # HTML -> wrapped plain text
    saveFile(url_arg, plain.encode('utf-8'))