抱歉,您的浏览器无法访问本站
本页面需要浏览器支持(启用)JavaScript
了解详情 >

爬取菜鸟教程,在学校无聊时看

这两百行代码是我在手机上打的,用手机那个小键盘打的!😡


使用方法

index_url 输入教程网址,如 https://www.runoob.com/css/css-tutorial.html.
downdir 输入下载的目录,就会把教程如 Python 的全部下载到 downdir
然后享受去吧

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import markdownify
import requests
from bs4 import BeautifulSoup
import re
import os
import sys
import signal

# 全局变量

# index_url = 'https://www.runoob.com/python/'
# downdir = '/storage/emulated/0/git/python-code/h/'

# 后续记得写速率控制,和input

index_url = str(input("index_url: "))
downdir = str(input("downdir: "))

if not downdir.endswith('/'):
downdir = downdir + "/"
# 文件路径生成
if not os.path.exists(downdir):
os.makedirs(downdir)
#------------------------

# 网络请求,直接返回text
def get_url(url,headers=None,encoding="",return_content_and_reality_url=False):
# 设置请求头
if headers is None:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# 发送请求
response = requests.get(url,headers=headers)
#设置编码
if encoding != "":
response.encoding = encoding
if return_content_and_reality_url:
x=[]
x.append(response.url) # url
x.append(response.content) # 内容
return(x)
return(response.content)



# 转 markdown
def to_markdown(html_text):
html_text = change_example_code_divs(html_text) # 处理实例代码
return(markdownify.markdownify(html_text))

# 写入文件,必须用绝对路径
def write_file(text,path,encoding='utf-8'):
with open(path, 'w', encoding=encoding) as f:
f.write(text)
f.close()
# def write_file(text,path):
# with open(path, 'w') as f:
# f.write(text)
# f.close()

# 追加写入文件
def write_file_a(text,path):
with open(path, 'a', encoding='utf-8') as file_a:
file_a.write(text)
file_a.close()

# 读取文件
def read_file(path,encoding='utf-8'):
x = open(path, 'r', encoding=encoding)
t = x.read()
x.close()
return(t)

# url_root
def get_root_url(url):
# 去掉任何末尾的路径部分,只保留域名和最后的路径部分
url = re.sub(r'/[^/]*$', '/', url)
return url

# 首页url
def get_base_url(url):
# 正则表达式匹配协议和域名部分
match = re.match(r'(https?://[^/]+)', url)
if match:
return match.group(1)
return None

# 安全文件名
def safe_filename(filename):
invalid_chars = '\\/:*?"<>|'
for char in invalid_chars:
filename = filename.replace(char, '_')
return filename

# 删除被<>包裹的内容
def remove_tags(text):
soup = BeautifulSoup(text, 'html.parser')
return soup.get_text()

def change_example_code_divs(html_content):
# 解析HTML内容
soup = BeautifulSoup(html_content, 'html.parser')

# 查找所有包含代码的容器
example_code_divs = soup.find_all('div', class_='example_code')

# 如果找到了包含代码的容器,进行处理
for example_code_div in example_code_divs:
# 提取所有文本内容并去除多余的HTML标签
code_lines = example_code_div.get_text()

# 去除多余的空行与空格
cleaned_code = re.sub(r'\n\s*\n', '\n', code_lines).strip()

# 构建标准化的 <pre><code> 结构
new_code_block = soup.new_tag('pre')
code_tag = soup.new_tag('code')
code_tag.string = cleaned_code
new_code_block.append(code_tag)

# 将原始代码块替换为新的标准代码块
example_code_div.replace_with(new_code_block)

# 输出转换后的HTML
return((soup.prettify()))

# 退出程序
def exit_ctrl_c(sig, frame):
print("\n退出程序...")
sys.exit(0)

print('载入函数成功')
#-------------------------

def main():
index_response = get_url(index_url,return_content_and_reality_url=True)
index_html = index_response[1]
index_url = index_response[0]
index_url_root = get_root_url(index_url)
root_url = get_base_url(index_url)

# 解析网页内容
print("解析网页内容")
index_soup = BeautifulSoup(index_html, 'html.parser')

# 建立 index 列表
print("建立 index 列表")

index_links = []
for link in index_soup.select('div.sidebar-box.gallery-list a'):
# 去除两侧空格
list_title = link.string.strip() if link.string else "No Title"
list_link = link.get('href')
if list_link.startswith('/'):
list_link = root_url + list_link
else:
list_link = index_url_root + list_link
index_links.append(list_title + ':::' + list_link)

print("-----------------------------")
print(index_links)
print("-----------------------------")
# 建立目录文件
directory_file = downdir + '目录.md'
write_file_a(f"## {index_soup.title.string}\n",directory_file)
# with open(directory_file, 'a', encoding='utf-8') as directory_file_x:
# directory_file_x.write(f"## {index_soup.title.string}\n")

# 循环爬取
print("循环爬取")
index_links_len = len(index_links)
for i,link in enumerate(index_links):
# print(link)
link = link.split(':::')
link_url = link[1]
link_html = get_url(link_url)
# 解析
link_soup = BeautifulSoup(link_html, 'html.parser')
# print(link_soup)
# 提取相关信息
# title
link_title_len = len(link_soup.select('.article-body h1'))
if link_title_len == 0:
link_title_len = len(link_soup.select('.article-body h2'))
if link_title_len == 0:
link_title = link[0]
else:
link_title = remove_tags(str(link_soup.select('.article-body h2')[0])).strip()
if len(link_title) > 35:
link_title = link[0]
else:
link_title = remove_tags(str(link_soup.select('.article-body h1')[0])).strip()
if len(link_title) > 35:
link_title = link[0]
# link_title = link[0]
print(f"[{i+1}/{index_links_len}]:{link_title}")
link_content = str(link_soup.select('.article-body')[0])
link_content_markdown = to_markdown(link_content)
# 生成文本内容
link_file_content = str(f"## [{link_title}]({link_url})\n\n[index](目录.md)\n\n---\n{link_content_markdown}")
# 文件路径
link_file_path = downdir + safe_filename(link_title) + ".md"
# 写入文件
write_file(link_file_content,link_file_path)
# 更新目录
link_url_top = safe_filename(link_title) + ".md"
write_file_a(f"- [{link_title}]({link_url_top.replace(" ","%20")})\n",directory_file)

print('success !')
print("\n退出程序...")
sys.exit(0)

if __name__ == "__main__":
signal.signal(signal.SIGINT, exit_ctrl_c)
main()

评论