Qubige (趣笔阁) Crawler Experiment

Parse the page structure with BeautifulSoup, crawl the pages of a specified novel, and save each chapter's content to a .txt file.

Possible improvements (to be updated):

1. Anti-scraping countermeasures (see the sketch after this list)

2. Multithreading (see the sketch after this list)

3. Saving as Markdown for nicer formatting (see the sketch after the full script)
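For items 1 and 2, here is a minimal sketch of what the chapter downloads could look like with a small thread pool plus a randomized delay and a rotating User-Agent. `fetch_chapter`, `fetch_all`, and the `USER_AGENTS` list are hypothetical names, not part of the script below, and the two UA strings are only placeholders.

import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Hypothetical pool of User-Agent strings to rotate through; this assumes
# the site only checks for a browser-like UA, nothing stronger.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

def fetch_chapter(url):
    # Randomized delay so requests do not arrive at a fixed rate
    time.sleep(random.uniform(0.5, 2.0))
    header = {'user-agent': random.choice(USER_AGENTS)}
    resp = requests.get(url, headers=header, timeout=10)
    resp.encoding = resp.apparent_encoding
    return url, resp.text

def fetch_all(urls, workers=4):
    # Overlap the network waits with a small pool; 4 workers is a guess
    # at a polite setting, not a measured one.
    results = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(fetch_chapter, u) for u in urls]
        for fut in as_completed(futures):
            url, text = fut.result()
            results[url] = text
    return results

Whether this is enough depends entirely on what the site enforces; real anti-scraping work may also need proxies or cookie handling, which this sketch does not attempt.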

import os
import re
import time

import requests
import markdownify  # used in getConcent below; pip install markdownify
from bs4 import BeautifulSoup
def getHtml(url, param=None, encoding=None):
    # Fetch the HTML text of the given url
    try:
        # Build a browser-like request header
        header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"}
        # Send the request
        html = requests.get(url, headers=header, timeout=10)
        # Choose an encoding, falling back to the detected one
        if encoding is None:
            encoding = html.apparent_encoding
        html.encoding = encoding
        # Save the html text
        content = html.text
        # Close the connection
        html.close()
        # Sleep 1s to avoid requesting too quickly
        time.sleep(1)
        # Return the page content
        return content
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None  # None signals failure


def getLink(content):
    soup = BeautifulSoup(content, "html.parser")
    res = []
    # Chapter titles live inside <dd> tags
    titles = soup.find_all('dd')
    # Extract the text of each chapter link
    for title in titles:
        a_tag = title.find('a')
        if a_tag:
            name = a_tag.text
            res.append(name)
    print(res)
    return res


def save(name, passage_content, path):
    # Build the file path; the file name is the chapter title
    file_path = os.path.join(path, f"{name}.txt")
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in passage_content:
            # Collapse runs of whitespace into newlines to restore paragraphs
            formatted_content = re.sub(r'\s{2,}', '\n', content.text)
            file.write(formatted_content + "\n")  # trailing newline keeps paragraphs apart
    print(f"Chapter {name} saved")
def saveImg(title, imgLink, path):
    # Build the file path; the file name is the book title
    file_path = os.path.join(path, f"{title}.jpg")
    response = requests.get(imgLink, timeout=10)
    # Write the raw image bytes to disk
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Image {title} saved")
    else:
        print(f"Failed to save image {title}")


def getMain(content):
    soup = BeautifulSoup(content, "html.parser")
    name = ''
    img = ''
    # The book title and cover image sit inside <div class="info">
    titles = soup.find_all('div', attrs={"class": "info"})
    for title in titles:
        img_tag = title.find('img')
        img = img_tag['src']    # cover image link
        name = img_tag['alt']   # book title
    return name, img


def getConcent(root, titles, path):
    # Chapter pages are numbered, e.g. <root>/12.html, so pull the number
    # out of titles like "第12章 ..." (assumes Arabic numerals in the title)
    pat = re.compile(r'第(.*?)章')
    for title in titles:
        res = pat.search(title)
        if res:
            print(res.groups())
            page = res.group(1)
            url = root + page + ".html"
            content = getHtml(url)
            if content is None:
                continue  # skip chapters whose request failed
            soup = BeautifulSoup(content, "html.parser")
            passage = []
            # The chapter body sits inside <div id="chaptercontent">
            passage_content = soup.find_all("div", attrs={"id": "chaptercontent"})
            for item in passage_content:
                passage.append(item.text)
                print(item.text)
                # Debug preview of a Markdown conversion (improvement 3)
                print(markdownify.markdownify(item.text))
            save(title, passage_content, path)
if __name__ == "__main__":
    try:
        # Landing page of the target novel
        url = "https://www.3bqg.cc/book/152484/"
        # Root of the output directory
        path = "./novel/"
        # Fetch the novel's landing page
        content = getHtml(url)
        if content is None:
            raise RuntimeError("failed to fetch the landing page")
        title, img = getMain(content)
        # Make sure the per-book output directory exists
        os.makedirs(path + title, exist_ok=True)
        saveImg(title, img, path + title)
        titles = getLink(content)
        getConcent(url, titles, path + title)
    except Exception as e:
        print(f"Program error: {e}")
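For item 3, here is a minimal sketch of a Markdown-flavored variant of save(); save_markdown is a hypothetical helper, not part of the script above. Note that markdownify must be given the HTML markup (str(tag)): the script's debug call converts item.text, which is already plain text, so there is nothing for it to do.

import os

import markdownify

def save_markdown(name, passage_content, path):
    # Hypothetical variant of save(): convert the chapter's HTML to
    # Markdown instead of flattening it to plain text.
    file_path = os.path.join(path, f"{name}.md")
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"# {name}\n\n")  # chapter title as a heading
        for tag in passage_content:
            # Pass the tag's markup, not tag.text, so <br> and <p>
            # survive as Markdown line breaks
            file.write(markdownify.markdownify(str(tag)) + "\n")
    print(f"Chapter {name} saved as Markdown")

Swapping save(title, passage_content, path) for save_markdown(title, passage_content, path) inside getConcent would be enough to try it.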
