""" 确定目标网站:https://www.wxscs.com/book/9422/ 内容页: """ #引入网页请求模块 import requests #网页主界面 url = "https://www.wxscs.com/book/9422/" #伪造亲求头部 headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0" } # 发起伪造请求 response = requests.get(url, headers=headers) # 设置响应编码 response.encoding = "UTF-8" # 查看响应数据 content = response.text #的打印html页面 print(content)import re# <a href="/book/9422/1874033.html" title="第九章 壶娱中秋节" target="_blank">第九章 壶娱中秋节</a> #写出对应正则表达式 p = r'<a href="(/book/9422/187.*?)"\s+title=".*?"\s+target="_blank">(第.*?)</a>' chs = re.findall(p,content) print(chs)chapter = {} for ch in chs:chapter_url = "https://www.wxscs.com" + ch[0]chapter_title = ch[1]chapter[chapter_title] = chapter_url # 最终链接数据 print(chapter) import json with open("chapters.txt",mode="wt",encoding="UTF-8") as file:json.dump(chapter,file) #得到一个文件 文件内是章节目录
""" 章节内数据 """import requests,re import time,random import json #找到文件 with open("chapters.txt",encoding="UTF-8") as file:chs = json.load(file)# print(chs)headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0" }#分离标题和网页链接 for title,url in chs.items():print(f"准备采集{title}\n")response = requests.get(url,headers=headers)response.encoding = "UTF-8"html = response.text# print(html)print("---------------------") #正则找到想要的内容p = r'<div id="cont-body"\s+class="cont-body 187.*?">.*?<script>.*?</script>(.*?)</div>'content = re.search(p,html,re.DOTALL)content = content.group(1).strip()# 数据清晰p2 = r'(<p>|</p>)'content = re.sub(p2, '\n', content, re.X)# content = "\n".join(content)# print(content) #将数据输出为一个文件with open("杨戬.txt",mode="at",encoding="UTF-8") as file:file.write("\n\n---------------\n\n")file.write("\n\n"+title+"\n\n")file.write(content) #休眠伪造真人操作time.sleep(random.randint(5,10))print(f"{title}采集完成")