文章目录
- 前言
- 一、演示
- 二、文件目录示意
- 三、使用步骤
- 1.引入库
- 2.界面控制程序
- 3.QT业务控制程序
- 4.批量修改文件名称
- 总结
前言
娱乐项目记载:爬取网络上的小说
一、演示
二、文件目录示意
三、使用步骤
1.引入库
代码如下(示例):
import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl
2.界面控制程序
main_pc.py:主要显示界面,消息发送,启动QT业务线程。
代码如下:
#_*_ coding:utf-8 _*_
"""Main window of the novel downloader (main_pc.py).

Workflow:
1. Fetch the book title.
2. Fetch the chapter links and chapter names.
3. Fetch the chapter content.
4. Save the content.
"""
import requests
from lxml import html  # lxml parses the pages downloaded with requests
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from html import escape as html_escape  # stdlib escaping for text shown as rich text
from PyQt5.QtGui import QIcon,QDesktopServices  # QIcon provides the window icon
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl


class LanFei_show_window(QMainWindow, Ui_MainWindow):
    """Main window: shows the UI, logs messages and starts the download thread."""

    # Seconds to wait for the site before giving up, so a dead connection
    # cannot block the caller forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        super().__init__()
        self.setupUi(self)  # build the widgets declared in Ui_MainWindow
        self.setWindowIcon(QIcon("./IMG/icon/icon.jpg"))
        self.setWindowTitle("测试使用")
        self.initUI()

    def initUI(self):
        """Wire the buttons and the log view, and pre-fill a sample address."""
        self.pushButton.clicked.connect(self.openurl)
        self.pushButton_2.clicked.connect(self.dowtext)
        self.lineEdit.setText("https://www.xtyxsw.org/read/280637/")
        # Links in the log are handled by click_textbrowser instead of being
        # navigated to. The signal is connected exactly once here; connecting
        # it after every finished download would make a single click open the
        # folder several times.
        self.textBrowser.setOpenLinks(False)
        self.textBrowser.setOpenExternalLinks(False)
        self.textBrowser.anchorClicked.connect(self.click_textbrowser)

    def click_textbrowser(self):
        """Open the current working directory in the system file manager."""
        self.msg = os.getcwd()
        QDesktopServices.openUrl(QUrl.fromLocalFile(self.msg))

    def openurl(self):
        """Open the address from the line edit in the default web browser."""
        geturl = self.lineEdit.text()
        print(geturl)
        print("打开网址:{}".format(geturl))
        if geturl != "":
            webbrowser.open(geturl)
        else:
            self.textBrowser.append("<font color=\"#FF0000\">{}:请先输入网址路径!</font> ".format(self.gettime()))

    def gettime(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def dowtext(self):
        """Slot of the download button: start a download or request a stop.

        The button label doubles as the state: "下载" starts a download,
        "停止" asks the running worker to stop.
        """
        geturl = self.lineEdit.text()
        state = self.pushButton_2.text()
        if state == "下载":
            if geturl == "":
                self.textBrowser.append("<font color=\"#FF0000\">{}:请先输入网址路径!</font> ".format(self.gettime()))
                return
            try:
                self.test(geturl)
            except Exception as err:
                # Slot boundary: network failures or an unexpected page layout
                # are reported in the log instead of escaping into Qt, and the
                # button keeps its "下载" label because nothing was started.
                self.textBrowser.append("<font color=\"#FF0000\">{}:下载失败:{}</font> ".format(
                    self.gettime(), html_escape(str(err))))
                return
            self.pushButton_2.setText("停止")
        elif state == "停止":
            self.worker.change_ret()

    def test(self, url):
        """Collect the title and chapter list of *url* and start the worker.

        The title and chapter index are fetched synchronously in the calling
        thread; the chapters themselves are downloaded by the Worker thread.
        """
        book_name = self.get_book_url(url)
        print("获取书名:" + book_name)
        self.textBrowser.append("{}:".format(self.gettime()) + "获取书名--" + book_name)
        htmls_list, name_list = self.get_dir(url)
        self.data = [book_name, name_list, htmls_list]
        # Create the worker thread and route its progress messages to receive().
        self.worker = Worker(msg=self.data)
        self.worker.finished.connect(self.receive)
        self.worker.start()

    def get_url(self, url):
        """Download *url* and return its body decoded as UTF-8.

        Raises requests.RequestException on network errors, timeouts and
        HTTP error statuses.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def get_book_url(self, url):
        """Return the book title found on the index page *url*.

        Raises IndexError when the title XPath matches nothing.
        """
        selector = html.fromstring(self.get_url(url))
        shumin = selector.xpath('/html/body/div[3]/div[2]/div/span/text()')
        if not shumin:
            raise IndexError("未能从页面解析出书名")
        return shumin[0]

    def get_dir(self, url):
        """Return (chapter_links, chapter_names) parsed from the index page."""
        selector = html.fromstring(self.get_url(url))
        htmls_list = [str(link) for link in
                      selector.xpath('//div[@class = "link_14"]/dl/dd/a/@href')]
        names_list = [str(name) for name in
                      selector.xpath('//div[@class = "link_14"]/dl/dd/a/text()')]
        print("每章节链接:" + str(htmls_list))
        print("每章节目录:" + str(names_list))
        print(len(names_list))
        return htmls_list, names_list

    def receive(self, text=None):
        """Show a progress message sent by the worker.

        *text* is a two-item list ``[code, message]``: code 1 is logged in
        blue, code 2 in red, and code 3 is logged as a link that opens the
        download folder and resets the button label to "下载".
        """
        if not text:
            return
        code = text[0]
        # The message is inserted into rich text, so markup characters in it
        # (for example inside an exception text) must not be read as tags.
        message = html_escape(str(text[1]))
        stamp = self.gettime()
        if code == 1:
            self.textBrowser.append("<font color=\"#0000FF\">{}:{}</font> ".format(stamp, message))
        elif code == 2:
            self.textBrowser.append("<font color=\"#FF0000\">{}:{}</font> ".format(stamp, message))
        elif code == 3:
            # Point the link at the working directory, which is what
            # click_textbrowser opens, instead of a literal "%s" placeholder.
            folder_url = QUrl.fromLocalFile(os.getcwd()).toString()
            self.textBrowser.append("<a href=\"{}\">{}:{}</a>".format(folder_url, stamp, message))
            self.pushButton_2.setText("下载")


if __name__ == "__main__":
    app = QApplication(sys.argv)
    ui2 = LanFei_show_window()
    ui2.show()
    sys.exit(app.exec_())
Ui_dowondstory.py:pyqt程序
代码如下:
# -*- coding: utf-8 -*-# Form implementation generated from reading ui file 'd:\pythonitem\爬虫小说\dowondstory.ui'
#
# Created by: PyQt5 UI code generator 5.15.11
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget layout of the downloader window (originally emitted by pyuic5)."""

    def setupUi(self, MainWindow):
        """Create the child widgets of *MainWindow* and lay them out in a grid."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(579, 368)

        central = QtWidgets.QWidget(MainWindow)
        central.setObjectName("centralwidget")
        self.centralwidget = central

        grid = QtWidgets.QGridLayout(central)
        grid.setObjectName("gridLayout")
        self.gridLayout = grid

        # Row 0: caption, address field and the two buttons.
        self.label = QtWidgets.QLabel(central)
        self.label.setObjectName("label")
        grid.addWidget(self.label, 0, 0, 1, 1)

        self.lineEdit = QtWidgets.QLineEdit(central)
        self.lineEdit.setObjectName("lineEdit")
        grid.addWidget(self.lineEdit, 0, 1, 1, 1)

        self.pushButton = QtWidgets.QPushButton(central)
        self.pushButton.setObjectName("pushButton")
        grid.addWidget(self.pushButton, 0, 2, 1, 1)

        self.pushButton_2 = QtWidgets.QPushButton(central)
        self.pushButton_2.setObjectName("pushButton_2")
        grid.addWidget(self.pushButton_2, 0, 3, 1, 1)

        # Row 1: the text browser spans all four columns.
        self.textBrowser = QtWidgets.QTextBrowser(central)
        self.textBrowser.setObjectName("textBrowser")
        grid.addWidget(self.textBrowser, 1, 0, 1, 4)

        MainWindow.setCentralWidget(central)

        menu_bar = QtWidgets.QMenuBar(MainWindow)
        menu_bar.setGeometry(QtCore.QRect(0, 0, 579, 23))
        menu_bar.setObjectName("menubar")
        self.menubar = menu_bar
        MainWindow.setMenuBar(menu_bar)

        status_bar = QtWidgets.QStatusBar(MainWindow)
        status_bar.setObjectName("statusbar")
        self.statusbar = status_bar
        MainWindow.setStatusBar(status_bar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the user-visible texts through the Qt translation layer."""
        def tr(source_text):
            return QtCore.QCoreApplication.translate("MainWindow", source_text)

        MainWindow.setWindowTitle(tr("MainWindow"))
        self.label.setText(tr("下载地址:"))
        self.pushButton.setText(tr("打开"))
        self.pushButton_2.setText(tr("下载"))
3.QT业务控制程序
pangchong.py:爬取章节小说的业务执行程序
代码如下:
import requests
import os
from lxml import html #调用lxml模块和requests模块
import time
import time
from PyQt5.QtCore import QThread,pyqtSignal
import threading


class Worker(QThread):
    """Background thread that downloads every chapter of one book.

    ``msg`` is ``[book_name, chapter_names, chapter_links]``. Progress is
    reported through ``finished`` as ``[code, message]``: 1 means a chapter
    was written, 2 means stopped or failed, 3 means the run has ended.
    """

    finished = pyqtSignal(list)

    # Site root that the relative chapter links are appended to.
    BASE_URL = "https://www.xtyxsw.org"
    # Seconds to wait for the site before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Characters that are not allowed in Windows file names.
    _ILLEGAL_NAME_CHARS = str.maketrans("", "", '\\/:*?"<>|')

    def __init__(self, msg=None):
        super().__init__()
        self.msg = msg
        self.ret = "True"  # set to "break" by change_ret() to request a stop

    def run(self):
        """Download the chapters one by one until done or asked to stop."""
        book_name, name_list, htmls_list = self.msg[0], self.msg[1], self.msg[2]
        for number in range(len(name_list)):
            if self.ret == "break":
                self.finished.emit([2, "已停止下载!"])
                break
            # This method already runs in the worker thread, so save() is
            # called directly. Wrapping the call as
            # Thread(target=self.save(...)) executed save() right away and
            # started a thread with target=None that did nothing.
            try:
                self.save(book_name, name_list, htmls_list, number)
            except Exception as err:
                # Thread boundary: report the failed chapter and go on with
                # the next one instead of letting the exception escape run().
                self.finished.emit([2, "第{}章下载失败:{}".format(number + 1, err)])
        # NOTE: emitted after a stop as well; the main window resets its
        # button only on code 3.
        self.finished.emit([3, "完成下载!"])

    def change_ret(self):
        """Ask run() to stop before it starts the next chapter."""
        self.ret = "break"

    def get_url(self, url):
        """Download *url* and return its body decoded as UTF-8.

        Raises requests.RequestException on network errors, timeouts and
        HTTP error statuses.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def get_neirong(self, htmls_list, number):
        """Return the paragraphs of chapter *number* as a list of strings.

        When the page contains a "下一页" link, the paragraphs of that one
        following page are appended as well.
        """
        url = self.BASE_URL + htmls_list[number]
        print("网址:" + url)
        selector = html.fromstring(self.get_url(url))
        liebiao = list(selector.xpath('//div[@id="content"]/p/text()'))
        if "下一页" in selector.xpath('//a/text()'):
            dizhi = selector.xpath('//a/@href')
            print(dizhi)
            # NOTE(review): relies on the next-page link being the fourth
            # link from the end of the page; verify against the site layout.
            next_page = html.fromstring(self.get_url(self.BASE_URL + dizhi[-4]))
            liebiao.extend(next_page.xpath('//div[@id="content"]/p/text()'))
        return liebiao

    def save(self, book_name, name_list, htmls_list, number):
        """Download chapter *number* and write it to ``<cwd>/<book_name>/<chapter>.txt``.

        Emits ``[1, message]`` after the file is written and then pauses for
        half a second before returning.
        """
        path = os.path.join(os.getcwd(), str(book_name))
        os.makedirs(path, exist_ok=True)
        if number < 0:
            return
        liebiao = self.get_neirong(htmls_list, number)
        # Drop every character Windows rejects in file names, not only "?".
        mulu = str(name_list[int(number)]).translate(self._ILLEGAL_NAME_CHARS)
        paths = os.path.join(path, mulu + ".txt")
        with open(paths, "w", encoding="utf-8") as file:
            file.writelines(wenzhi + "\n" for wenzhi in liebiao)
        h = "完成第" + str(int(number) + 1) + "章写入!"
        print(h)
        self.finished.emit([1, h])
        time.sleep(0.5)

    def finisheds(self, i, h=None):
        """Emit ``[i, h]`` through the finished signal."""
        self.finished.emit([i, h])
4.批量修改文件名称
xiugainame.py:将文件名中的汉字数字全部转换为阿拉伯数字
修改前和修改后的显示图片
>xiugainame.py:修改文件名称程序
代码如下:
import os
import re

"""Rename files by converting Chinese numerals in their names to Arabic digits."""

path = "./末日重生:开局囤积SSS级卡牌小说"

# Values of the digit and unit characters that make up a Chinese numeral.
_DIGITS = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
           "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
_UNITS = {"十": 10, "百": 100, "千": 1000}
_NUMERAL_RUN = re.compile("[零一二三四五六七八九十百千]+")


def chinese_numeral_to_arabic(numeral):
    """Return the Arabic-digit string for a run of Chinese numeral characters.

    A run without unit characters is translated digit by digit
    ("一二三" -> "123"); a run with units is evaluated as a number
    ("一百二十三" -> "123", "十五" -> "15").
    """
    if not any(char in _UNITS for char in numeral):
        return "".join(str(_DIGITS[char]) for char in numeral)
    total = 0
    pending = 0  # digit waiting for its unit
    for char in numeral:
        if char in _DIGITS:
            pending = _DIGITS[char]
        else:
            # A unit without a leading digit counts once ("十" is 10).
            total += (pending or 1) * _UNITS[char]
            pending = 0
    return str(total + pending)


def convert_file_name(name):
    """Return *name* with every run of Chinese numerals replaced by digits."""
    return _NUMERAL_RUN.sub(
        lambda match: chinese_numeral_to_arabic(match.group()), name)


def rename_files(directory):
    """Rename every entry of *directory* whose name contains Chinese numerals."""
    files = os.listdir(directory)
    print(files)
    for shuju in files:
        combined_string = convert_file_name(shuju)
        print(combined_string)
        if combined_string == shuju:
            continue
        old_path = os.path.join(directory, shuju)
        new_path = os.path.join(directory, combined_string)
        # Never replace a file that already carries the target name.
        if os.path.exists(new_path):
            print("目标文件已存在,跳过:" + combined_string)
            continue
        try:
            os.rename(old_path, new_path)
        except FileNotFoundError:
            print("源文件未找到")
        except PermissionError:
            print("权限不足,无法修改文件名")


if __name__ == "__main__":
    rename_files(path)
总结
娱乐使用,仅供参考。不同网站的页面格式可能不同,请大家自行钻研,嘿嘿。