大学时选修了 Python 课,课程设计要求我们用爬虫爬取数据、写入文件,然后再做数据分析(词云图、地图分类等)。Python 已经记不太清了,现在用 Java 尝试爬取数据。
爬虫分为三个步骤:1. 获取自己电脑访问网站时的请求头;2. 确定目标网站的 URL;3. 从爬取到的网页返回内容中切分出有用的部分。
package com.example.concurrent;import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import java.io.FileOutputStream;
import java.io.IOException;import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;public class BookLibrary {// 目标榜单URL 序号 作者 作品 类型 进度 字数 作品积分private static final String TARGET_URL = "https://www.jjwxc.net/topten.php?orderstr=7&t=1";public static void main(String[] args) {List<Novel> novels = crawlRankList();
// novels.forEach(System.out::println);writeNovelsToExcel(novels,"/Users/xuejiangjing/Documents/novels.xlsx");}public static void writeNovelsToExcel(List<Novel> novelList, String outputPath) {try (Workbook workbook = new XSSFWorkbook()) { // 创建.xlsx格式工作簿Sheet sheet = workbook.createSheet("小说列表"); // 创建工作表// 创建表头行String[] headers = {"排名", "作者", "书名", "类型", "进度", "总字数", "投票数", "更新时间", "简介"};Row headerRow = sheet.createRow(0);for (int i = 0; i < headers.length; i++) {Cell cell = headerRow.createCell(i);cell.setCellValue(headers[i]);}// 填充数据行int rowNum = 1;for (Novel novel : novelList) {Row row = sheet.createRow(rowNum++);// 按字段顺序写入(需与Novel类构造参数顺序一致)row.createCell(0).setCellValue(novel.getRank());row.createCell(1).setCellValue(novel.getAuthor());row.createCell(2).setCellValue(novel.getTitle());row.createCell(3).setCellValue(novel.getType());row.createCell(4).setCellValue(novel.getProgress());row.createCell(5).setCellValue(novel.getTotalNUm());row.createCell(6).setCellValue(novel.getVotes());row.createCell(7).setCellValue(novel.getTime());row.createCell(8).setCellValue(novel.getJianjie());}// 自动调整列宽(可选)for (int i = 0; i < headers.length; i++) {sheet.autoSizeColumn(i);}// 写入文件try (FileOutputStream outputStream = new FileOutputStream(outputPath)) {workbook.write(outputStream);System.out.println("Excel文件已生成:" + outputPath);}} catch (IOException e) {throw new RuntimeException(e);}}/*** 爬取榜单数据*/public static List<Novel> crawlRankList() {List<Novel> novelList = new ArrayList<>();try {// 1. 模拟浏览器请求(关键反爬策略)- useragent放入你自己的请求头Document doc = Jsoup.connect(TARGET_URL).userAgent("Mozilla/5.0 Version/17.4 Safari").header("Accept-Language", "zh-CN,zh;q=0.9").timeout(10_000).get();String htmlContent = doc.html(); // 获取完整HTML内容(含格式)
// System.out.println(htmlContent);// 2. 定位榜单表格(需根据实际HTML结构调整选择器)
// Element table = doc.selectFirst("table.rank-table");Element targetTable = null;Elements tables = doc.select("table[width=984][border=0][align=center][cellpadding=0][cellspacing=1][bgcolor=#009900]");
// System.out.println("tables.size>>>" + tables.size());for (Element table : tables) {Element firstTd = table.selectFirst("td:eq(0)"); // 第一个tdif (firstTd != null && firstTd.text().trim().equals("序号")) {
// System.out.println("找到目标表格:\n" + table);targetTable = table;break;}}if (targetTable == null) {return new ArrayList<>();}Elements rows = targetTable.select("tr:has(td)"); // 跳过表头// 3. 解析每一行数据 序号 作者 作品 类型 进度 字数 作品积分 截止时间for (int i = 1; i < rows.size(); i++) {Elements cols = rows.get(i).select("td");if (cols.size() < 8) continue;
// System.out.println(cols.html() + ">>>>>>");String rank = cols.get(0).text();Element authorTd = cols.get(1);String author = "";Element links = authorTd.selectFirst("a");if (links != null) {author = links.text().trim();} else {System.out.println("未找到<a>标签");}Element authorTdss = cols.get(2);String bookName = "";String rawRel = "";Element link = authorTdss.selectFirst("a");if (link != null) {// 提取书名bookName = link.text().trim();// 提取并处理rel属性rawRel = link.attr("rel").replaceAll("<br>", "\n");// 输出结果
// System.out.println("书名: " + bookName);
// System.out.println("简介:\n" + rawRel);} else {System.out.println("未找到<a>标签");}String type = cols.get(3).text();String progress = cols.get(4).text();String totalNUm = cols.get(5).text();String votes = cols.get(6).text();String time = cols.get(7).text();novelList.add(new Novel(rank, author, bookName, type, progress, totalNUm, votes, time, rawRel));}// 4. 添加延迟防止封IPThread.sleep(8000);} catch (IOException | InterruptedException e) {e.printStackTrace();}return novelList;}/*** 小说数据实体类 序号 作者 作品 类型 进度 字数 作品积分*/static class Novel {private String rank;//序号private String author;//作者private String title;//作品private String type;//类型private String progress;//进度private String totalNUm;//字数private String votes;//积分private String time;//时间private String jianjie;//简介public Novel(String rank, String author, String title, String type, String progress, String totalNUm, String votes, String time, String jianjie) {this.rank = rank;this.author = author;this.title = title;this.type = type;this.progress = progress;this.totalNUm = totalNUm;this.votes = votes;this.time = time;this.jianjie = jianjie;}@Overridepublic String toString() {return "Novel{" +"rank='" + rank + '\'' +", author='" + author + '\'' +", title='" + title + '\'' +", type='" + type + '\'' +", progress='" + progress + '\'' +", totalNUm='" + totalNUm + '\'' +", votes='" + votes + '\'' +", time='" + time + '\'' +", jianjie='" + jianjie + '\'' +'}';}public String getRank() {return rank;}public void setRank(String rank) {this.rank = rank;}public String getAuthor() {return author;}public void setAuthor(String author) {this.author = author;}public String getType() {return type;}public void setType(String type) {this.type = type;}public String getTitle() {return title;}public void setTitle(String title) {this.title = title;}public String getProgress() {return progress;}public void setProgress(String progress) {this.progress = progress;}public String getTotalNUm() {return totalNUm;}public void setTotalNUm(String totalNUm) 
{this.totalNUm = totalNUm;}public String getTime() {return time;}public void setTime(String time) {this.time = time;}public String getVotes() {return votes;}public void setVotes(String votes) {this.votes = votes;}public String getJianjie() {return jianjie;}public void setJianjie(String jianjie) {this.jianjie = jianjie;}}}