Java实现HTML转PDF,主要为了解决将ai返回的html文本数据转为PDF文件方便用户下载查看。
一、deepSeek-AI提问词
基于以上个人数据。总结个人身体信息,分析个人身体指标信息。再按一个月为维度,详细列举一个月内训练计划,维度详细至每周每天,要求:不可省略表格内容以精简示例,文本结构顺序为标题个人信息,第一步,第二步。最终回答结果以标准的html形式返回结果,不能带有meta标签,字体为STSong-Light,SimSun,html内容禁止使用单标签。
二、表设计
-- Task table for DeepSeek requests: one row per prompt, with its status, retry count and result.
CREATE TABLE `p_deep_seek_task` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '任务id',
  `user_id` bigint(20) NOT NULL COMMENT '用户id',
  `status` char(2) NOT NULL DEFAULT '0' COMMENT '任务状态(0:待处理,1:处理中,2:处理成功,3:异常处理失败,4:重试中,5:重试失败)',
  `try_time` int(11) NOT NULL DEFAULT '0' COMMENT '执行次数',
  `result_url` varchar(255) DEFAULT NULL COMMENT '结果文件url',
  `prompt` longtext NOT NULL COMMENT '提问内容',
  `content` longtext COMMENT '结果内容',
  `reasoning_content` longtext COMMENT '思考过程',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `create_by` bigint(20) DEFAULT NULL COMMENT '创建人',
  `update_by` bigint(20) DEFAULT NULL COMMENT '更新人',
  `update_time` datetime DEFAULT NULL COMMENT '更新时间',
  `task_time` date DEFAULT NULL COMMENT '任务日期',
  `execute_time` datetime DEFAULT NULL COMMENT '执行时间',
  `exception_msg` longtext COMMENT '异常信息',
  `cost_time` bigint(20) NOT NULL DEFAULT '0' COMMENT '执行耗时(s)',
  -- The comma after the cost_time column is required; without it the statement fails to parse.
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8mb4 COMMENT='deepseek任务';
三、导入Jar包
<dependency><groupId>io.github.pig-mesh.ai</groupId><artifactId>deepseek-spring-boot-starter</artifactId></dependency><dependency><groupId>org.xhtmlrenderer</groupId><artifactId>flying-saucer-pdf</artifactId><version>9.1.22</version> </dependency><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.14.3</version> </dependency>
四、工具类
ai请求工具
package com.company.project.service.client;import io.github.pigmesh.ai.deepseek.config.DeepSeekProperties;
import io.github.pigmesh.ai.deepseek.core.DeepSeekClient;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionRequest;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionResponse;
import org.springframework.stereotype.Service;import javax.annotation.Resource;/*** @author: reshui* description:DeepSeek服务* DateTime:2025/3/31-14:48*/
@Service
public class DeepSeekAiClient {@Resourceprivate DeepSeekClient deepSeekClient;@Resourceprivate DeepSeekProperties deepSeekProperties;/*** 提问接口* 获取deepseek的响应结果* @param prompt 提示词*/public ChatCompletionResponse syncChat(String prompt) {ChatCompletionRequest request = ChatCompletionRequest.builder()// 根据渠道模型名称动态修改这个参数.model(deepSeekProperties.getModel()).addUserMessage(prompt).build();return deepSeekClient.chatCompletion(request).execute();}}
-
特定html字符内容过滤工具
package com.company.project.service.tools;import cn.hutool.core.collection.CollUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import java.util.regex.Matcher;
import java.util.regex.Pattern;/*** @author reshui* description* dateTime 2025/04/17*/
public class HtmlFormatter {private String htmlContent;private HtmlFormatter(String htmlContent) {this.htmlContent = htmlContent;}// 入口方法,创建处理器实例public static HtmlFormatter process(String htmlContent) {return new HtmlFormatter(htmlContent);}// 链式方法:前置过滤干扰字符public HtmlFormatter beforeFilter() {this.htmlContent = beforeFilterInterferenceCharacters(this.htmlContent);return this;}// 链式方法:替换内容标签public HtmlFormatter replaceTags() {this.htmlContent = replaceContentTag(this.htmlContent);return this;}// 链式方法:后置过滤干扰字符public HtmlFormatter afterFilter() {this.htmlContent = afterFilterInterferenceCharacters(this.htmlContent);return this;}// 获取最终结果public String get() {return this.htmlContent;}public static String formatHtml(String htmlContent) {// 过滤掉html中的干扰字符String filteredHtml = beforeFilterInterferenceCharacters(htmlContent);// 去除内容中的大于小于号干扰String replaceContentTag = replaceContentTag(filteredHtml);//过滤html中的干扰标签return afterFilterInterferenceCharacters(replaceContentTag);}/*** 替换html中的干扰内容** @param html 文本*/public static String replaceContentTag(String html) {Document doc = Jsoup.parse(html);removeTag(doc);traverse(doc.body());doc.outputSettings().prettyPrint(false);return doc.html();}/*** 去除不支持的meta标签* @param doc jsoupdoc*/public static void removeTag(Document doc) {Elements meta = doc.getElementsByTag("meta");for (Element metaElement : meta) {metaElement.remove();}}public static void traverse(Element element) {if (CollUtil.isEmpty(element.children())) {String text = element.text().replace("<", "小于").replace(">", "大于");element.text(text);}for (Element child : element.children()) { // 遍历子元素traverse(child); // 递归调用以处理子元素及其子元素}}/*** 后置过滤掉html中的干扰字符** @param html 文本*/public static String afterFilterInterferenceCharacters(String html) {return html.replace("<br></br>", "<br/>").replace("<br>", "<br/>").replace("</br>", "<br/>");}/*** 前置过滤掉html中的干扰字符** @param html 文本*/public static String beforeFilterInterferenceCharacters(String html) {return html.replace("```html", 
"").replace("```", "").replace("<!DOCTYPE html>", "").replace("<!doctype html>", "");}/*** 将HTML字符串中的所有标签转为小写** @param html 原始HTML字符串* @return 转换后的HTML字符串*/public static String convertTagsToLowerCase(String html) {// 正则表达式匹配HTML标签Pattern pattern = Pattern.compile("</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");Matcher matcher = pattern.matcher(html);StringBuffer result = new StringBuffer();while (matcher.find()) {// 将匹配到的标签转为小写String lowerCaseTag = matcher.group().toLowerCase();matcher.appendReplacement(result, lowerCaseTag);}matcher.appendTail(result);return result.toString();}}
-
html转pdf工具
package com.company.project.service.tools;import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import com.company.project.common.utils.SpringUtils;
import com.company.project.service.properties.PdfFontProperties;
import lombok.extern.slf4j.Slf4j;
import org.xhtmlrenderer.pdf.ITextRenderer;import java.io.File;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Date;/*** @author: reshui* description: html转pdf工具类* DateTime:2025/3/31-15:55*/
@Slf4j
public class HtmlToPdfTools {/*** 文件暂存地址*/private static final String TEMP_FILE_PATH = System.getProperty("java.io.tmpdir");/*** pdf文件暂存地址*/private static final String PDF_FILE_PATH = TEMP_FILE_PATH + File.separator + "ai_train_pdf";/*** 时间格式*/private static final String TIMESTAMP_FORMAT = "yyyyMMddHHmmss";/*** pdf配置文件*/public final static PdfFontProperties CONFIG = SpringUtils.getBean(PdfFontProperties.class);/*** html转pdf文件** @param htmlContent html内容文本*/public static File convertHtmlToPdfFile(String htmlContent) throws Exception {String formatDateTimeStamp = DateUtil.format(new Date(), TIMESTAMP_FORMAT);String pdfFilePath = PDF_FILE_PATH + File.separator + formatDateTimeStamp + ".pdf";FileUtil.touch(pdfFilePath);String resultHtmlContent = HtmlFormatter.process(htmlContent).beforeFilter().replaceTags().afterFilter().get();generatePdfReport(pdfFilePath, resultHtmlContent);log.info("pdf文件储存地址:{}", pdfFilePath);return new File(pdfFilePath);}/*** 生成pdf文件** @param outputPath 输出文件地址* @param htmlContent html内容文本*/public static void generatePdfReport(String outputPath, String htmlContent) throws Exception {try (OutputStream outputStream = Files.newOutputStream(Paths.get(outputPath))) {ITextRenderer renderer = new ITextRenderer();renderer.getFontResolver().addFont(CONFIG.getPath(),CONFIG.getEncoding(),CONFIG.getEmbedded());
// renderer.getFontResolver().addFont(
// "c://Windows//Fonts//simsun.ttc",
// "Identity-H",
// true
// );renderer.setDocumentFromString(htmlContent);renderer.layout();renderer.createPDF(outputStream);}}public static void main(String[] args) throws Exception {String html = "";convertHtmlToPdfFile(html);}
}
配置文件yml
# deepseek配置文件
deepseek:
  base-url: https://api.deepseek.com/v1
  api-key: xxxxxxxxxxxxx
  model: deepseek-reasoner
  connectTimeout: 60
  readTimeout: 240
  callTimeout: 360

# windows-pdf字体配置(与下方linux配置二选一,同一文件中不能出现两个pdf节点)
pdf:
  font:
    path: c://Windows//Fonts//simsun.ttc
    encoding: Identity-H
    embedded: true

# linux-pdf字体配置(部署到linux时启用,并注释掉上面的windows配置)
# pdf:
#   font:
#     path: /usr/share/fonts/simsun/simsun.ttc
#     encoding: Identity-H
#     embedded: true