博客
关于我
aspose html转pdf_Java实现Word/Pdf/TXT转html
阅读量:806 次
发布时间:2023-01-24

本文共 8223 字,大约阅读时间需要 27 分钟。

Java实现文档类型(Word、PDF、TXT)转换为HTML的解决方案

作为一个从事网络课程模块开发的技术人员,处理文档类型(Word、PDF、TXT)的转换与展示问题是一个常见课题。为了满足在线学习和统计的需求,传统的文档下载方式已经不适用。因此,我们采取了将文档文件转换为HTML的方案,以便在网页上展示与学习。

一、Word转换为HTML

1. 引入依赖

为了实现Word文件到HTML的转换,我们需要引入以下依赖项:

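<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>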

2. 代码实现

package com.svse.controller;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
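import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;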
public class TestWordToHtml {
public static final String STORAGEPATH="C://works//files//";
public static final String IP="192.168.30.222";
public static final String PORT="8010";
public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
TestWordToHtml wt=new TestWordToHtml();
// wt.Word2003ToHtml("甲骨文考证.doc");
wt.Word2007ToHtml("甲骨文考证.docx");
}
/**
* 2003版本word转换为html
* @throws IOException
* @throws TransformerException
* @throws ParserConfigurationException
*/
public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
String imagepath = STORAGEPATH + "fileImage/";
String strRanString = getRandomNum();
String filepath = STORAGEPATH;
String htmlName = fileName.substring(0, fileName.indexOf(".")) + "2003.html";
String file = filepath + fileName;
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(new File(file)));
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
File imgPath = new File(imagepath);
if (!imgPath.exists()) {
imgPath.mkdirs();
}
File file = new File(imagepath + strRanString + suggestedName);
try {
OutputStream os = new FileOutputStream(file);
os.write(content);
os.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return "http://" + IP + ":" + PORT + "//uploadFile/fileImage/" + strRanString + suggestedName;
}
});
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
File htmlFile = new File(filepath + strRanString + htmlName);
OutputStream outStream = new FileOutputStream(htmlFile);
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
outStream.close();
System.out.println("生成html文件路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
}
/**
* 2007版本word转换为html
* @throws IOException
*/
public void Word2007ToHtml(String fileName) throws IOException {
String strRanString = getRandomNum();
String filepath = STORAGEPATH + strRanString;
String htmlName = fileName.substring(0, fileName.indexOf(",")) + "2007.html";
File f = new File(STORAGEPATH + fileName);
if (!f.exists()) {
System.out.println("Sorry File does not Exists!");
} else {
if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
try {
InputStream in = new FileInputStream(f);
XWPFDocument document = new XWPFDocument(in);
File imageFolderFile = new File(filepath);
XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
options.setExtractor(new FileImageExtractor(imageFolderFile));
options.URIResolver(new IURIResolver() {
public String resolve(String uri) {
return "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + "/" + uri;
}
});
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
OutputStream out = new FileOutputStream(new File(filepath + htmlName));
XHTMLConverter.getInstance().convert(document, out, options);
System.out.println("html路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
} catch (Exception e) {
e.printStackTrace();
}
} else {
System.out.println("Enter only MS Office 2007+ files");
}
}
}
public static String getRandomNum() {
Date dt = new Date();
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
String str = sdf.format(dt);
return str;
}
}

二、PDF转换为HTML

1. 引入依赖

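<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>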

2. 代码实现

/**
 * Demo converter that renders a PDF file as HTML using PDFBox and pdf2dom's {@code PDFDomTree}.
 */
public class PdfToHtml {
    /**
     * Converts the PDF at {@code inPdfPath} into an HTML file at {@code outputHtmlPath}.
     *
     * <p>The HTML is written as UTF-8. Any failure is printed to standard error and not
     * propagated to the caller.
     *
     * @param inPdfPath path of the PDF file to read
     * @param outputHtmlPath path of the HTML file to create or overwrite
     */
    public void pdfToHtmlTest(String inPdfPath, String outputHtmlPath) {
        // Both resources are closed on every path; closing the writer also flushes its
        // buffer, which the output file needs in order to be complete.
        try (PDDocument document = PDDocument.load(new File(inPdfPath));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)), "utf-8"))) {
            PDFDomTree pdfDomTree = new PDFDomTree();
            pdfDomTree.writeText(document, out);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts the sample PDF.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException {
        PdfToHtml ph = new PdfToHtml();
        String pdfPath = "C:/works/files/武研中心行政考勤制度.pdf";
        String outputPath = "C:/works/files/武研中心行政考勤制度.html";
        ph.pdfToHtmlTest(pdfPath, outputPath);
    }
}

三、TXT转换为HTML

代码实现

/**
 * Converts a GBK-encoded plain-text file into an HTML fragment, one output line per input line.
 *
 * <p>Each line is HTML-escaped, prefixed with three non-breaking spaces and terminated with
 * {@code <br/>} so that the original line structure survives rendering. The output is written
 * as GBK. A missing source file and any I/O failure are reported on standard output and not
 * propagated to the caller.
 *
 * @param filePath path of the text file to read
 * @param htmlPosition path of the HTML file to create or overwrite
 */
public static void txtToHtml(String filePath, String htmlPosition) {
    try {
        File file = new File(filePath);
        if (!file.isFile()) {
            System.out.println("找不到指定的文件");
            return;
        }
        // try-with-resources closes the reader and the writer even when a read or write fails.
        try (BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(file), "GBK"));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), "GBK"))) {
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                // '&' is replaced first so the entities produced for '<' and '>' are not re-escaped.
                String escaped = lineTxt.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
                bw.write("&nbsp;&nbsp;&nbsp;" + escaped + "<br/>");
                bw.newLine();
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
}

这份文档详细介绍了如何将不同类型文档(Word、PDF、TXT)转换为HTML的实现方法,适合用于在线教育平台的课程展示和学习统计需求。

转载地址:http://baeyk.baihongyu.com/

你可能感兴趣的文章
nodejs学习笔记一——nodejs安装
查看>>
NodeJS实现跨域的方法( 4种 )
查看>>
nodejs封装http请求
查看>>
nodejs常用组件
查看>>
nodejs开发公众号报错 40164,白名单配置找不到,竟然是这个原因
查看>>
Nodejs异步回调的处理方法总结
查看>>
NodeJS报错 Fatal error: ENOSPC: System limit for number of file watchers reached, watch ‘...path...‘
查看>>
Nodejs教程09:实现一个带接口请求的简单服务器
查看>>
nodejs服务端实现post请求
查看>>
nodejs框架,原理,组件,核心,跟npm和vue的关系
查看>>
Nodejs模块、自定义模块、CommonJs的概念和使用
查看>>
nodejs生成多层目录和生成文件的通用方法
查看>>
nodejs端口被占用原因及解决方案
查看>>
Nodejs简介以及Windows上安装Nodejs
查看>>
nodejs系列之express
查看>>
nodejs系列之Koa2
查看>>
Nodejs连接mysql
查看>>
nodejs连接mysql
查看>>
NodeJs连接Oracle数据库
查看>>
nodejs配置express服务器,运行自动打开浏览器
查看>>