Java实现文档类型（Word、PDF、TXT）转换为HTML的解决方案

作为一个从事网络课程模块开发的技术人员，处理文档类型（Word、PDF、TXT）的转换与展示问题是一个常见课题。为了满足在线学习和统计的需求，传统的文档下载方式已经不适用。因此，我们采取了将文档文件转换为HTML的方案，以便在网页上展示与学习。

一、Word转换为HTML

1. 引入依赖

为了实现Word文件到HTML的转换，我们需要引入以下依赖项：

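<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>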

2. 代码实现

package com.svse.controller;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

public class TestWordToHtml {
    public static  final String STORAGEPATH="C://works//files//";
    public static  final String IP="192.168.30.222";
    public static  final String PORT="8010";
    public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
        TestWordToHtml wt=new TestWordToHtml();
        // wt.Word2003ToHtml("甲骨文考证.doc");
        wt.Word2007ToHtml("甲骨文考证.docx");
    }
    /** 
     * 2003版本word转换为html
     * @throws IOException
     * @throws TransformerException
     * @throws ParserConfigurationException
     */
    public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
        String imagepath = STORAGEPATH + "fileImage/"; 
        String strRanString = getRandomNum();
        String filepath = STORAGEPATH;
        String htmlName = fileName.substring(0, fileName.indexOf(".")) + "2003.html";
        String file = filepath + fileName;
        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(new File(file)));
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                File imgPath = new File(imagepath);
                if (!imgPath.exists()) {
                    imgPath.mkdirs();
                }
                File file = new File(imagepath + strRanString + suggestedName);
                try {
                    OutputStream os = new FileOutputStream(file);
                    os.write(content);
                    os.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return "http://" + IP + ":" + PORT + "//uploadFile/fileImage/" + strRanString + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();
        File htmlFile = new File(filepath + strRanString + htmlName);
        OutputStream outStream = new FileOutputStream(htmlFile);
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(outStream);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        outStream.close();
        System.out.println("生成html文件路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
    }
    /** 
     * 2007版本word转换为html
     * @throws IOException
     */
    public void Word2007ToHtml(String fileName) throws IOException {
        String strRanString = getRandomNum();
        String filepath = STORAGEPATH + strRanString;
        String htmlName = fileName.substring(0, fileName.indexOf(",")) + "2007.html";
        File f = new File(STORAGEPATH + fileName);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
        } else {
            if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
                try {
                    InputStream in = new FileInputStream(f);
                    XWPFDocument document = new XWPFDocument(in);
                    File imageFolderFile = new File(filepath);
                    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
                    options.setExtractor(new FileImageExtractor(imageFolderFile));
                    options.URIResolver(new IURIResolver() {
                        public String resolve(String uri) {
                            return "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + "/" + uri;
                        }
                    });
                    options.setIgnoreStylesIfUnused(false);
                    options.setFragment(true);
                    OutputStream out = new FileOutputStream(new File(filepath + htmlName));
                    XHTMLConverter.getInstance().convert(document, out, options);
                    System.out.println("html路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                System.out.println("Enter only MS Office 2007+ files");
            }
        }
    }
    public static String getRandomNum() {
        Date dt = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        String str = sdf.format(dt);
        return str;
    }
}

二、PDF转换为HTML

1. 引入依赖

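<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>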

2. 代码实现

/**
 * Converts a PDF file to an HTML file using PDFBox to load the document and
 * pdf2dom's {@code PDFDomTree} to write it out.
 */
public class PdfToHtml {
    /**
     * Renders the PDF at {@code inPdfPath} as UTF-8 encoded HTML at
     * {@code outputHtmlPath}. Any failure is printed and not propagated.
     *
     * @param inPdfPath path of the PDF to read
     * @param outputHtmlPath path of the HTML file to create or overwrite
     */
    public void pdfToHtmlTest(String inPdfPath, String outputHtmlPath) {
        // The PDF is loaded first so that a missing or unreadable input does not
        // leave an empty HTML file behind. Resources are closed in reverse
        // order: the writer is flushed and closed before the document is released.
        try (PDDocument document = PDDocument.load(new File(inPdfPath));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)), "utf-8"))) {
            PDFDomTree pdfDomTree = new PDFDomTree();
            pdfDomTree.writeText(document, out);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException {
        PdfToHtml ph = new PdfToHtml();
        String pdfPath = "C:/works/files/武研中心行政考勤制度.pdf";
        String outputPath = "C:/works/files/武研中心行政考勤制度.html";
        ph.pdfToHtmlTest(pdfPath, outputPath);
    }
}

三、TXT转换为HTML

代码实现

/**
 * Converts a GBK-encoded text file into a GBK-encoded HTML fragment.
 *
 * <p>Each input line becomes one output line: three non-breaking spaces of
 * indent, the HTML-escaped text, and a {@code <br/>} so the line structure of
 * the text file survives in the browser. If {@code filePath} does not denote an
 * existing regular file a message is printed and nothing is written. Errors
 * are printed and not propagated.
 *
 * @param filePath path of the text file to read
 * @param htmlPosition path of the HTML file to create or overwrite
 */
public static void txtToHtml(String filePath, String htmlPosition) {
    try {
        File file = new File(filePath);
        if (!(file.isFile() && file.exists())) {
            System.out.println("找不到指定的文件");
            return;
        }
        // try-with-resources closes (and flushes) both streams on every path,
        // including when reading or writing fails half-way.
        try (BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(file), "GBK"));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), "GBK"))) {
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                // Plain spaces collapse in HTML and readLine() drops the line
                // terminator, so the indent and the break are emitted as markup.
                bw.write("&nbsp;&nbsp;&nbsp;" + escapeHtml(lineTxt) + "<br/>");
                bw.newLine();
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
}

/** Escapes the characters that would otherwise be interpreted as HTML markup. */
private static String escapeHtml(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}

这份文档详细介绍了如何将不同类型文档（Word、PDF、TXT）转换为HTML的实现方法，适合用于在线教育平台的课程展示和学习统计需求。

转载地址：http://baeyk.baihongyu.com/

你可能感兴趣的文章