Java实现文档类型（Word、PDF、TXT）转换为HTML的解决方案

作为从事网络课程模块开发的技术人员，我们经常需要处理文档（Word、PDF、TXT）的转换与展示问题。为了满足在线学习和学习统计的需求，传统的文档下载方式已经不再适用。因此，我们采用了将文档文件转换为HTML的方案，以便直接在网页上展示与学习。

一、Word转换为HTML

1. 引入依赖

为了实现Word文件到HTML的转换，我们需要引入以下依赖项：

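<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>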

2. 代码实现

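package com.svse.controller;

// Requires POI 3.12 and poi-scratchpad 3.12.
// NOTE: the import list was truncated in the original article; the imports after
// WordToHtmlConverter are reconstructed from the types the class below uses.
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;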

public class TestWordToHtml {    public static  final String STORAGEPATH="C://works//files//";    public static  final String IP="192.168.30.222";    public static  final String PORT="8010";    public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {        TestWordToHtml wt=new TestWordToHtml();        // wt.Word2003ToHtml("甲骨文考证.doc");        wt.Word2007ToHtml("甲骨文考证.docx");    }    /**      * 2003版本word转换为html     * @throws IOException     * @throws TransformerException     * @throws ParserConfigurationException     */    public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {        String imagepath = STORAGEPATH + "fileImage/";         String strRanString = getRandomNum();        String filepath = STORAGEPATH;        String htmlName = fileName.substring(0, fileName.indexOf(".")) + "2003.html";        String file = filepath + fileName;        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(new File(file)));        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());        wordToHtmlConverter.setPicturesManager(new PicturesManager() {            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {                File imgPath = new File(imagepath);                if (!imgPath.exists()) {                    imgPath.mkdirs();                }                File file = new File(imagepath + strRanString + suggestedName);                try {                    OutputStream os = new FileOutputStream(file);                    os.write(content);                    os.close();                } catch (FileNotFoundException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }                
return "http://" + IP + ":" + PORT + "//uploadFile/fileImage/" + strRanString + suggestedName;            }        });        wordToHtmlConverter.processDocument(wordDocument);        Document htmlDocument = wordToHtmlConverter.getDocument();        File htmlFile = new File(filepath + strRanString + htmlName);        OutputStream outStream = new FileOutputStream(htmlFile);        DOMSource domSource = new DOMSource(htmlDocument);        StreamResult streamResult = new StreamResult(outStream);        TransformerFactory factory = TransformerFactory.newInstance();        Transformer serializer = factory.newTransformer();        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");        serializer.setOutputProperty(OutputKeys.INDENT, "yes");        serializer.setOutputProperty(OutputKeys.METHOD, "html");        serializer.transform(domSource, streamResult);        outStream.close();        System.out.println("生成html文件路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);    }    /**      * 2007版本word转换为html     * @throws IOException     */    public void Word2007ToHtml(String fileName) throws IOException {        String strRanString = getRandomNum();        String filepath = STORAGEPATH + strRanString;        String htmlName = fileName.substring(0, fileName.indexOf(",")) + "2007.html";        File f = new File(STORAGEPATH + fileName);        if (!f.exists()) {            System.out.println("Sorry File does not Exists!");        } else {            if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {                try {                    InputStream in = new FileInputStream(f);                    XWPFDocument document = new XWPFDocument(in);                    File imageFolderFile = new File(filepath);                    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));                    options.setExtractor(new FileImageExtractor(imageFolderFile));                    
options.URIResolver(new IURIResolver() {                        public String resolve(String uri) {                            return "http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + "/" + uri;                        }                    });                    options.setIgnoreStylesIfUnused(false);                    options.setFragment(true);                    OutputStream out = new FileOutputStream(new File(filepath + htmlName));                    XHTMLConverter.getInstance().convert(document, out, options);                    System.out.println("html路径: http://" + IP + ":" + PORT + "//uploadFile/" + strRanString + htmlName);                } catch (Exception e) {                    e.printStackTrace();                }            } else {                System.out.println("Enter only MS Office 2007+ files");            }        }    }    public static String getRandomNum() {        Date dt = new Date();        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");        String str = sdf.format(dt);        return str;    }}

二、PDF转换为HTML

1. 引入依赖

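<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>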

2. 代码实现

/**
 * Converts PDF documents to HTML using {@code PDFDomTree} on top of PDFBox.
 */
public class PdfToHtml {

    /**
     * Renders the PDF at {@code inPdfPath} as a UTF-8 HTML file at {@code outputHtmlPath}.
     *
     * <p>Any failure is printed to stderr and not propagated to the caller.
     *
     * @param inPdfPath path of the source PDF
     * @param outputHtmlPath path of the HTML file to create or overwrite
     */
    public void pdfToHtmlTest(String inPdfPath, String outputHtmlPath) {
        // The PDF is loaded first so that an unreadable input does not leave an empty HTML file
        // behind. Both resources are closed on every path, which also flushes the buffered writer.
        try (PDDocument document = PDDocument.load(new File(inPdfPath));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)), "utf-8"))) {
            PDFDomTree pdfDomTree = new PDFDomTree();
            pdfDomTree.writeText(document, out);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: converts a sample PDF next to its source file.
     */
    public static void main(String[] args) throws IOException {
        PdfToHtml ph = new PdfToHtml();
        String pdfPath = "C:/works/files/武研中心行政考勤制度.pdf";
        String outputPath = "C:/works/files/武研中心行政考勤制度.html";
        ph.pdfToHtmlTest(pdfPath, outputPath);
    }
}

三、TXT转换为HTML

代码实现

/**
 * Converts a GBK-encoded plain-text file into a GBK-encoded HTML fragment.
 *
 * <p>Each input line is HTML-escaped and written followed by {@code <br/>}, so the line
 * structure of the text survives in the browser. If {@code filePath} is not an existing regular
 * file a message is printed and nothing is written. Any other failure is printed to
 * stdout/stderr and not propagated.
 *
 * @param filePath path of the source text file (read as GBK)
 * @param htmlPosition path of the HTML file to create or overwrite (written as GBK)
 */
public static void txtToHtml(String filePath, String htmlPosition) {
    try {
        File file = new File(filePath);
        if (!file.isFile()) {
            System.out.println("找不到指定的文件");
            return;
        }
        // try-with-resources closes (and flushes) both streams even if a read or write fails.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(file), "GBK"));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), "GBK"))) {
            String lineTxt;
            while ((lineTxt = reader.readLine()) != null) {
                // readLine() drops the line terminator, so an explicit HTML break is written.
                writer.write("   " + escapeHtml(lineTxt) + "<br/>");
                writer.newLine();
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
}

/** Escapes the characters that would otherwise be interpreted as HTML markup. */
private static String escapeHtml(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}

这份文档详细介绍了如何将不同类型文档（Word、PDF、TXT）转换为HTML的实现方法，适合用于在线教育平台的课程展示和学习统计需求。

转载地址：http://baeyk.baihongyu.com/

你可能感兴趣的文章