java

关注公众号 jb51net

关闭
首页 > 软件编程 > java > Java PDF转图片

Java实现PDF转图片的三种方法详解

作者:工贼sk

这篇文章主要为大家详细介绍了Java实现PDF转图片的三种方法,文中的示例代码讲解详细,具有一定的借鉴价值,有需要的小伙伴可以参考一下

使用PDFbox获取pdf文件的内容和图片

	<!-- 依赖 -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.23</version>
    </dependency>

获取传入的pdf的图片

我这里是分页读取的,更灵活

    /**
     * Extracts every image XObject embedded in the PDF, page by page, and writes each one as a
     * PNG file named {@code image_<page>页<counter>.png} into {@code D:\image\}.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read, or an image cannot be decoded or written
     */
    public static void main(String[] args) throws IOException {
        // Running counter over the whole document so that output names never collide.
        int count = 0;
        File file = new File("D:\\Data\\电子图书馆_使用文档.pdf");
        // try-with-resources releases the stream and the document even if a page fails.
        try (FileInputStream fis = new FileInputStream(file);
             PDDocument document = PDDocument.load(fis)) {
            int allPages = document.getNumberOfPages();
            for (int i = 0; i < allPages; i++) {
                PDResources resources = document.getPage(i).getResources();
                if (resources == null) {
                    // A page without a resource dictionary cannot reference any image.
                    continue;
                }
                Iterable<COSName> xObjectNames = resources.getXObjectNames();
                if (xObjectNames == null) {
                    continue;
                }
                for (COSName key : xObjectNames) {
                    if (!resources.isImageXObject(key)) {
                        continue;
                    }
                    PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                    BufferedImage bImage = image.getImage();
                    File target = new File("D:\\image\\"+"image_"+ (i+1) + "页" + count + ".png");
                    // ImageIO.write reports a missing writer through its return value only.
                    if (!ImageIO.write(bImage, "PNG", target)) {
                        throw new IOException("No ImageIO writer available for PNG: " + target);
                    }
                    count++;
                }
            }
        }
    }

获取传入的pdf的文字内容

同样的分页读取

    /**
     * Prints the text of the PDF one page at a time by splitting it into single-page documents
     * and running a {@code PDFTextStripper} over each of them.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read, split, or parsed
     */
    public static void main(String[] args) throws IOException {
        File file = new File("D:\\Data\\Java课件\\xxx.pdf");
        // The source document must stay open while the split parts are in use, so it is the
        // outermost resource and is closed last.
        try (FileInputStream fis = new FileInputStream(file);
             PDDocument doc = PDDocument.load(fis)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            List<PDDocument> split = new Splitter().split(doc);
            try {
                for (int i = 0; i < split.size(); i++) {
                    String text = pdfStripper.getText(split.get(i));
                    System.out.println("第"+(i+1)+"页内容:"+text);
                }
            } finally {
                // Every single-page document returned by the splitter has to be closed.
                for (PDDocument part : split) {
                    part.close();
                }
            }
        }
    }

读取pdf中的图片信息

package org.fzzn.component.ai.guard.duplicatechecking.tool;

import cn.hutool.core.collection.CollUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.fzzn.component.ai.guard.AiGuardManager;
import org.fzzn.component.ai.guard.duplicatechecking.dto.CheckingDoc;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts images embedded in a PDF and uploads each of them through the file handler obtained
 * from {@code AiGuardManager.me().getHandler()}.
 *
 * <p>Only image XObjects whose resource name has the form {@code Image<number>} are processed.
 * The numeric suffix is used as the uploaded file name ({@code <number>.png}) and as the id of
 * the resulting {@link CheckingDoc}.
 *
 * @author inkef
 * @since 2024/11/7 17:50
 */
@Slf4j
public class PdfImageExtractor {

    /** Resource-name prefix an image XObject must carry to be extracted. */
    private static final String IMAGE_NAME_PREFIX = "Image";

    /** Image format every extracted image is re-encoded to before upload. */
    private static final String FILE_TYPE = "png";

    /**
     * Extracts and uploads all {@code Image<number>} images of the PDF.
     *
     * @param pdf raw bytes of the PDF document
     * @return one {@link CheckingDoc} (id + upload URL) per uploaded image
     * @throws IOException if the PDF cannot be parsed or an image cannot be decoded
     */
    public List<CheckingDoc> extractAndUploadImages(byte[] pdf) throws IOException {
        return extractAndUploadImages(pdf, null);
    }

    /**
     * Encodes a {@link BufferedImage} into a byte array.
     *
     * @param bufferedImage image to encode
     * @param formatName    image format, e.g. {@code "png"} or {@code "jpg"}
     * @return the encoded bytes, or {@code null} if encoding failed (the error is logged)
     */
    private static byte[] bufferedImageToByteArray(BufferedImage bufferedImage, String formatName) {
        // ByteArrayOutputStream holds no system resource, so it needs no explicit close.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            ImageIO.write(bufferedImage, formatName, baos);
            return baos.toByteArray();
        } catch (IOException e) {
            log.error("bufferedImageToByteArray error", e);
            return null;
        }
    }

    /**
     * Parses the numeric suffix of an {@code Image<number>} resource name.
     *
     * @param resourceName XObject resource name
     * @return the number, or {@code null} if the name does not match {@code Image<number>}
     */
    private static Integer parseImageNumber(String resourceName) {
        if (!resourceName.startsWith(IMAGE_NAME_PREFIX)) {
            return null;
        }
        try {
            return Integer.valueOf(resourceName.substring(IMAGE_NAME_PREFIX.length()));
        } catch (NumberFormatException e) {
            // e.g. "ImageMask": not one of the numbered images, so it is skipped instead of
            // aborting the extraction of the whole document.
            log.warn("skip image xobject with non-numeric name: {}", resourceName);
            return null;
        }
    }

    /**
     * Extracts and uploads the images of the PDF whose number is contained in {@code xrefs}.
     *
     * @param pdf   raw bytes of the PDF document
     * @param xrefs image numbers to extract; {@code null} or empty means "extract all"
     * @return one {@link CheckingDoc} (id + upload URL) per uploaded image
     * @throws IOException if the PDF cannot be parsed or an image cannot be decoded
     */
    public List<CheckingDoc> extractAndUploadImages(byte[] pdf, List<Integer> xrefs) throws IOException {
        // No filter given -> extract everything.
        boolean isExtractAll = CollUtil.isEmpty(xrefs);
        // Local result list: keeps calls independent of each other and of the instance.
        List<CheckingDoc> imgList = new ArrayList<>();

        // try-with-resources guarantees the document is released on every path.
        try (PDDocument document = PDDocument.load(pdf)) {
            PDPageTree pages = document.getDocumentCatalog().getPages();
            for (PDPage page : pages) {
                PDResources resources = page.getResources();
                if (resources == null) {
                    // Page without a resource dictionary: nothing to extract.
                    continue;
                }
                for (COSName name : resources.getXObjectNames()) {
                    PDXObject xobject = resources.getXObject(name);
                    if (!(xobject instanceof PDImageXObject)) {
                        continue;
                    }
                    Integer number = parseImageNumber(name.getName());
                    if (number == null) {
                        continue;
                    }
                    // Skip images that were not requested.
                    if (!isExtractAll && !xrefs.contains(number)) {
                        continue;
                    }

                    PDImageXObject image = (PDImageXObject) xobject;
                    String fileName = number + "." + FILE_TYPE;
                    byte[] imageBytes = bufferedImageToByteArray(image.getImage(), FILE_TYPE);
                    if (imageBytes == null) {
                        // Encoding failed and was already logged; do not upload a null payload.
                        continue;
                    }
                    String url = AiGuardManager.me().getHandler().handleUploadFile(imageBytes, fileName);

                    CheckingDoc doc = new CheckingDoc();
                    doc.setId(number.longValue());
                    doc.setUrl(url);
                    imgList.add(doc);
                }
            }
        }

        return imgList;
    }
}

Java实现PDF转图片的三种方法

提示:生成图片以后需要将文件流关闭,不然删除文件时会删除失败。很多人不知道怎么将 pdf 文件转换成图片格式,而且网上有很多例子是跑不通的;同时也是为了方便自己在遇到该需求时能够快速地写出来,所以整理了几种将 pdf 转换成图片的工具类。

一、使用开源库Apache PDFBox将PDF转换为图片

1、引入依赖库

<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>fontbox</artifactId>
			<version>2.0.9</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.9</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.2</version>
		</dependency>

2、实现pdf转换图片工具类(多页pdf会生成多页的图片,后缀会生成图片的位置序号)

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class Pdf2Png {
    /**
     * Renders every page of a PDF to its own image file using PDFBox.
     *
     * <p>Page {@code n} (1-based) is written to {@code <filename>_<n>.<type>} in the same
     * directory as the PDF.
     *
     * @param fileAddress directory containing the PDF, e.g. {@code C:\\Users\\user\\Desktop\\test}
     * @param filename    PDF file name without the {@code .pdf} extension
     * @param type        image format, {@code png} or {@code jpg}
     */
    public static void pdf2png(String fileAddress, String filename, String type) {
        long startTime = System.currentTimeMillis();
        // File(parent, child) inserts the platform separator, so this also works off Windows.
        File file = new File(fileAddress, filename + ".pdf");
        // try-with-resources closes the document; otherwise the PDF stays locked on Windows.
        try (PDDocument doc = PDDocument.load(file)) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            for (int i = 0; i < pageCount; i++) {
                // 144 dpi; higher values are sharper but slower to render.
                BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI
                File target = new File(fileAddress, filename + "_" + (i + 1) + "." + type);
                // ImageIO.write returns false (without throwing) if no writer supports the format.
                if (!ImageIO.write(image, type, target)) {
                    throw new IOException("No ImageIO writer for format: " + type);
                }
            }
            long endTime = System.currentTimeMillis();
            System.out.println("共耗时:" + ((endTime - startTime) / 1000.0) + "秒");  // elapsed time
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        pdf2png("C:\\Users\\user\\Desktop\\test", "测试", "png");
    }
}

使用Apache PDFBox将PDF转换为图片成功

3、按照固定页数来将pdf转换成图片的工具类(自由选择pdf转换图片的页数)

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class Pdf2Png {
    /**
     * Renders a range of pages of a PDF to image files using PDFBox.
     *
     * <p>Pages with 0-based index in {@code [indexOfStart, indexOfEnd)} are rendered; page index
     * {@code i} is written to {@code <filename>_<i+1>.<type>} next to the PDF.
     *
     * @param fileAddress  directory containing the PDF, e.g. {@code C:\\Users\\user\\Desktop\\test}
     * @param filename     PDF file name without the {@code .pdf} extension
     * @param indexOfStart first page index to convert, starting at 0
     * @param indexOfEnd   page index at which to stop (exclusive); a negative value such as
     *                     {@code -1}, or a value beyond the page count, means "up to the last page"
     * @param type         image format, {@code png} or {@code jpg}
     */
    public static void pdf2png(String fileAddress,String filename,int indexOfStart,int indexOfEnd,String type) {
        long startTime = System.currentTimeMillis();
        // File(parent, child) inserts the platform separator, so this also works off Windows.
        File file = new File(fileAddress, filename + ".pdf");
        // try-with-resources closes the document; otherwise the PDF stays locked on Windows.
        try (PDDocument doc = PDDocument.load(file)) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            // Clamp the requested range to the pages that actually exist.
            int start = Math.max(indexOfStart, 0);
            int end = (indexOfEnd < 0 || indexOfEnd > pageCount) ? pageCount : indexOfEnd;
            for (int i = start; i < end; i++) {
                // 144 dpi; higher values are sharper but slower to render.
                BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI
                File target = new File(fileAddress, filename + "_" + (i + 1) + "." + type);
                // ImageIO.write returns false (without throwing) if no writer supports the format.
                if (!ImageIO.write(image, type, target)) {
                    throw new IOException("No ImageIO writer for format: " + type);
                }
            }
            long endTime = System.currentTimeMillis();
            System.out.println("共耗时:" + ((endTime - startTime) / 1000.0) + "秒"); // elapsed time
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        pdf2png("C:\\Users\\user\\Desktop\\test", "思泰得流式检测报告-00420299-任蛆小-RA202302100117",2,3, "png");
    }
}

自由页数转换成功

二、使用PDF Box将多页的pdf按页转换成多张图片的方法

1、引入PDF Box需要的依赖

<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>

2、编写将多页PDF转换多张图片的工具类

import com.lowagie.text.pdf.PdfReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
public class Pdf2Png {
    /**
     * Renders every page of a PDF to a PNG file.
     *
     * <p>Page {@code n} (1-based) is written to {@code <pdf name without extension>_<n>.png} in
     * the directory of the PDF.
     *
     * @param pdfFilePath full path of the PDF, e.g. {@code C:\\Users\\user\\Desktop\\test\\1234.pdf}
     * @param dpi         rendering resolution; higher values are sharper but slower
     */
    public static void pdf2Image(String pdfFilePath, int dpi) {
        long startTime = System.currentTimeMillis();
        File file = new File(pdfFilePath);
        // May be null for a bare file name; File(null, child) then resolves against the
        // working directory instead of producing a literal "null" path segment.
        File outputDir = file.getParentFile();
        String pdfName = file.getName();
        int dot = pdfName.lastIndexOf('.');
        // Base name for the images; tolerate a PDF path that has no extension.
        String imagePdfName = dot >= 0 ? pdfName.substring(0, dot) : pdfName;
        // try-with-resources closes the document; otherwise the PDF stays locked on Windows.
        try (PDDocument pdDocument = PDDocument.load(file)) {
            PDFRenderer renderer = new PDFRenderer(pdDocument);
            // The already-open document knows its page count; no second reader is needed.
            int pages = pdDocument.getNumberOfPages();
            for (int i = 0; i < pages; i++) {
                File dstFile = new File(outputDir, imagePdfName + "_" + (i + 1) + ".png");
                BufferedImage image = renderer.renderImageWithDPI(i, dpi);
                ImageIO.write(image, "png", dstFile);
            }
            long endTime = System.currentTimeMillis();
            System.out.println("共耗时:" + ((endTime - startTime) / 1000.0) + "秒");  // elapsed time
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

三、使用文件流整个pdf转换成图片 (生成图片,并将生成的图片路径返回)

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
public class Pdf2Png {
    /**
     * Renders every page of a PDF (read through a file stream) to an image file and reports
     * where each image was written.
     *
     * <p>Conversion is best-effort: if a page fails, the error is printed and the entries of the
     * pages converted so far are returned.
     *
     * @param fileAddress directory containing the PDF, e.g. {@code C:\\Users\\user\\Desktop\\test}
     * @param filename    PDF file name without the {@code .pdf} extension
     * @param type        image format, {@code png} or {@code jpg}
     * @return one map per generated image with the keys {@code fileName} and {@code filePath}
     */
    public static List<Map<String, String>> pdfToImage(String fileAddress, String filename, String type) {
        long startTime = System.currentTimeMillis();
        List<Map<String, String>> list = new ArrayList<>();
        // File(parent, child) inserts the platform separator, so this also works off Windows.
        File pdfFile = new File(fileAddress, filename + ".pdf");
        int dpi = 296;
        // Both the stream and the document must be closed, otherwise the PDF cannot be deleted
        // afterwards ("file in use"). try-with-resources also copes with a failed load, where
        // an explicit close() in finally would hit a null document.
        try (FileInputStream inputStream = new FileInputStream(pdfFile);
             PDDocument pdDocument = PDDocument.load(inputStream)) {
            PDFRenderer renderer = new PDFRenderer(pdDocument);
            int pageCount = pdDocument.getNumberOfPages();
            for (int i = 0; i < pageCount; i++) {
                String fileName = filename + "_" + (i + 1) + "." + type;
                File imgFile = new File(fileAddress, fileName);
                // Higher dpi is sharper but slower to render.
                BufferedImage image = renderer.renderImageWithDPI(i, dpi);
                ImageIO.write(image, type, imgFile);
                Map<String, String> resultMap = new HashMap<>();
                resultMap.put("fileName", fileName);
                resultMap.put("filePath", imgFile.getPath()); // image path
                list.add(resultMap);
            }
            long endTime = System.currentTimeMillis();
            System.out.println("共耗时:" + ((endTime - startTime) / 1000.0) + "秒");  // elapsed time
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return list;
    }

    public static void main(String[] args) throws FileNotFoundException {
        pdfToImage("C:\\Users\\user\\Desktop\\test", "测试", "png");
    }
}

pdf转图片

Apache PDFBox 库是一个开源、用于操作 PDF 文档的 Java 工具库。PDFBox 允许创建新的 PDF 文档、操作现有文档,以及从文档中提取内容。获取 Apache PDFBox 下载地址,目前版本 2.0.25,核心 jar 是 pdfbox-2.0.25.jar,其他还有几个 jar 可以根据需要进行导入。

pdf合并

package pdfUtity;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import java.io.File;
import java.io.IOException;
import java.util.Scanner;
//import org.apache.pdfbox.util.PDFMergerUtility;
public class pdfMerge {
    /**
     * Lists the PDF files located directly inside a folder.
     *
     * @param fileAddress path of the folder to scan
     * @return names (not full paths) of the regular files ending in {@code .pdf}, sorted by name
     * @throws IOException if the path is not a directory or its content cannot be listed
     */
    private static String[] getPdfs(String fileAddress) throws IOException {
        File folder = new File(fileAddress);
        if (!folder.isDirectory()) {
            throw new IOException("输入的路径有问题");
        }
        String[] names = folder.list();
        if (names == null) {
            // list() returns null on an I/O error even for an existing directory.
            throw new IOException("输入的路径有问题");
        }
        java.util.List<String> pdfs = new java.util.ArrayList<String>();
        for (String name : names) {
            // Only real PDF files may be handed to the merger; skip sub-folders and other files.
            boolean isPdf = name.toLowerCase(java.util.Locale.ROOT).endsWith(".pdf");
            if (isPdf && new File(folder, name).isFile()) {
                pdfs.add(name);
            }
        }
        // list() gives no ordering guarantee; sort so the merge order is predictable.
        java.util.Collections.sort(pdfs);
        return pdfs.toArray(new String[0]);
    }

    /**
     * Asks for a folder on standard input and merges all PDFs in it into
     * {@code javaweb2020.pdf} (relative to the working directory).
     *
     * @param args unused
     * @throws Exception if the folder is invalid or merging fails
     */
    public static void main(String[] args) throws Exception {
        String fileAddress;
        System.out.println("请输入要合并的PDF文件所在的文件夹路径");
        try (Scanner in = new Scanner(System.in)) {
            fileAddress = in.nextLine();
        }
        System.out.println("你输入的路径是:" + fileAddress);
        String destinationFileName = "javaweb2020.pdf";
        String[] pdfs = getPdfs(fileAddress);
        if (pdfs.length == 0) {
            System.out.println("该文件夹下没有找到PDF文件");
            return;
        }
        PDFMergerUtility mergePdf = new PDFMergerUtility();
        for (String pdf : pdfs) {
            mergePdf.addSource(fileAddress + File.separator + pdf);
        }
        mergePdf.setDestinationFileName(destinationFileName);
        System.out.println("合并比较费时间,请等待个几分钟吧!");
        mergePdf.mergeDocuments();
        System.out.print("合并完成");
    }
}

pdf拆分为单页

package pdfUtity;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Iterator;
public class pdfSplitter {
    /**
     * Splits {@code 1.pdf} into single-page PDFs and saves them as {@code 1.pdf}, {@code 2.pdf},
     * ... in the output folder.
     *
     * @param args unused
     * @throws IOException if the source cannot be read or a page cannot be saved
     */
    public static void main(String[] args)throws IOException {
        // Load the PDF that is to be split.
        File file = new File("d://desktopfile//pdfs//1.pdf");
        // The source must stay open while its split parts are used, so it is closed last.
        try (PDDocument document = PDDocument.load(file)) {
            // Each element of the list is a document holding exactly one page of the source.
            List<PDDocument> pages = new Splitter().split(document);
            try {
                int i = 1;
                for (PDDocument pd : pages) {
                    pd.save("d://desktopfile//pdfPhotos//"+i++ +".pdf");
                }
            } finally {
                // Split documents are independent PDDocument instances and need closing too.
                for (PDDocument pd : pages) {
                    pd.close();
                }
            }
            System.out.println("pdf拆分成功");
        }
    }
}

pdf转换为图片

package pdfUtity;/*
 * 读取 pdf,将其中的某一页另存为 png 图片
 */
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
public class PDFSavePNG
{
    /**
     * Renders every page of {@code 1.pdf} at 300 dpi and stores it as {@code <pageIndex>.png}
     * (0-based) in the output folder.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Resolution used both for rendering and for the dpi metadata of the written image.
        int dpi = 300;
        // try-with-resources closes the source document even when a page fails to render.
        try (PDDocument pdfDocument = PDDocument.load(new File("d://desktopfile//pdfs//1.pdf")))
        {
            PDFRenderer pdfRenderer = new PDFRenderer(pdfDocument);
            for (int pageNumber = 0; pageNumber < pdfDocument.getNumberOfPages(); pageNumber++)
            {
                BufferedImage buffImage = pdfRenderer.renderImageWithDPI(pageNumber, dpi, ImageType.RGB);
                String target = "d://desktopfile//pdfPhotos//"+pageNumber+".png";
                // writeImage signals "no suitable writer" through its return value only.
                if (!ImageIOUtil.writeImage(buffImage, target, dpi))
                {
                    throw new IOException("Could not write image: " + target);
                }
            }
        }
        catch (IOException e)
        {
            // Also covers InvalidPasswordException, which is a subclass of IOException.
            e.printStackTrace();
        }
    }
}

图片转换为pdf

package pdfUtity;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
/**
 * @author: Serendipity
 * Date: 2022/3/16 22:40
 * Description:
 */
public class photoToPDF {
    /**
     * File extensions (lower case) accepted as images when building the PDF.
     */
    private static final List<String> IMAGE_SUFFIX = Arrays.asList("jpg", "png", "jpeg");

    /**
     * Combines the images of a folder into one PDF, one page per image, each page sized to its
     * image. Files are processed in path order; sub-folders, files without an accepted
     * extension, and files ImageIO cannot decode are skipped.
     *
     * @param imgFolder folder containing the images, e.g. {@code "D:\\image\\"}
     * @param target    path of the PDF to create, e.g. {@code "D:\\image\\merge.pdf"}
     * @throws IOException if the folder cannot be listed, or an image or the PDF cannot be
     *                     read or written
     */
    public static void manyImageToOnePdf(String imgFolder, String target) throws IOException {
        // List the folder once: listFiles() hits the file system on every call and returns
        // null when the path is not a readable directory.
        File[] files = new File(imgFolder).listFiles();
        if (files == null) {
            throw new IOException("Not a readable directory: " + imgFolder);
        }
        // listFiles() guarantees no order; sort so the page order is reproducible.
        Arrays.sort(files);

        try (PDDocument doc = new PDDocument()) {
            for (File imageFile : files) {
                if (!imageFile.isFile()) {
                    continue;
                }
                String fileName = imageFile.getName();
                int index = fileName.lastIndexOf('.');
                if (index == -1) {
                    continue;
                }
                // Compare the extension case-insensitively so "JPG" or "Png" is accepted too.
                String suffix = fileName.substring(index + 1).toLowerCase(java.util.Locale.ROOT);
                if (!IMAGE_SUFFIX.contains(suffix)) {
                    continue;
                }
                BufferedImage bufferedImage = ImageIO.read(imageFile);
                if (bufferedImage == null) {
                    // ImageIO.read returns null when no reader can decode the file.
                    continue;
                }
                PDImageXObject pdImage = LosslessFactory.createFromImage(doc, bufferedImage);
                float w = pdImage.getWidth();
                float h = pdImage.getHeight();
                PDPage page = new PDPage(new PDRectangle(w, h));
                // The content stream must be closed before the document is saved.
                try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
                    contents.drawImage(pdImage, 0, 0, w, h);
                }
                System.out.println("Image inserted");
                doc.addPage(page);
            }
            // Write the PDF once, after all pages have been added.
            doc.save(target);
        }
    }

    public static void main(String[] args) {
        try {
            manyImageToOnePdf("d://图片","photo.pdf");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

以上就是Java实现PDF转图片的三种方法详解的详细内容,更多关于Java PDF转图片的资料请关注脚本之家其它相关文章!

您可能感兴趣的文章:
阅读全文