首页 > 脚本专栏 > python > Python提取Excel嵌入图片

Python实现提取Excel嵌入图片并重命名

2025-04-10 11:08:37 作者：一晌小贪欢

我们在日常办公的时候经常需要将Excel中嵌入单元格的图片进行提取,并在提取的时候将其中的某一列作为提取出图片的命名,本文将使用Python实现这一功能,需要的可以了解下

1. 背景介绍

我们在日常办公的时候经常需要将Excel中嵌入单元格的图片进行提取，并在提取的时候将其中的某一列的值作为提取出图片的文件名，然后将图片存放好！

为此我们可以利用Python将图片进行提取然后进行保存！

2. 库的安装

库	用途	安装
xmltodict	读取xml文件	`pip install xmltodict -i https://pypi.tuna.tsinghua.edu.cn/simple/`
pandas	Excel读写	`pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple/`
os	获取路径	内置库无需安装
json	读写json文件（本文完整代码中未直接导入使用）	内置库无需安装
re	正则表达式	内置库无需安装
shutil	文件操作	内置库无需安装
tempfile	创建临时文件和目录	内置库无需安装

3. 主要类与方法

ExcelImageProcessor 类

初始化 (__init__):接收Excel文件路径、用于重命名的依据列名（id_column）、包含图片的列名（image_column）。
创建临时工作空间 (create_temp_workspace):创建临时目录，将Excel文件复制到临时目录，并将其作为.zip压缩包解压。
清理临时工作空间 (cleanup_temp_workspace):删除临时目录及其内容。
提取图片ID (extract_image_id):从字符串中提取符合特定正则表达式的图片ID。
获取单元格与图片ID映射 (get_cell_image_mapping):解析sheet.xml文件，获取单元格位置与图片ID的映射。
获取图片ID与rId映射 (get_image_rId_mapping):解析cellimages.xml文件，获取图片ID与rId的映射。
获取rId与目标文件映射 (get_cellimages_rels):解析cellimages.xml.rels文件，获取rId与目标文件路径的映射。
处理Excel文件 (process_excel):调用上述方法，构建单元格到图片文件路径的直接映射。
复制并重命名图片 (copy_and_rename_images):根据映射关系，将图片复制到输出目录并重命名。
主流程 (process):调用上述方法完成整个处理流程。

main 函数

提供使用示例，设置默认参数（如“订单号”和“图片”列），调用ExcelImageProcessor类的process方法执行图片提取与重命名。

关键点

临时目录管理: 使用tempfile.mkdtemp()创建临时目录，确保操作完成后通过shutil.rmtree()清理。
XML解析: 使用xmltodict库解析Excel内部的XML文件，提取所需信息。
图片路径修正: 确保图片路径以media/开头，并正确拼接完整路径。
异常处理: 在主流程中捕获异常、打印错误信息后重新抛出，并在finally中清理临时目录，确保程序健壮性。

输出结果

提取的图片保存在图片输出目录下，文件名基于指定列（如“订单号”）重命名。

4. 完整代码

# -*- coding: UTF-8 -*-
'''
@Project ：45-Excel嵌入图片提取 
@File    ：通用版本.py
@IDE     ：PyCharm 
@Author  ：一晌小贪欢（278865463@qq.com）
@Date    ：2025/3/18 20:49 
'''
import pandas as pd
import os
import re
import xmltodict
import shutil
import tempfile


class ExcelImageProcessor:
    """Extract cell-embedded (DISPIMG) images from a workbook and rename them.

    The workbook is unpacked as a zip archive into a temporary directory, the
    chain ``cell -> image ID -> relationship ID -> media file`` is resolved
    from the XML parts inside the archive, and every image is copied into
    ``output_dir`` under a name taken from the ``id_column`` value of its row.
    """

    def __init__(self, excel_path, id_column, image_column):
        """
        Initialise the processor.

        Args:
            excel_path: Path of the Excel file to process.
            id_column: Name of the column whose value names each image.
            image_column: Name of the column that holds the images.
        """
        self.excel_path = excel_path
        self.id_column = id_column
        self.image_column = image_column
        self.temp_dir = None
        self.extract_dir = None
        self.output_dir = "图片输出"

    @staticmethod
    def _as_list(node):
        """Return *node* as a list.

        xmltodict yields a plain dict (not a one-element list) when an element
        occurs only once, and ``None`` for an empty element; normalising here
        lets callers always iterate over elements.
        """
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    @staticmethod
    def _column_letter(index):
        """Convert a 0-based column index to Excel letters (0 -> A, 26 -> AA)."""
        letters = ""
        index += 1
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            letters = chr(65 + remainder) + letters
        return letters

    @staticmethod
    def _parse_xml(xml_path):
        """Read a UTF-8 XML file and return its xmltodict representation."""
        with open(xml_path, 'r', encoding='utf-8') as file:
            return xmltodict.parse(file.read())

    def create_temp_workspace(self):
        """Create a temporary workspace and unpack the workbook into it."""
        # Drop any workspace left from a previous call so it is not leaked.
        self.cleanup_temp_workspace()
        self.temp_dir = tempfile.mkdtemp()

        self.extract_dir = os.path.join(self.temp_dir, 'extracted')
        os.makedirs(self.extract_dir)

        # One copy is enough: store the workbook as a .zip and unpack that.
        zip_path = os.path.join(self.temp_dir, 'temp.zip')
        shutil.copy2(self.excel_path, zip_path)
        shutil.unpack_archive(zip_path, self.extract_dir, 'zip')

    def cleanup_temp_workspace(self):
        """Delete the temporary workspace, if one exists."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
        # Forget the paths so stale directories are never reused.
        self.temp_dir = None
        self.extract_dir = None

    def extract_image_id(self, text):
        """Return the first ``ID_<hex>`` token found in *text*, else ``None``."""
        if not isinstance(text, str):
            return None
        match = re.search(r'ID_[A-F0-9]+', text)
        return match.group() if match else None

    def get_cell_image_mapping(self, sheet_xml_path):
        """
        Map cell references to image IDs using a worksheet XML file.

        Args:
            sheet_xml_path: Path of the sheet XML file.
        Returns:
            dict: ``{cell reference: image ID}`` for every cell whose value
            contains ``DISPIMG``; empty when the file does not exist.
        """
        cell_image_dict = {}
        if not os.path.exists(sheet_xml_path):
            return cell_image_dict

        sheet_dict = self._parse_xml(sheet_xml_path)
        # An empty <sheetData/> parses to None.
        sheet_data = sheet_dict['worksheet'].get('sheetData') or {}

        for row in self._as_list(sheet_data.get('row')):
            if not isinstance(row, dict):
                continue
            for cell in self._as_list(row.get('c')):
                if not isinstance(cell, dict):
                    continue
                value = cell.get('v')
                if isinstance(value, str) and 'DISPIMG' in value:
                    image_id = self.extract_image_id(value)
                    if image_id:
                        cell_image_dict[cell['@r']] = image_id

        return cell_image_dict

    def get_image_rId_mapping(self, cellimages_xml):
        """
        Map image IDs to relationship IDs using cellimages.xml.

        Args:
            cellimages_xml: Path of the cellimages.xml file.
        Returns:
            dict: ``{image ID: rId}``; empty when the file does not exist.
        """
        image_rId = {}
        if not os.path.exists(cellimages_xml):
            return image_rId

        root = self._parse_xml(cellimages_xml)['etc:cellImages'] or {}
        for image in self._as_list(root.get('etc:cellImage')):
            pic = image['xdr:pic']
            image_id = pic['xdr:nvPicPr']['xdr:cNvPr']['@name']
            image_rId[image_id] = pic['xdr:blipFill']['a:blip']['@r:embed']

        return image_rId

    def get_cellimages_rels(self, cellimages_rels_xml):
        """
        Map relationship IDs to target files using cellimages.xml.rels.

        Args:
            cellimages_rels_xml: Path of the cellimages.xml.rels file.
        Returns:
            dict: ``{rId: target path}``; empty when the file does not exist.
        """
        rels_dict = {}
        if not os.path.exists(cellimages_rels_xml):
            return rels_dict

        root = self._parse_xml(cellimages_rels_xml)['Relationships'] or {}
        for rel in self._as_list(root.get('Relationship')):
            target = rel['@Target']
            # Make sure the path starts with media/.
            if not target.startswith('media/'):
                target = f"media/{target}"
            rels_dict[rel['@Id']] = target

        return rels_dict

    def process_excel(self):
        """
        Unpack the workbook and resolve each image cell to its media file.

        Returns:
            tuple: ``(df, cell_to_file)`` where ``df`` is the result of
            ``pd.read_excel(self.excel_path)`` and ``cell_to_file`` maps cell
            references to image paths relative to the ``xl`` directory.
        """
        self.create_temp_workspace()

        # NOTE(review): only sheet1.xml is inspected; this presumably matches
        # the sheet pandas reads by default - confirm for multi-sheet files.
        sheet_xml = os.path.join(self.extract_dir, "xl/worksheets/sheet1.xml")
        cellimages_xml = os.path.join(self.extract_dir, "xl/cellimages.xml")
        rels_xml = os.path.join(self.extract_dir, "xl/_rels/cellimages.xml.rels")

        cell_to_id = self.get_cell_image_mapping(sheet_xml)
        id_to_rid = self.get_image_rId_mapping(cellimages_xml)
        rid_to_file = self.get_cellimages_rels(rels_xml)

        # Collapse the three mappings into cell -> file, skipping (with a
        # warning) any cell whose image cannot be resolved.
        cell_to_file = {}
        for cell, image_id in cell_to_id.items():
            file_path = rid_to_file.get(id_to_rid.get(image_id))
            if file_path is None:
                print(f"警告: 单元格 {cell} 引用的图片 {image_id} 没有对应的文件，已跳过")
                continue
            cell_to_file[cell] = file_path

        df = pd.read_excel(self.excel_path)

        return df, cell_to_file

    def copy_and_rename_images(self, df, cell_to_file):
        """
        Copy every mapped image to the output directory under its new name.

        Args:
            df: DataFrame holding the sheet data.
            cell_to_file: Mapping of cell reference to image path.
        Raises:
            KeyError: If ``id_column`` or ``image_column`` is not in ``df``.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        columns = list(df.columns)
        for required in (self.id_column, self.image_column):
            if required not in columns:
                raise KeyError(f"未找到列: {required}")

        # Works beyond column Z as well (AA, AB, ...).
        column_letter = self._column_letter(columns.index(self.image_column))

        for index, row in df.iterrows():
            # +2: one for the header row, one because Excel rows are 1-based.
            cell_ref = f"{column_letter}{index + 2}"
            if cell_ref not in cell_to_file:
                continue

            image_rel_path = cell_to_file[cell_ref]
            # Image paths are relative to the xl directory of the archive.
            image_path = os.path.join(self.extract_dir, "xl", image_rel_path)

            _, ext = os.path.splitext(image_rel_path)
            new_path = os.path.join(self.output_dir, f"{row[self.id_column]}{ext}")

            if os.path.exists(image_path):
                shutil.copy2(image_path, new_path)
            else:
                print(f"警告: 找不到图片文件 {image_path}")

    def process(self):
        """Run the whole pipeline; the temp workspace is always cleaned up.

        Raises:
            Exception: Any error from the pipeline is printed and re-raised.
        """
        try:
            df, cell_to_file = self.process_excel()
            self.copy_and_rename_images(df, cell_to_file)

            print(f"处理完成！图片已保存到 {self.output_dir} 目录")

        except Exception as e:
            print(f"处理过程中出现错误: {str(e)}")
            raise
        finally:
            # Clean up only after every step has finished with the files.
            self.cleanup_temp_workspace()


def main():
    """Run the extractor on the first workbook found in ``./数据源/``.

    Pass three things to the processor: the Excel path, the naming column and
    the image column. Images are exported to the "图片输出" folder, each named
    after the naming-column value of its row.

    Raises:
        FileNotFoundError: If ``./数据源/`` is missing or holds no workbook.
    """
    source_dir = './数据源/'
    os.makedirs('./图片输出/', exist_ok=True)

    # Sort for a deterministic pick and ignore non-workbooks as well as the
    # "~$" lock files Office creates while a workbook is open.
    candidates = sorted(
        name for name in os.listdir(source_dir)
        if name.lower().endswith(('.xlsx', '.xlsm')) and not name.startswith('~$')
    )
    if not candidates:
        raise FileNotFoundError(f"在 {source_dir} 中未找到 Excel 文件")

    excel_path = source_dir + candidates[0]
    processor = ExcelImageProcessor(
        excel_path=excel_path,
        id_column="订单号",
        image_column="图片"
    )
    processor.process()


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

5. 注意事项

请各位友友按照自己的情况修改一下参数

效果如下

到此这篇关于Python实现提取Excel嵌入图片并重命名的文章就介绍到这了。更多相关Python提取Excel嵌入图片的内容，请搜索脚本之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持脚本之家！