python批量提取PDF文件中指定区域的文本

Page content

   Python 批量提取 PDF 文件中指定区域的文本。

   print_formatted_text_blocks 打印 PDF 中所有文本块的坐标;extract_text_from_pdf 提取指定区域的文本;batch_rename_pdfs 根据提取的文本批量重命名 PDF 文件。

import fitz  # PyMuPDF
import os

def print_formatted_text_blocks(pdf_path):
    """Print the bounding box of every non-empty text span in a PDF.

    The output is laid out as a Python list literal named ``regions`` whose
    entries are ``{'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}`` dictionaries,
    so it can be copied and used as the ``regions`` argument of
    ``extract_text_from_pdf``. Each entry is followed by a trailing comment
    giving the 1-based page number and block number it came from.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Exceptions raised by PyMuPDF while opening or reading the file propagate
    to the caller; the document is closed in either case.
    """
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_path) as doc:
        print("\n# PDF文本块坐标列表(可直接复制用于提取)\nregions = [")

        for page_no, page in enumerate(doc, start=1):
            blocks = page.get_text("dict", flags=0)["blocks"]

            for block_no, block in enumerate(blocks, start=1):
                # Non-text blocks (e.g. images) have no "lines" entry in
                # PyMuPDF's dict output; treat them as having no spans.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if not span["text"].strip():
                            continue  # ignore whitespace-only spans
                        x1, y1, x2, y2 = span["bbox"]
                        print(f"    {{'x1': {x1:.1f}, 'y1': {y1:.1f}, "
                              f"'x2': {x2:.1f}, 'y2': {y2:.1f} }},  # 第{page_no}页 区块{block_no}")

        print("]\n")


def extract_text_from_pdf(pdf_path, regions, page_num=0):
    """Extract the text inside each rectangular region of one PDF page.

    Each region is a dict in one of two coordinate formats:

    1. Top-left and bottom-right corners:
       ``{'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}``
    2. Top-left corner plus size:
       ``{'x': ..., 'y': ..., 'width': ..., 'height': ...}``

    Args:
        pdf_path: Path of the PDF file to read.
        regions: Sequence of region dicts in either format above.
        page_num: 0-based index of the page to read; defaults to the first
            page.

    Returns:
        A tuple of stripped strings, one per region and in the same order.
        A region with an unrecognized format (a warning is printed) or with
        no text yields ``''``. If the document has no pages, an error is
        printed and every entry is ``''``.

    Exceptions raised by PyMuPDF (for example for an unreadable file or a
    ``page_num`` outside the document) propagate to the caller; the document
    is closed in either case.
    """
    corner_keys = ('x1', 'y1', 'x2', 'y2')
    size_keys = ('x', 'y', 'width', 'height')

    # The context manager guarantees the document is closed even when loading
    # the page or clipping a region raises.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            print("错误: PDF文件没有页面!")
            return ('',) * len(regions)

        page = doc.load_page(page_num)
        results = []  # extracted strings, in the same order as `regions`

        for region in regions:
            if all(key in region for key in corner_keys):
                # Format 1: both corners are given directly.
                x1, y1 = region['x1'], region['y1']
                x2, y2 = region['x2'], region['y2']
            elif all(key in region for key in size_keys):
                # Format 2: derive the bottom-right corner from the size.
                x1, y1 = region['x'], region['y']
                x2, y2 = x1 + region['width'], y1 + region['height']
            else:
                print(f"警告: 区域坐标无效,跳过: {region}")
                results.append('')  # keep the result aligned with `regions`
                continue

            clip = fitz.Rect(x1, y1, x2, y2)
            # strip() already yields '' when the region holds no text.
            results.append(page.get_text("text", clip=clip).strip())

    return tuple(results)

# Maps every character that is illegal in a Windows file name, plus the ASCII
# control characters (extracted text can contain line breaks), to "_".
_FILENAME_TRANSLATION = str.maketrans(
    {ch: '_' for ch in '/\\:*?"<>|' + ''.join(map(chr, range(32)))}
)

# Default extraction regions, in the order their text appears in the new name.
_DEFAULT_RENAME_REGIONS = (
    {'x1': 106.5, 'y1': 81.1, 'x2': 133.5, 'y2': 90.1},
    {'x1': 496.0, 'y1': 109.1, 'x2': 541.0, 'y2': 118.1},
)


def _sanitize_filename_part(text):
    """Return *text* with characters that are invalid in file names replaced by "_"."""
    return text.translate(_FILENAME_TRANSLATION)


def batch_rename_pdfs(folder_path, regions=None):
    """Rename every PDF in a folder after text extracted from the PDF itself.

    For each ``*.pdf`` file (case-insensitive) directly inside *folder_path*,
    the text of each region is extracted with ``extract_text_from_pdf``,
    sanitized, and joined with ``_`` to form ``<part1>_<part2>....pdf``.
    If every extracted part is empty the file is renamed to
    ``extracted_<original stem>.pdf`` instead. A file is never renamed onto
    an existing path.

    Args:
        folder_path (str): Folder containing the PDF files.
        regions (sequence of dict, optional): Regions handed to
            ``extract_text_from_pdf``. Defaults to the two built-in regions.

    Returns:
        list: One dict per PDF file with the keys ``'original'`` (old file
        name), ``'new'`` (new file name, or ``None`` when extraction failed)
        and ``'status'`` (human-readable outcome).
    """
    if regions is None:
        regions = _DEFAULT_RENAME_REGIONS

    results = []

    for filename in os.listdir(folder_path):
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        if os.path.isdir(pdf_path):
            continue  # a directory whose name happens to end in ".pdf"

        # Best effort per file: one unreadable PDF must not abort the batch.
        try:
            extracted = extract_text_from_pdf(pdf_path, regions)
        except Exception as e:
            results.append({
                'original': filename,
                'new': None,
                'status': f'提取失败: {str(e)}',
            })
            continue

        parts = [_sanitize_filename_part(text) for text in extracted]
        if any(parts):
            new_filename = "_".join(parts) + ".pdf"
        else:
            # Nothing was extracted; avoid producing a bare "_.pdf".
            new_filename = f"extracted_{os.path.splitext(filename)[0]}.pdf"

        new_path = os.path.join(folder_path, new_filename)

        # Never overwrite an existing file (this also covers a file that
        # already carries its target name).
        if os.path.exists(new_path):
            results.append({
                'original': filename,
                'new': new_filename,
                'status': '已存在,未重命名',
            })
            continue

        try:
            os.rename(pdf_path, new_path)
        except OSError as e:
            status = f'重命名失败: {str(e)}'
        else:
            status = '重命名成功'
        results.append({
            'original': filename,
            'new': new_filename,
            'status': status,
        })

    return results


# # 示例用法
# if __name__ == "__main__":
#     # 定义提取区域(按顺序)
#     regions = [
#         {'x1': 106.5, 'y1': 81.1, 'x2': 133.5, 'y2': 90.1},
#         {'x1': 496.0, 'y1': 109.1, 'x2': 541.0, 'y2': 118.1},
#     ]
    
#     pdf_path = "1752567478483.pdf"  # 替换为你的PDF路径
#     extracted = extract_text_from_pdf(pdf_path, regions)
    
#     print("提取结果(元组):")
#     print(extracted)

# Example usage: rename the PDFs in a folder and print a per-file report.
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default folder.
    default_folder = r"C:\Users\Administrator\Downloads"
    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder
    results = batch_rename_pdfs(folder_path)

    print("批量重命名结果:")
    for result in results:
        print(f"原文件: {result['original']}")
        # 'new' is None when text extraction failed for this file.
        if result['new']:
            print(f"新文件: {result['new']}")
        print(f"状态: {result['status']}")
        print("-" * 50)