Python查找大文件的实用脚本分享

2024-11-04 10:57:07 作者：zhongcx01

这篇文章主要为大家分享一个利用Python查找大文件的实用脚本，文中的示例代码讲解详细，感兴趣的小伙伴可以跟随小编一起学习一下。

C盘满了，于是写了一个Python脚本，2分多钟就能找出比较大的文件，然后手动删除或者迁移到D盘。最后发现是微信小程序开发工具的缓存文件太多了，腾出来10个G，念头通达了，这里备份一下脚本。

运行工具：PyCharm 2024.1.3 (Community Edition)

完整代码

import os
import threading
import time
import sys
from threading import Event
 
def is_large(file_path, threshold_mb):
    """Return True when the file is strictly larger than ``threshold_mb`` MiB.

    The size is read with ``os.path.getsize`` and converted from bytes to
    MiB (1024 * 1024 bytes) before the comparison.
    """
    bytes_per_mb = 1024 * 1024
    size_in_mb = os.path.getsize(file_path) / bytes_per_mb
    return size_in_mb > threshold_mb
 
def show_loading_animation(stop_event, interval=0.5):
    """Show a simple text spinner until ``stop_event`` is set.

    Args:
        stop_event: ``threading.Event`` (or compatible object) that the caller
            sets when the animation should end.
        interval: Seconds each frame stays on screen.

    The function returns as soon as the event is set, so a caller that
    ``join()``s the animation thread is not held up by a fixed-length loop.
    """
    loading_chars = ['.', '..', '...', '....']
    # Pad every frame to the widest one so a shorter frame erases the
    # leftover dots of the previous, longer frame.
    frame_width = len(loading_chars[-1])
    # Guard against a non-positive interval turning the loop into a busy spin.
    delay = interval if interval > 0 else 0.1

    frame_index = 0
    while not stop_event.is_set():
        frame = loading_chars[frame_index % len(loading_chars)]
        sys.stdout.write('\r正在查找大文件... ' + frame.ljust(frame_width))
        sys.stdout.flush()
        frame_index += 1
        # wait() doubles as the sleep and wakes up early when the event is set.
        stop_event.wait(delay)
 
def filter_files(files, skip_file_keywords, include_file_keywords, extension=None):
    """Filter file names by skip/include keywords and an optional extension.

    Args:
        files: Iterable of file names (not full paths).
        skip_file_keywords: Names containing any of these substrings are dropped.
        include_file_keywords: When non-empty, only names containing at least
            one of these substrings are kept.
        extension: Optional extension without the leading dot; when given,
            only names ending in ``'.' + extension`` are kept.

    Returns:
        A new list with the names that pass every filter, in input order.
    """
    # ''.split(',') yields [''], and '' is a substring of every name, so an
    # empty keyword must be ignored or a blank skip list would drop everything.
    skip_keywords = [kw for kw in (skip_file_keywords or ()) if kw]
    include_keywords = [kw for kw in (include_file_keywords or ()) if kw]
    suffix = None if extension is None else '.' + extension

    kept = []
    for name in files:
        if any(kw in name for kw in skip_keywords):
            continue
        if include_keywords and not any(kw in name for kw in include_keywords):
            continue
        if suffix is not None and not name.endswith(suffix):
            continue
        kept.append(name)
    return kept
 
def filter_dirs(dirs, skip_dir_keywords, include_dir_keywords):
    """Filter directory names by skip/include keywords.

    Args:
        dirs: Iterable of directory names (not full paths).
        skip_dir_keywords: Names containing any of these substrings are dropped.
        include_dir_keywords: When non-empty, only names containing at least
            one of these substrings are kept.

    Returns:
        A new list with the names that pass both filters, in input order.
    """
    # ''.split(',') yields [''], and '' is a substring of every name, so an
    # empty keyword must be ignored or a blank skip list would drop everything.
    skip_keywords = [kw for kw in (skip_dir_keywords or ()) if kw]
    include_keywords = [kw for kw in (include_dir_keywords or ()) if kw]

    kept = []
    for name in dirs:
        if any(kw in name for kw in skip_keywords):
            continue
        if include_keywords and not any(kw in name for kw in include_keywords):
            continue
        kept.append(name)
    return kept
 
def get_all_large_files_with_loading(dir_path, threshold_mb, skip_dir_keywords, skip_file_keywords, include_dir_keywords, include_file_keywords, extension=None, interval=0.5):
    """Find and print every file under ``dir_path`` larger than ``threshold_mb`` MiB.

    A loading animation runs in a background thread while the tree is walked.
    Directories and files are filtered with ``filter_dirs`` / ``filter_files``;
    directories filtered out are not descended into.

    Args:
        dir_path: Root directory to walk.
        threshold_mb: Size threshold in MiB; only strictly larger files are reported.
        skip_dir_keywords: Keywords passed to ``filter_dirs`` as the skip list.
        skip_file_keywords: Keywords passed to ``filter_files`` as the skip list.
        include_dir_keywords: Keywords passed to ``filter_dirs`` as the include list.
        include_file_keywords: Keywords passed to ``filter_files`` as the include list.
        extension: Optional file extension (without dot) passed to ``filter_files``.
        interval: Frame interval passed to the loading animation.

    Returns:
        The list of ``{'path': ..., 'size': ...}`` dicts (size in MiB), sorted
        by size in descending order. The same entries are printed, followed by
        the elapsed time.
    """
    start_time = time.time()
    stop_event = Event()
    large_files = []
    loading_thread = threading.Thread(target=show_loading_animation, args=(stop_event, interval))
    loading_thread.daemon = True
    loading_thread.start()

    try:
        for root, dirs, files in os.walk(dir_path):
            # Assign in place so os.walk skips the pruned directories.
            dirs[:] = filter_dirs(dirs, skip_dir_keywords, include_dir_keywords)
            for file in filter_files(files, skip_file_keywords, include_file_keywords, extension):
                full_path = os.path.join(root, file)
                try:
                    # Stat the file once: avoids a second system call and the
                    # window in which the file could vanish between two reads.
                    size_mb = os.path.getsize(full_path) / (1024 * 1024)
                except OSError as e:
                    print(f"警告访问文件出错 {full_path} 出错信息: {e}")
                    continue
                if size_mb > threshold_mb:
                    large_files.append({'path': full_path, 'size': size_mb})
    finally:
        stop_event.set()
        loading_thread.join()

    # The animation leaves the cursor on its own '\r'-rewritten line; end that
    # line so the first result is not appended to it.
    print()
    large_files.sort(key=lambda x: x['size'], reverse=True)
    for file_info in large_files:
        print(f"文件路径: {file_info['path']} | 文件大小: {file_info['size']:.2f} MB")

    end_time = time.time()
    print(f"\n查找共耗时: {end_time - start_time:.2f} 秒")
    return large_files
 
def _parse_keywords(raw):
    """Split a comma-separated string into keywords, trimming and dropping blanks."""
    return [part.strip() for part in raw.split(',') if part.strip()]


def main():
    """Prompt for the search parameters on stdin and run the large-file search.

    Blank keyword answers become empty lists (meaning "no filter"), and a
    blank extension answer becomes ``None``.
    """
    dir_path = input("请输入要检查的目录路径: ").strip()

    # Keep the ValueError handler scoped to the numeric conversion so that an
    # unrelated error during the search is not reported as a bad threshold.
    try:
        threshold_mb = float(input("请输入文件大小阈值(单位: MB): "))
    except ValueError:
        print("错误：请输入有效的数字作为文件大小阈值.")
        return

    # "".split(',') is [''], and '' matches every name, so a plain Enter on a
    # skip prompt would otherwise exclude every directory and file.
    skip_dir_keywords = _parse_keywords(input("请输入要跳过的文件夹名关键词，用逗号分隔(直接回车跳过，推荐modules,~~,.gradle): "))
    skip_file_keywords = _parse_keywords(input("请输入要跳过的文件名关键词，用逗号分隔(直接回车跳过，推荐$): "))
    include_dir_keywords = _parse_keywords(input("请输入要包含的文件夹名关键词，用逗号分隔(直接回车跳过): "))
    include_file_keywords = _parse_keywords(input("请输入要包含的文件名关键词，用逗号分隔(直接回车跳过): "))
    extension = input("请输入要筛选的文件扩展名(例如：txt，可选，直接回车跳过): ").strip().strip('.') or None

    # os.walk silently yields nothing for a missing directory, which would
    # look like "no large files found"; report the problem instead.
    if not os.path.isdir(dir_path):
        print(f"错误：目录不存在或不可访问: {dir_path}")
        return

    try:
        get_all_large_files_with_loading(dir_path, threshold_mb, skip_dir_keywords, skip_file_keywords, include_dir_keywords, include_file_keywords, extension)
        print("搜索结束.")
    except OSError as e:
        print(e)


if __name__ == '__main__':
    main()

方法补充

除了上文的方法，小编还为大家整理了其他Python查找大文件的方法，希望对大家有所帮助

完整代码如下

#! python3
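#chapter09-test02.py - Find the large files inside a folder and print their absolute paths.
#-----To keep the run time bounded, the program only records the first 100 files larger than size; they are not necessarily the 100 largest.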
 
import os,pprint,sys
import timeit,time
 
 
# Decorator: measure and report how long the wrapped call takes.
def colocked_decorator(func):
    """Wrap ``func`` so each call prints its elapsed time, arguments and result.

    The printed line has the form ``[0.0000123s] name(args) -> result``.
    The wrapper returns whatever ``func`` returns.
    """
    import functools  # local import: the module header does not import functools

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def colock(*args, **kwargs):
        startTime = timeit.default_timer()
        result = func(*args, **kwargs)  # run the wrapped function
        spendTime = timeit.default_timer() - startTime
        name = func.__name__
        # Build a readable argument list from the individual arguments.
        argParts = [repr(arg) for arg in args]
        argParts.extend('%s=%r' % (key, value) for key, value in kwargs.items())
        arg_str = ','.join(argParts)
        # One %-format with a matching number of placeholders; the original
        # '[0.7fs] %s(%s) ' had two placeholders for three values and raised
        # TypeError on every call.
        print('[%0.7fs] %s(%s) -> %r' % (spendTime, name, arg_str, result))
        return result
    return colock
 
# Find the large files inside the given folder.
# Returns a list with the absolute paths of the large files found.
# folder   - folder to scan (walked recursively)
# size     - threshold in bytes; a file strictly larger than this counts as large
# maxCount - stop after this many large files have been collected (default 100)
@colocked_decorator
def findBigFile(folder,size,maxCount=100):
    """Return absolute paths of up to ``maxCount`` files under ``folder`` larger than ``size`` bytes.

    The files returned are the first ones encountered during the walk, not
    necessarily the largest ones.
    """
    bigFileAbs=[]
    if maxCount<=0:
        return bigFileAbs
    for foldername,subfolders,filenames in os.walk(folder):
        for filename in filenames:
            # os.path.getsize needs the full path, not just the file name.
            fileAbs=os.path.join(foldername,filename)
            try:
                fileSize=os.path.getsize(fileAbs)
            except OSError:
                # Broken symlinks or unreadable files must not abort the scan.
                continue
            if fileSize>size:
                bigFileAbs.append(os.path.abspath(fileAbs))
                # Stop walking once the cap is reached; continuing would keep
                # stat'ing every remaining file for no benefit.
                if len(bigFileAbs)>=maxCount:
                    return bigFileAbs
    return bigFileAbs
 
# Helper that converts a byte count into a string with a unit such as KB or MiB.
# It is defined here as a utility; the script below does not call it.
# size         - file size in bytes, e.g. the value returned by os.path.getsize()
# is_1024_byte - True converts in multiples of 1024, False in multiples of 1000
# Unit suffixes, keyed by the multiple they belong to.
SUFFIXES = {1000:['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
            1024:['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
def humanReadable_size(size,is_1024_byte=True):
    """Format ``size`` bytes with one decimal and a unit suffix.

    Uses multiples of 1024 (KiB, MiB, ...) when ``is_1024_byte`` is true and
    multiples of 1000 (KB, MB, ...) otherwise.

    Raises:
        ValueError: if ``size`` is too large for the biggest suffix.
    """
    # The flag selects 1024; the original had the two branches swapped.
    mutiple=1024 if is_1024_byte else 1000
    # Divide step by step, moving one unit up per iteration.
    for suffix in SUFFIXES[mutiple]:
        size/=mutiple
        # Stop at the first unit in which the value drops below one multiple.
        if size<mutiple:
            return '{0:.1f}{1}'.format(size,suffix)
    raise ValueError('number too large')
 
         
 
# Raw string: in a normal literal '\D' is an invalid escape sequence, which
# newer Python versions warn about. The resulting value is unchanged.
path=r'F:\DCIM'
size=1000000    # size threshold in bytes
# Only scan when the path exists.
if os.path.exists(path):
    resultList=findBigFile(path,size)
    pprint.pprint(resultList)

else:
    print('You enter path does not exist')
    sys.exit()

到此这篇关于Python查找大文件的实用脚本分享的文章就介绍到这了，更多相关Python查找大文件内容请搜索脚本之家以前的文章或继续浏览下面的相关文章。希望大家以后多多支持脚本之家！

Python查找大文件的实用脚本分享

您可能感兴趣的文章: