python

关注公众号 jb51net

关闭
首页 > 脚本专栏 > python > Python下载文件

从基础到进阶详解Python下载文件的方法完整指南

作者:detayun

在Python中下载文件是一项常见任务,本文将系统介绍Python下载文件的多种方法,涵盖基础实现、高级技巧和常见问题解决方案,感兴趣的小伙伴可以跟随小编一起学习一下

在Python中下载文件是一项常见任务,无论是从网页下载图片、文档,还是通过API获取数据,掌握文件下载技术都是开发者的必备技能。本文将系统介绍Python下载文件的多种方法,涵盖基础实现、高级技巧和常见问题解决方案。

一、基础方法:使用标准库下载文件

1. 使用urllib.request(Python内置库)

import urllib.request

url = "https://example.com/file.zip"
filename = "downloaded_file.zip"

try:
    # urlretrieve fetches the resource at `url` straight into `filename`.
    urllib.request.urlretrieve(url, filename)
    # Fixed: the placeholder was garbled to "(unknown)"; report the actual path.
    print(f"文件已下载到: {filename}")
except Exception as e:
    print(f"下载失败: {e}")

特点:使用Python内置库,无需安装第三方依赖,适合简单的一次性下载;但功能有限,不便于流式下载、进度显示等高级控制。

2. 使用requests库(推荐)

import requests

url = "https://example.com/file.zip"
filename = "downloaded_file.zip"

try:
    # stream=True avoids loading the whole body into memory for large files.
    response = requests.get(url, stream=True)
    response.raise_for_status()  # raise on HTTP error status

    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):  # write in chunks
            if chunk:  # skip keep-alive chunks
                f.write(chunk)

    # Fixed: the placeholder was garbled to "(unknown)"; report the actual path.
    print(f"文件已下载到: {filename}")
except requests.exceptions.RequestException as e:
    print(f"下载失败: {e}")

优势:接口简洁易用,原生支持流式下载、超时、重试与会话管理,并能通过raise_for_status()方便地检查HTTP错误,是下载文件的推荐方案。

二、进阶技巧:增强下载功能

1. 显示下载进度

import requests
from tqdm import tqdm  # third-party: pip install tqdm

url = "https://example.com/large_file.zip"
filename = "large_file.zip"

try:
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail fast on HTTP errors (was missing)
    # Total size if the server reports it; 0 leaves the bar indeterminate.
    total_size = int(response.headers.get('content-length', 0))

    with open(filename, 'wb') as f, tqdm(
        desc=filename,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
            bar.update(len(chunk))

    print("\n下载完成!")
except Exception as e:
    print(f"下载失败: {e}")

2. 断点续传功能

import os
import requests

url = "https://example.com/large_file.zip"
filename = "large_file.zip"

# Bytes already on disk, if any, become the resume offset.
downloaded_size = 0
if os.path.exists(filename):
    downloaded_size = os.path.getsize(filename)

headers = {'Range': f'bytes={downloaded_size}-'}

try:
    response = requests.get(url, headers=headers, stream=True)
    response.raise_for_status()

    # 206 means the server honored the Range header. Anything else (usually
    # 200) means it sent the whole file, so appending would corrupt the
    # output — restart the download from scratch instead.
    mode = 'ab' if response.status_code == 206 else 'wb'
    with open(filename, mode) as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

    print("下载完成!")
except Exception as e:
    print(f"下载失败: {e}")

3. 多线程/异步下载(加速下载)

import requests
from concurrent.futures import ThreadPoolExecutor
import os

def download_chunk(url, start, end, filename, chunk_num):
    """Download bytes [start, end] of `url` into `<filename>.part<chunk_num>`.

    :param url: source URL (server must support HTTP Range requests)
    :param start: first byte offset (inclusive)
    :param end: last byte offset (inclusive)
    :param filename: base name of the target file; the part file is derived from it
    :param chunk_num: index of this chunk, used in the part-file suffix
    :return: True on success, False on failure
    """
    headers = {'Range': f'bytes={start}-{end}'}
    try:
        response = requests.get(url, headers=headers, stream=True)
        response.raise_for_status()  # don't write an error page into the part file
        # Fixed: the part-file name was garbled to "(unknown)"; it must be
        # derived from `filename` so merge_files can locate it.
        with open(f"{filename}.part{chunk_num}", 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"分块{chunk_num}下载失败: {e}")
        return False

def merge_files(filename, num_chunks):
    """Concatenate `<filename>.part0` … `.part<num_chunks-1>` into `filename`.

    Each part file is deleted after it is copied; missing parts are skipped
    silently (the corresponding download may have failed).
    """
    with open(filename, 'wb') as outfile:
        for i in range(num_chunks):
            # Fixed: the part-file name was garbled to "(unknown)"; it must
            # match the names written by download_chunk.
            part_filename = f"{filename}.part{i}"
            if os.path.exists(part_filename):
                with open(part_filename, 'rb') as infile:
                    outfile.write(infile.read())
                os.remove(part_filename)

url = "https://example.com/very_large_file.zip"
filename = "very_large_file.zip"
file_size = 1024 * 1024 * 100  # assumed 100 MB; real code should read Content-Length
chunk_size = 1024 * 1024 * 10  # 10 MB per chunk

# Ceiling division: floor (//) would silently drop a trailing partial chunk
# whenever file_size is not an exact multiple of chunk_size.
num_chunks = -(-file_size // chunk_size)

# Download each byte range on a worker thread.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for i in range(num_chunks):
        start = i * chunk_size
        # Clamp the last chunk to the end of the file (Range ends are inclusive).
        end = min(start + chunk_size, file_size) - 1
        futures.append(executor.submit(
            download_chunk, url, start, end, filename, i
        ))

    # Wait for every chunk and surface any worker exception.
    for future in futures:
        future.result()

# Stitch the part files together in order.
merge_files(filename, num_chunks)
print("下载并合并完成!")

三、常见场景解决方案

1. 下载网页上的所有资源

import requests
from bs4 import BeautifulSoup
import os

def download_resources(url, output_folder="downloads"):
    """Download every <img> referenced by the page at `url` into `output_folder`.

    :param url: page URL to scrape
    :param output_folder: directory for the saved files (created if absent)
    """
    from urllib.parse import urljoin  # stdlib; resolves relative URLs correctly

    os.makedirs(output_folder, exist_ok=True)

    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Download images.
        for img in soup.find_all('img'):
            img_url = img.get('src')
            if img_url and not img_url.startswith('data:'):  # skip inline data URIs
                # urljoin handles absolute, root-relative ("/x.png") and
                # relative ("../x.png") paths; the original string
                # concatenation broke on all but the simplest case.
                img_url = urljoin(url, img_url)
                try:
                    img_data = requests.get(img_url).content
                    img_name = os.path.join(output_folder, img_url.split('/')[-1])
                    with open(img_name, 'wb') as f:
                        f.write(img_data)
                except Exception as e:
                    print(f"图片下载失败: {e}")

        # CSS/JS and other resources can be downloaded the same way.
        print("资源下载完成!")
    except Exception as e:
        print(f"网页下载失败: {e}")

download_resources("https://example.com")

2. 使用代理下载

import requests

# Route HTTP and HTTPS traffic through the example proxy endpoints.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

url = "https://example.com"
try:
    page = requests.get(url, proxies=proxies)
    # Persist the fetched HTML as UTF-8 text.
    with open("page.html", 'w', encoding='utf-8') as out:
        out.write(page.text)
    print("通过代理下载成功!")
except Exception as e:
    print(f"代理下载失败: {e}")

3. 处理下载重定向

import requests

url = "http://example.com/redirecting_link"
try:
    # requests follows redirects by default; the flag just makes it explicit.
    response = requests.get(url, allow_redirects=True)
    # response.url holds the URL after all redirects were followed.
    print(f"最终URL: {response.url}")

    # Save the body fetched from the final location.
    with open("final_file.txt", 'wb') as output:
        output.write(response.content)
except Exception as e:
    print(f"下载失败: {e}")

四、最佳实践与注意事项

  1. 错误处理:始终添加异常处理,特别是网络请求可能因各种原因失败
  2. 资源清理:使用with语句确保文件正确关闭
  3. 大文件处理:使用流式下载(stream=True)和分块写入
  4. 安全性
    • 验证SSL证书(默认行为)
    • 对用户提供的URL进行验证
    • 限制文件类型和保存路径
  5. 性能优化
    • 合理设置分块大小(通常8KB-1MB)
    • 多线程下载适合高延迟网络
    • 考虑使用异步IO(如aiohttp)提高并发性能

五、完整示例:带进度条的下载函数

import requests
from tqdm import tqdm
import os

def download_file(url, filename=None, chunk_size=8192):
    """
    Download a file while displaying a progress bar.

    :param url: file URL
    :param filename: target file name (optional; derived from the URL by default)
    :param chunk_size: write-chunk size in bytes
    :return: path of the saved file, or None on failure
    """
    try:
        # Derive the file name from the URL when none was given.
        if filename is None:
            filename = os.path.basename(url.split('?')[0])  # strip query string

        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Total size if the server reports it (0 keeps the bar indeterminate).
        total_size = int(response.headers.get('content-length', 0))

        # Context managers guarantee both the file and the progress bar are
        # closed even if iter_content raises mid-download — the original
        # leaked the tqdm bar on any exception after it was created.
        with open(filename, 'wb') as f, tqdm(
            desc=filename,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                progress_bar.update(len(chunk))

        print(f"\n文件已保存到: {os.path.abspath(filename)}")
        return filename

    except requests.exceptions.RequestException as e:
        print(f"下载失败: {e}")
        return None

# Usage example
download_file("https://example.com/sample.pdf", "my_document.pdf")

总结

Python提供了多种下载文件的方法,从简单的urllib到功能强大的requests库,再到结合多线程/异步的优化方案。根据实际需求选择合适的方法:简单任务用urllib或requests即可,大文件应采用流式下载并分块写入,对速度要求高的场景可考虑多线程分块或异步下载。

掌握这些技术后,你可以轻松应对各种文件下载场景,构建更健壮的Python应用程序。

到此这篇关于从基础到进阶详解Python下载文件的方法完整指南的文章就介绍到这了,更多相关Python下载文件内容请搜索脚本之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持脚本之家!

您可能感兴趣的文章:
阅读全文