让Gemini写了个爬虫,试了下可以下载,用图片描述来命名下载的图片。
将下面的代码保存到 download.py
命令行中输入 python3 download.py
按回车下载,会自动创建文件夹。
(需要安装python3,命令行中输入 pip3 install requests lxml 安装依赖库)
import requests
from lxml import html
import os
import time
import re
# 定义错误日志文件的名称
ERROR_LOG_FILE = "error.md"
def download_images_with_combined_names():
"""
爬取 Duke 大学仓库中的 Gamble 系列图片,并使用“页码_序号_图片描述”格式命名。
下载失败时,将信息记录到 error.md 文件中。
"""
# 主页基础 URL,在此基础上添加分页参数
base_url = "https://repository.duke.edu/dc/gamble?f%5Bcommon_model_name_ssi%5D%5B%5D=Item"
# 本地保存图片的文件夹
download_folder = "gamble_combined_images"
# 创建保存图片的文件夹
if not os.path.exists(download_folder):
os.makedirs(download_folder)
print(f"创建了文件夹: {download_folder}")
# 遍历所有250页
total_pages = 250
for page_num in range(1, total_pages + 1):
page_url = f"{base_url}&page={page_num}"
print(f"\n--- 正在处理第 {page_num} 页, URL: {page_url} ---")
try:
# 获取当前分页内容
response = requests.get(page_url, timeout=10)
response.raise_for_status() # 检查请求是否成功
except requests.exceptions.RequestException as e:
print(f"获取第 {page_num} 页失败: {e}")
continue
# 解析分页 HTML
tree = html.fromstring(response.content)
image_blocks = tree.xpath('//div[contains(@class, "blacklight-image")]')
if not image_blocks:
print(f"第 {page_num} 页未找到任何图片信息块。")
continue
for i, block in enumerate(image_blocks, 1):
# 找到缩略图链接
thumbnail_link = block.xpath('.//img[contains(@class, "img-thumbnail")]/@src')
if not thumbnail_link:
continue
thumbnail_link = thumbnail_link[0]
# 格式化页码和序号
page_str = f"{page_num:03d}"
image_str = f"{i:02d}"
# 查找图片描述
image_description = block.xpath('.//a[@itemprop="name"]/text()')
# --- 生成最终文件名 ---
if image_description:
base_filename = image_description[0].strip()
base_filename = re.sub(r'[\\/:*?"<>|]', '', base_filename)
base_filename = base_filename[:100].strip()
final_filename = f"{page_str}_{image_str}_{base_filename}.jpg"
else:
final_filename = f"{page_str}_{image_str}.jpg"
# --- 文件名生成结束 ---
# 将缩略图链接替换为原图链接
if '!350,350' in thumbnail_link:
high_res_link = thumbnail_link.replace('!350,350', 'full')
else:
continue
full_download_url = f"https://repository.duke.edu{high_res_link}"
file_path = os.path.join(download_folder, final_filename)
print(f" > 正在处理第 {i} 个图片,文件名: {final_filename}")
# 下载图片
try:
if not os.path.exists(file_path):
image_data = requests.get(full_download_url, stream=True, timeout=30)
image_data.raise_for_status()
with open(file_path, 'wb') as f:
for chunk in image_data.iter_content(chunk_size=8192):
f.write(chunk)
print(f" > 下载成功: {final_filename}")
else:
print(f" > 文件已存在,跳过: {final_filename}")
except requests.exceptions.RequestException as e:
# --- 核心修改:下载失败时记录日志 ---
print(f" > 下载失败: {e}")
log_entry = f"- 序号: {page_str}_{image_str}, 描述: '{image_description[0].strip() if image_description else '无'}' 下载链接: {full_download_url}\n"
with open(ERROR_LOG_FILE, 'a', encoding='utf-8') as log_file:
log_file.write(log_entry)
print(f" > 错误信息已记录到 {ERROR_LOG_FILE}")
# --- 记录日志结束 ---
time.sleep(0.5)
print(f"\n第 {page_num} 页处理完毕。")
time.sleep(2)
print("\n所有图片处理完毕。")
print(f"所有下载失败的记录已保存在 {ERROR_LOG_FILE} 文件中。")
if __name__ == "__main__":
download_images_with_combined_names()