Python爬虫实战：利用异步技术与图像识别智能下载高清壁纸-编程实验室

前言：壁纸下载的现代爬虫解决方案

在当今数字时代，高清壁纸已成为我们个性化设备、提升视觉体验的重要元素。然而，手动从壁纸网站一张张下载不仅耗时耗力，而且效率低下。本文将介绍如何使用Python爬虫技术，结合最新异步框架和智能识别算法，批量自动化下载高清壁纸，打造个性化的壁纸收藏库。

技术栈亮点

本爬虫项目采用了一系列前沿技术：

异步请求库：aiohttp（比requests快5-10倍）
HTML解析：BeautifulSoup4 + lxml（高效解析）
智能去重：图像指纹识别 + 感知哈希算法
并发控制：asyncio信号量 + 自适应限流
反爬对抗：随机User-Agent + 代理池 + 请求延迟随机化

项目结构设计

text

wallpaper_crawler/
├── core/
│   ├── async_downloader.py   # 异步下载器核心
│   ├── image_processor.py    # 图像处理与去重
│   └── anti_anti_crawl.py    # 反反爬策略
├── utils/
│   ├── config_loader.py      # 配置管理
│   ├── logger.py             # 日志系统
│   └── progress_tracker.py   # 进度追踪
├── data/
│   ├── downloaded/           # 下载的壁纸
│   ├── cache/                # 缓存数据
│   └── logs/                 # 日志文件
└── main.py                   # 主程序入口

完整代码实现

1. 主爬虫类 - 异步高清壁纸下载器

python

"""High-resolution wallpaper crawler.

Provides asynchronous concurrent downloads, image de-duplication and basic
anti-anti-crawling measures (random User-Agent, randomised delays, proxies).
"""
import asyncio
import hashlib
import io
import json
import logging
import random
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import aiofiles
import aiohttp
import cv2
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image

# On-disk cache of image fingerprints, shared between runs.
HASH_CACHE_FILE = Path("cache") / "image_hashes.json"

# URL file extensions that are kept when naming a downloaded image; anything
# else falls back to ".jpg".
KNOWN_IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp"}


@dataclass
class CrawlerConfig:
    """Crawler settings."""

    base_url: str = "https://wallhaven.cc"
    search_url: str = "https://wallhaven.cc/search"
    categories: str = "111"  # one flag per position: general/anime/people
    purity: str = "100"  # one flag per position: sfw/sketchy/nsfw
    sorting: str = "random"
    order: str = "desc"
    max_pages: int = 50
    concurrent_requests: int = 20
    download_timeout: int = 30
    save_dir: str = "downloaded_wallpapers"
    min_resolution: tuple = (1920, 1080)
    enable_duplicate_check: bool = True
    # NOTE(review): not read anywhere in this listing; duplicates are detected
    # by exact hash equality only.
    duplicate_threshold: float = 0.95


class AsyncWallpaperDownloader:
    """Core asynchronous wallpaper downloader."""

    def __init__(self, config: Optional[CrawlerConfig] = None):
        """Set up logging, directories, the User-Agent pool and counters.

        Args:
            config: Crawler settings; a default ``CrawlerConfig`` is used
                when omitted.
        """
        self.config = config or CrawlerConfig()
        self.setup_logging()
        self.setup_directories()

        # Pool of User-Agent strings to rotate through.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
        ]

        # In-memory fingerprint set; the on-disk cache is merged in lazily.
        self.image_hashes = set()
        self._hash_cache_loaded = False

        # Run statistics.
        self.stats = {
            'total_found': 0,
            'downloaded': 0,
            'skipped_duplicate': 0,
            'failed': 0,
        }

    def setup_logging(self):
        """Configure logging to a timestamped file under ``logs/`` and stderr."""
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)
        log_file = log_dir / f"crawler_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(str(log_file)),
                logging.StreamHandler(),
            ],
        )
        self.logger = logging.getLogger(__name__)

    def setup_directories(self):
        """Create the save directory and the cache directory."""
        Path(self.config.save_dir).mkdir(exist_ok=True, parents=True)
        HASH_CACHE_FILE.parent.mkdir(exist_ok=True)

    def get_random_headers(self) -> Dict:
        """Return request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Referer': self.config.base_url,
        }

    async def fetch_page(self, session: aiohttp.ClientSession, url: str,
                         params: Optional[Dict] = None) -> Optional[str]:
        """Fetch a page and return its text.

        Args:
            session: Shared aiohttp session.
            url: Page URL.
            params: Optional query parameters.

        Returns:
            The response body, or ``None`` on a non-200 status or any error.
        """
        try:
            await asyncio.sleep(random.uniform(0.5, 2.0))  # randomised delay
            async with session.get(
                url,
                headers=self.get_random_headers(),
                params=params,
                timeout=aiohttp.ClientTimeout(total=30),
            ) as response:
                if response.status == 200:
                    return await response.text()
                self.logger.warning(f"请求失败: {url}, 状态码: {response.status}")
                return None
        except Exception as e:  # boundary: a failed page must not abort the crawl
            self.logger.error(f"请求异常 {url}: {str(e)}")
            return None

    def extract_image_urls(self, html: str) -> List[str]:
        """Extract full-size image URLs from a search-result page.

        Every ``<img class="lazyload">`` with a ``data-src`` attribute is
        treated as a thumbnail and rewritten into a full-size URL.
        """
        soup = BeautifulSoup(html, 'lxml')
        image_urls = []
        for thumb in soup.find_all('img', {'class': 'lazyload'}):
            data_src = thumb.get('data-src')
            if data_src:
                # NOTE(review): this rewrite keeps the thumbnail host and file
                # name; verify that it matches the site's real full-size URL
                # scheme before relying on it.
                hd_url = data_src.replace('/small/', '/full/').replace('th-', 'wall-')
                image_urls.append(hd_url)
        self.logger.info(f"本页发现 {len(image_urls)} 张图片")
        return image_urls

    def calculate_image_hash(self, image_data: bytes) -> str:
        """Return a fingerprint of the image, used for de-duplication.

        The image is decoded as greyscale, shrunk to 8x8, and each pixel is
        compared against the mean to build a 64-bit average hash, returned as
        16 hex digits. If the data cannot be decoded or processed, the MD5 of
        the raw bytes is returned instead.
        """
        try:
            nparr = np.frombuffer(image_data, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
            if img is None:
                return hashlib.md5(image_data).hexdigest()
            img = cv2.resize(img, (8, 8))
            avg = img.mean()
            bits = ''.join('1' if pixel > avg else '0' for pixel in img.flatten())
            return f"{int(bits, 2):016x}"
        except Exception:
            # Fallback: exact-content hash.
            return hashlib.md5(image_data).hexdigest()

    def _load_hash_cache(self):
        """Merge the on-disk fingerprint cache into memory, once per instance."""
        if self._hash_cache_loaded:
            return
        self._hash_cache_loaded = True
        if not HASH_CACHE_FILE.exists():
            return
        try:
            with open(HASH_CACHE_FILE, 'r', encoding='utf-8') as f:
                self.image_hashes.update(json.load(f))
        except (OSError, ValueError, TypeError) as exc:
            # A corrupt cache must not make every download fail.
            self.logger.warning(f"哈希缓存读取失败，已忽略: {exc}")

    def _save_hash_cache(self):
        """Write the in-memory fingerprint set to the cache file."""
        try:
            HASH_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
            with open(HASH_CACHE_FILE, 'w', encoding='utf-8') as f:
                json.dump(sorted(self.image_hashes), f)
        except OSError as exc:
            self.logger.warning(f"哈希缓存写入失败: {exc}")

    def is_duplicate_image(self, image_hash: str) -> bool:
        """Return True if ``image_hash`` was seen before; otherwise record it.

        Always returns False when duplicate checking is disabled. A new hash
        is added to the in-memory set and the cache file is rewritten.
        """
        if not self.config.enable_duplicate_check:
            return False
        self._load_hash_cache()
        if image_hash in self.image_hashes:
            return True
        self.image_hashes.add(image_hash)
        self._save_hash_cache()
        return False

    def _meets_min_resolution(self, image_data: bytes) -> bool:
        """Return False only when the image is readable and too small."""
        try:
            with Image.open(io.BytesIO(image_data)) as img:
                size = img.size
        except Exception as exc:
            # Best effort: an unreadable image skips the check, it does not fail.
            self.logger.warning(f"无法读取图片尺寸，跳过分辨率检查: {exc}")
            return True
        if size[0] < self.config.min_resolution[0] or \
                size[1] < self.config.min_resolution[1]:
            self.logger.warning(f"分辨率过低: {size}")
            return False
        return True

    async def download_image(self, session: aiohttp.ClientSession, url: str,
                             semaphore: asyncio.Semaphore) -> bool:
        """Download one image, filter it, and save it.

        Args:
            session: Shared aiohttp session.
            url: Image URL.
            semaphore: Limits the number of concurrent downloads.

        Returns:
            True if the image was saved; False if it failed, was below the
            minimum resolution, or was a duplicate. ``self.stats`` is updated.
        """
        async with semaphore:
            try:
                self.logger.info(f"开始下载: {url}")
                async with session.get(
                    url,
                    headers=self.get_random_headers(),
                    timeout=aiohttp.ClientTimeout(total=self.config.download_timeout),
                ) as response:
                    if response.status != 200:
                        self.logger.warning(f"图片下载失败: {url}, 状态码: {response.status}")
                        self.stats['failed'] += 1
                        return False
                    image_data = await response.read()

                if not self._meets_min_resolution(image_data):
                    self.stats['failed'] += 1
                    return False

                image_hash = self.calculate_image_hash(image_data)
                if self.is_duplicate_image(image_hash):
                    self.logger.info("跳过重复图片")
                    self.stats['skipped_duplicate'] += 1
                    return False

                # Keep the URL's extension when it is a known image type.
                suffix = Path(urlparse(url).path).suffix.lower()
                if suffix not in KNOWN_IMAGE_SUFFIXES:
                    suffix = ".jpg"
                digest = hashlib.md5(image_data).hexdigest()[:8]
                filename = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{digest}{suffix}"
                save_path = Path(self.config.save_dir) / filename

                async with aiofiles.open(save_path, 'wb') as f:
                    await f.write(image_data)

                self.logger.info(f"下载完成: {save_path}")
                self.stats['downloaded'] += 1
                return True
            except Exception as e:  # boundary: one bad image must not stop the rest
                self.logger.error(f"下载异常 {url}: {str(e)}")
                self.stats['failed'] += 1
                return False

    async def crawl_search_page(self, session: aiohttp.ClientSession, page: int,
                                semaphore: asyncio.Semaphore) -> List[str]:
        """Crawl one search-result page and download every image found on it.

        Returns:
            The image URLs found on the page (empty if the page fetch failed).
        """
        params = {
            'categories': self.config.categories,
            'purity': self.config.purity,
            'sorting': self.config.sorting,
            'order': self.config.order,
            'page': str(page),
        }
        url = self.config.search_url
        self.logger.info(f"爬取第 {page} 页: {url}")

        html = await self.fetch_page(session, url, params)
        if not html:
            return []

        image_urls = self.extract_image_urls(html)
        self.stats['total_found'] += len(image_urls)

        # Download the page's images concurrently.
        tasks = [
            asyncio.create_task(self.download_image(session, img_url, semaphore))
            for img_url in image_urls
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for img_url, result in zip(image_urls, results):
            if isinstance(result, BaseException):
                # Surface anything gather() captured instead of dropping it.
                self.logger.error(f"下载异常 {img_url}: {result}")
        return image_urls

    async def run(self):
        """Crawl ``max_pages`` search pages and log summary statistics."""
        self.logger.info("开始高清壁纸爬虫任务")
        start_time = datetime.now()

        # Semaphore bounding concurrent image downloads.
        semaphore = asyncio.Semaphore(self.config.concurrent_requests)

        # Connection pool. TLS certificate verification is left at aiohttp's
        # default (enabled); the original passed ssl=False, which turns it off.
        connector = aiohttp.TCPConnector(limit=100)

        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = []
            for page in range(1, self.config.max_pages + 1):
                tasks.append(asyncio.create_task(
                    self.crawl_search_page(session, page, semaphore)
                ))
                # Stagger page requests.
                await asyncio.sleep(random.uniform(1, 3))
            await asyncio.gather(*tasks)

        elapsed = datetime.now() - start_time
        seconds = elapsed.total_seconds()
        summary = "\n".join([
            "",
            "==================== 爬虫统计 ====================",
            f"总发现图片数: {self.stats['total_found']}",
            f"成功下载数: {self.stats['downloaded']}",
            f"跳过重复数: {self.stats['skipped_duplicate']}",
            f"失败数: {self.stats['failed']}",
            f"总耗时: {seconds:.2f}秒",
            f"平均速度: {self.stats['downloaded'] / max(seconds, 1):.2f} 张/秒",
            "=================================================",
            "",
        ])
        self.logger.info(summary)


class AdvancedWallpaperCrawler(AsyncWallpaperDownloader):
    """Wallpaper crawler with proxy-pool support."""

    def __init__(self, config: Optional[CrawlerConfig] = None):
        """Initialise the base downloader and load the proxy pool."""
        super().__init__(config)
        self.proxy_pool = self.load_proxy_pool()

    def load_proxy_pool(self) -> List[str]:
        """Read proxies from ``proxies.txt``, one per non-blank line.

        Returns an empty list when the file does not exist.
        """
        proxy_file = Path("proxies.txt")
        if proxy_file.exists():
            with open(proxy_file, 'r') as f:
                return [line.strip() for line in f if line.strip()]
        return []

    def get_proxy(self) -> Optional[str]:
        """Return a random proxy from the pool, or ``None`` if it is empty."""
        if self.proxy_pool:
            return random.choice(self.proxy_pool)
        return None

    async def download_with_proxy_rotation(self, session: aiohttp.ClientSession,
                                           url: str,
                                           max_retries: int = 3) -> Optional[bytes]:
        """Download ``url``, retrying through random proxies with backoff.

        The first attempt is made directly; later attempts each pick a random
        proxy from the pool (if any).

        Args:
            session: Shared aiohttp session.
            url: Resource URL.
            max_retries: Total number of attempts.

        Returns:
            The response body, or ``None`` if every attempt failed.
        """
        for retry in range(max_retries):
            # First attempt goes direct; the original always passed the proxy.
            proxy = self.get_proxy() if retry > 0 else None
            if proxy:
                self.logger.info(f"尝试使用代理: {proxy}")
            try:
                async with session.get(
                    url,
                    proxy=proxy,
                    headers=self.get_random_headers(),
                    timeout=aiohttp.ClientTimeout(total=30),
                ) as response:
                    if response.status == 200:
                        return await response.read()
                    self.logger.warning(
                        f"代理下载失败 (尝试 {retry+1}/{max_retries}): {response.status}"
                    )
            except Exception as e:
                self.logger.error(f"代理下载异常: {str(e)}")
            if retry < max_retries - 1:
                await asyncio.sleep(2 ** retry)  # exponential backoff between attempts
        return None


async def main():
    """Example entry point with a custom configuration."""
    config = CrawlerConfig(
        max_pages=10,  # crawl 10 pages
        concurrent_requests=15,  # 15 concurrent downloads
        save_dir="my_wallpapers",
        min_resolution=(2560, 1440),  # 2K
        enable_duplicate_check=True,
    )
    crawler = AsyncWallpaperDownloader(config)
    await crawler.run()


if __name__ == "__main__":
    asyncio.run(main())

2. 智能图像处理模块

python

"""Image analysis helpers: quality scoring, dominant colours and palettes."""
import numpy as np
from PIL import Image, ImageFilter
import cv2
import hashlib
from typing import Tuple, Optional
from sklearn.cluster import KMeans
import colorsys


class ImageIntelligence:
    """Stateless image-analysis utilities operating on encoded image bytes."""

    @staticmethod
    def _decode(image_data: bytes, flags: int):
        """Decode image bytes with OpenCV; return ``None`` if that fails."""
        if not image_data:
            return None
        try:
            return cv2.imdecode(np.frombuffer(image_data, np.uint8), flags)
        except cv2.error:
            return None

    @staticmethod
    def detect_image_quality(image_data: bytes) -> Tuple[float, float]:
        """Score an image's sharpness and cleanliness.

        Returns:
            ``(clarity, cleanliness)``, both in ``[0, 1]``. ``clarity`` is the
            variance of the Laplacian divided by 1000 and capped at 1.
            ``cleanliness`` is ``1 - noise / 50`` floored at 0, where noise is
            the mean absolute difference from a 3x3 Gaussian blur, so a
            HIGHER value means LESS noise. ``(0.0, 0.0)`` is returned when the
            data cannot be decoded.
        """
        img = ImageIntelligence._decode(image_data, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return 0.0, 0.0

        # Sharpness: variance of the Laplacian.
        clarity = cv2.Laplacian(img, cv2.CV_64F).var()

        # Noise: mean absolute difference from a lightly blurred copy.
        denoised = cv2.GaussianBlur(img, (3, 3), 0)
        noise_score = np.mean(
            np.abs(img.astype(np.float32) - denoised.astype(np.float32))
        )

        clarity_norm = min(clarity / 1000, 1.0)
        noise_norm = max(0, 1 - noise_score / 50)
        return float(clarity_norm), float(noise_norm)

    @staticmethod
    def extract_dominant_colors(image_data: bytes, n_colors: int = 5,
                                max_side: Optional[int] = None) -> list:
        """Extract dominant colours with K-means clustering.

        Args:
            image_data: Encoded image bytes.
            n_colors: Number of clusters requested.
            max_side: If given, the image is shrunk so its longer side is at
                most this many pixels before clustering, which makes large
                images much cheaper to process. ``None`` clusters every pixel.

        Returns:
            A list of dicts with keys ``'rgb'`` (tuple of ints), ``'hex'`` and
            ``'proportion'``, sorted by descending proportion. Empty when the
            data cannot be decoded or ``n_colors`` is less than 1.
        """
        if n_colors < 1:
            return []
        img = ImageIntelligence._decode(image_data, cv2.IMREAD_COLOR)
        if img is None:
            return []

        if max_side is not None and max_side > 0:
            height, width = img.shape[:2]
            longest = max(height, width)
            if longest > max_side:
                scale = max_side / longest
                new_size = (max(1, round(width * scale)), max(1, round(height * scale)))
                img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        pixels = img_rgb.reshape((-1, 3))

        # K-means needs at least as many samples as clusters.
        n_clusters = min(n_colors, len(pixels))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(pixels)

        colors = kmeans.cluster_centers_.astype(int)
        counts = np.bincount(kmeans.labels_, minlength=n_clusters)
        proportions = counts / len(pixels)

        result = []
        for i in np.argsort(proportions)[::-1]:
            # Plain Python ints keep the tuple JSON-serialisable.
            r, g, b = (int(c) for c in colors[i])
            result.append({
                'rgb': (r, g, b),
                'hex': f"#{r:02x}{g:02x}{b:02x}",
                'proportion': float(proportions[i]),
            })
        return result

    @staticmethod
    def is_suitable_for_wallpaper(image_data: bytes, min_clarity: float = 0.3,
                                  max_noise: float = 0.7) -> bool:
        """Return True if the image is sharp enough and not too noisy.

        Args:
            image_data: Encoded image bytes.
            min_clarity: Minimum clarity score in ``[0, 1]``.
            max_noise: Maximum tolerated noise level in ``[0, 1]``.
        """
        clarity, cleanliness = ImageIntelligence.detect_image_quality(image_data)
        # detect_image_quality reports cleanliness (1 = no noise); convert it
        # to a noise level before comparing against the maximum. Comparing the
        # raw score rejected the cleanest images.
        noise_level = 1.0 - cleanliness
        return clarity >= min_clarity and noise_level <= max_noise

    @staticmethod
    def generate_color_palette_image(dominant_colors: list,
                                     save_path: str = "palette.png"):
        """Render the colours as a row of 100x100 swatches and save the image.

        Args:
            dominant_colors: Dicts with an ``'rgb'`` entry, as returned by
                ``extract_dominant_colors``.
            save_path: Output file path.

        Returns:
            ``save_path``.

        Raises:
            ValueError: If ``dominant_colors`` is empty.
        """
        if not dominant_colors:
            raise ValueError("dominant_colors must not be empty")

        palette_height = 100
        total_width = len(dominant_colors) * 100
        palette_img = Image.new('RGB', (total_width, palette_height))
        for i, color_info in enumerate(dominant_colors):
            rgb = tuple(int(c) for c in color_info['rgb'])
            color_block = Image.new('RGB', (100, palette_height), rgb)
            palette_img.paste(color_block, (i * 100, 0))
        palette_img.save(save_path)
        return save_path

3. 配置文件管理

python

"""Configuration file manager supporting JSON, YAML and TOML files."""
import json
from pathlib import Path
from typing import Any, Dict

try:
    import yaml
except ImportError:  # PyYAML is only needed for .yaml/.yml files
    yaml = None

try:
    import tomllib
except ImportError:  # tomllib is in the standard library from Python 3.11
    tomllib = None


def _toml_key(key: Any) -> str:
    """Return ``key`` as a TOML key, quoting it unless it is a bare key."""
    text = str(key)
    if text and all(c.isascii() and (c.isalnum() or c in "-_") for c in text):
        return text
    return json.dumps(text, ensure_ascii=False)


def _toml_value(value: Any) -> str:
    """Serialise a scalar, list or dict as a TOML value."""
    if isinstance(value, bool):  # must precede int: bool is an int subclass
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return repr(value)
    if isinstance(value, str):
        return json.dumps(value, ensure_ascii=False)
    if isinstance(value, (list, tuple)):
        return "[" + ", ".join(_toml_value(item) for item in value) + "]"
    if isinstance(value, dict):
        pairs = ", ".join(
            f"{_toml_key(k)} = {_toml_value(v)}" for k, v in value.items()
        )
        return "{" + pairs + "}"
    raise TypeError(f"TOML cannot represent {type(value).__name__} values")


def _dump_toml(data: Dict[str, Any], prefix: tuple = ()) -> list:
    """Return the lines of a TOML document for a nested dict."""
    lines = []
    if prefix:
        lines.append("[" + ".".join(_toml_key(part) for part in prefix) + "]")
    tables = []
    for key, value in data.items():
        if isinstance(value, dict):
            tables.append((key, value))  # emitted after this table's scalars
        else:
            lines.append(f"{_toml_key(key)} = {_toml_value(value)}")
    if lines:
        lines.append("")
    for key, value in tables:
        lines.extend(_dump_toml(value, prefix + (key,)))
    return lines


class ConfigManager:
    """Load, create and query a configuration file.

    The format is chosen from the file suffix: ``.json``, ``.yaml``/``.yml``
    or ``.toml``.
    """

    _YAML_SUFFIXES = ('.yaml', '.yml')
    _SUPPORTED_SUFFIXES = ('.json', '.yaml', '.yml', '.toml')

    def __init__(self, config_file: str = "config.yaml"):
        """Load ``config_file``, creating it with defaults if it is missing.

        Raises:
            ValueError: If the file suffix is not a supported format.
        """
        self.config_file = Path(config_file)
        self.config = self.load_config()

    @staticmethod
    def _require_yaml():
        """Raise ImportError if PyYAML is not installed."""
        if yaml is None:
            raise ImportError("PyYAML is required for .yaml/.yml configuration files")

    def load_config(self) -> Dict[str, Any]:
        """Load the configuration file.

        If the file does not exist, a default configuration is written to it
        and returned.

        Raises:
            ValueError: If the file suffix is not a supported format. This is
                checked before anything is written, so a file that could
                never be read back is not created.
        """
        suffix = self.config_file.suffix.lower()
        if suffix not in self._SUPPORTED_SUFFIXES:
            raise ValueError(f"不支持的配置文件格式: {suffix}")

        if not self.config_file.exists():
            default_config = {
                'crawler': {
                    'base_url': 'https://wallhaven.cc',
                    'max_pages': 20,
                    'concurrent_requests': 20,
                    'save_dir': 'wallpapers',
                    'min_resolution': [1920, 1080],
                    'categories': '111',
                    'purity': '100',
                },
                'image_processing': {
                    'enable_duplicate_check': True,
                    'quality_threshold': 0.3,
                    'extract_colors': True,
                },
                'performance': {
                    'timeout': 30,
                    'max_retries': 3,
                    'delay_range': [0.5, 2.0],
                },
            }
            self.save_config(default_config)
            return default_config

        if suffix == '.json':
            with open(self.config_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        elif suffix in self._YAML_SUFFIXES:
            self._require_yaml()
            with open(self.config_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        else:  # '.toml'
            if tomllib is None:
                raise ImportError("Reading .toml files requires Python 3.11+ (tomllib)")
            with open(self.config_file, 'rb') as f:
                data = tomllib.load(f)
        # An empty YAML file (or JSON null) loads as None; use an empty config.
        return data if data is not None else {}

    def save_config(self, config: Dict[str, Any]):
        """Write ``config`` to the configuration file in the file's own format.

        Raises:
            ValueError: If the file suffix is not a supported format.
            TypeError: If a value cannot be represented in TOML.
        """
        suffix = self.config_file.suffix.lower()
        if suffix not in self._SUPPORTED_SUFFIXES:
            raise ValueError(f"不支持的配置文件格式: {suffix}")

        if suffix == '.toml':
            # Serialise first so a TypeError cannot leave a truncated file.
            payload = "\n".join(_dump_toml(config)) + "\n"
        elif suffix in self._YAML_SUFFIXES:
            self._require_yaml()
            payload = None
        else:
            payload = None

        self.config_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.config_file, 'w', encoding='utf-8') as f:
            if suffix == '.json':
                json.dump(config, f, indent=2, ensure_ascii=False)
            elif suffix in self._YAML_SUFFIXES:
                yaml.safe_dump(config, f, allow_unicode=True)
            else:
                f.write(payload)

    def get(self, key: str, default: Any = None) -> Any:
        """Return the value at a dotted path such as ``'crawler.max_pages'``.

        ``default`` is returned when any path segment is missing or when an
        intermediate value is not a dict.
        """
        value = self.config
        for part in key.split('.'):
            if isinstance(value, dict) and part in value:
                value = value[part]
            else:
                return default
        return value

4. 使用示例和高级功能

python

"""Advanced usage examples and feature extensions.

NOTE(review): this listing uses CrawlerConfig, AsyncWallpaperDownloader,
AdvancedWallpaperCrawler and ImageIntelligence without importing them. They
must already be in scope (same module, or imported from wherever they are
defined); confirm against the actual project layout.
"""
import argparse
import asyncio
import colorsys
import re
from concurrent.futures import ProcessPoolExecutor
from urllib.parse import quote_plus


def _parse_resolution(text: str) -> tuple:
    """Parse ``'WIDTHxHEIGHT'`` (separator is case-insensitive) into ints.

    Raises:
        ValueError: If the text is not two positive integers joined by 'x'.
    """
    match = re.fullmatch(r"\s*(\d+)\s*[xX]\s*(\d+)\s*", text)
    if not match:
        raise ValueError(f"无效的分辨率格式: {text!r} (应为 宽x高, 例如 1920x1080)")
    width, height = int(match.group(1)), int(match.group(2))
    if width <= 0 or height <= 0:
        raise ValueError(f"分辨率必须为正数: {text!r}")
    return width, height


def _safe_dir_name(keyword: str) -> str:
    """Turn a search keyword into a single safe path component."""
    # Collapse anything that is not a word character or '-' so the keyword
    # cannot introduce path separators or '..' segments.
    cleaned = re.sub(r"[^\w\-]+", "_", keyword).strip("_")
    return cleaned or "keyword"


async def advanced_crawler_demo():
    """Run the advanced crawler with a 4K minimum resolution."""
    config = CrawlerConfig(
        max_pages=5,
        concurrent_requests=10,
        save_dir="4k_wallpapers",
        min_resolution=(3840, 2160),  # 4K
        enable_duplicate_check=True,
    )
    crawler = AdvancedWallpaperCrawler(config)

    # NOTE(review): this filter is defined but never passed to the crawler;
    # nothing in this listing registers it, so the run below is NOT restricted
    # to dark images.
    def custom_filter(image_data: bytes) -> bool:
        """Return True when the image's dominant colours are dark on average."""
        colors = ImageIntelligence.extract_dominant_colors(image_data, 3)
        if not colors:
            return False  # nothing to measure; also avoids dividing by zero
        brightness_sum = 0
        for color in colors:
            r, g, b = color['rgb']
            _, lightness, _ = colorsys.rgb_to_hls(r / 255, g / 255, b / 255)
            brightness_sum += lightness
        avg_brightness = brightness_sum / len(colors)
        # Prefer dark themes (lightness < 0.5).
        return avg_brightness < 0.5

    print("开始爬取4K黑暗主题壁纸...")
    await crawler.run()
    print("爬取完成！")


def setup_argparse():
    """Define the command-line options and return the parsed arguments."""
    parser = argparse.ArgumentParser(description='高清壁纸爬虫')
    parser.add_argument('--pages', type=int, default=10, help='爬取页面数')
    parser.add_argument('--concurrent', type=int, default=20, help='并发请求数')
    parser.add_argument('--resolution', type=str, default='1920x1080',
                        help='最小分辨率 (格式: 宽x高)')
    parser.add_argument('--category', type=str, default='general',
                        choices=['general', 'anime', 'people', 'all'],
                        help='壁纸类别')
    parser.add_argument('--purity', type=str, default='sfw',
                        choices=['sfw', 'sketchy', 'nsfw', 'all'],
                        help='内容纯度')
    parser.add_argument('--output', type=str, default='wallpapers', help='保存目录')
    return parser.parse_args()


async def main_with_args():
    """Build a crawler configuration from the command line and run it.

    Raises:
        ValueError: If ``--resolution`` is not of the form WIDTHxHEIGHT.
    """
    args = setup_argparse()
    width, height = _parse_resolution(args.resolution)

    # Map the named category to the site's positional flag string.
    category_map = {
        'general': '100',
        'anime': '010',
        'people': '001',
        'all': '111',
    }
    # Map the named purity level to the site's positional flag string.
    purity_map = {
        'sfw': '100',
        'sketchy': '010',
        'nsfw': '001',
        'all': '111',
    }

    config = CrawlerConfig(
        max_pages=args.pages,
        concurrent_requests=args.concurrent,
        save_dir=args.output,
        min_resolution=(width, height),
        categories=category_map.get(args.category, '111'),
        purity=purity_map.get(args.purity, '100'),
    )
    crawler = AsyncWallpaperDownloader(config)
    await crawler.run()


class BatchWallpaperDownloader:
    """Helpers that run several crawls as a batch."""

    @staticmethod
    async def download_by_keywords(keywords: list):
        """Run one crawler per keyword, all concurrently.

        Each keyword is URL-encoded into the search URL and saved under
        ``wallpapers/<sanitised keyword>``.
        """
        tasks = []
        for keyword in keywords:
            config = CrawlerConfig(
                search_url=f"https://wallhaven.cc/search?q={quote_plus(keyword)}",
                max_pages=3,
                save_dir=f"wallpapers/{_safe_dir_name(keyword)}",
            )
            crawler = AsyncWallpaperDownloader(config)
            tasks.append(crawler.run())
        await asyncio.gather(*tasks)

    @staticmethod
    async def download_top_wallpapers():
        """Download from the top list, sorted descending, 10 pages."""
        config = CrawlerConfig(
            sorting="toplist",
            order="desc",
            max_pages=10,
            save_dir="top_wallpapers",
        )
        crawler = AsyncWallpaperDownloader(config)
        await crawler.run()


if __name__ == "__main__":
    # Other entry points defined in these listings: main() for basic usage
    # and main_with_args() for command-line usage.
    asyncio.run(advanced_crawler_demo())

技术深度解析

1. 异步编程的优势

本爬虫使用asyncio和aiohttp实现真正的异步IO，相比传统同步请求有显著优势：

高并发：可同时处理数十个请求而不阻塞
资源高效：单线程即可实现高并发，内存占用低
速度快：I/O等待时间被充分利用，下载速度提升5-10倍

2. 智能去重算法

采用多种去重策略确保图片唯一性：

感知哈希：示例代码中的 calculate_image_hash 实现的是 8×8 均值哈希（aHash，感知哈希的一种简化形式），并按哈希值完全相等来判定重复
MD5校验：精确匹配相同文件
分辨率过滤：确保最低质量要求
元数据比较：EXIF信息比对（示例代码未实现，可作为扩展方向）

3. 反爬虫对抗策略

动态User-Agent：模拟不同浏览器
请求延迟随机化：模拟人类行为
代理IP轮换：避免IP被封
请求频率控制：自适应限流算法

4. 错误处理与恢复

指数退避重试：网络错误时自动重试
断点续传：记录下载进度（示例代码仅将图像哈希持久化到 cache/image_hashes.json，用于跨次运行跳过已下载的图片，并未实现真正的断点续传）
异常隔离：单任务失败不影响整体

最佳实践建议

遵守robots.txt：尊重网站的爬虫协议
控制请求频率：避免对目标网站造成压力
尊重版权：仅用于个人使用，不用于商业用途
数据备份：定期备份已下载的壁纸
监控与日志：记录爬虫运行状态，便于调试

扩展功能方向

机器学习分类：使用CNN自动分类壁纸主题
颜色主题分析：根据桌面环境推荐匹配壁纸
自动换壁纸：集成到操作系统定时更换
质量评分系统：基于多个维度评分壁纸质量
跨平台支持：适配Windows/macOS/Linux

总结

本文详细介绍了如何构建一个功能完整的高清壁纸爬虫系统。通过采用最新的异步编程、智能图像处理和反爬虫技术，我们实现了一个高效、稳定、智能的壁纸下载工具。这个爬虫不仅技术先进，而且具有很好的扩展性，可以根据需要添加更多高级功能。

Python爬虫实战：利用异步技术与图像识别智能下载高清壁纸

前言：壁纸下载的现代爬虫解决方案

技术栈亮点

项目结构设计

完整代码实现

1. 主爬虫类 - 异步高清壁纸下载器

2. 智能图像处理模块

3. 配置文件管理

4. 使用示例和高级功能

技术深度解析

1. 异步编程的优势

2. 智能去重算法

3. 反爬虫对抗策略

4. 错误处理与恢复

最佳实践建议

扩展功能方向

总结

Sonic数字人min_resolution设置为1024时的1080P输出实测效果

Sonic数字人模型GitHub镜像下载地址及安装步骤说明

Sonic数字人能否接入微信公众号？API对接可行性分析

Sonic数字人生成的视频是否支持字幕叠加？后期处理建议

宏智树AI：让问卷设计从“开盲盒”到“精准导航”的科研革命

课程论文≠小号毕业论文：宏智树AI如何用“轻科研”模式，让每周作业变身学术训练场？