This commit is contained in:
huangzhenpc
2025-07-18 10:08:38 +08:00
parent 1d4f6f8c33
commit 9f9f44ecc7
4 changed files with 1700 additions and 7 deletions

360
real_user_database.py Normal file
View File

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
真实用户行为数据库
包含真实的用户代理、浏览器指纹、访问模式等数据
"""
import random
import json
from datetime import datetime
import time
class RealUserDatabase:
    """In-memory catalogue of browser profile data and visit-behavior presets.

    The instance holds plain lists/dicts (user agents, screen resolutions,
    OS/browser weights, languages, time zones, time-of-day visit patterns)
    and offers helpers that sample from them to build profiles, HTTP
    headers, timing values and session records.
    """

    def __init__(self):
        """Populate the instance with the built-in sample data sets."""
        # User-agent strings, grouped by browser family.
        self.user_agents = [
            # Chrome on Windows
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            # Chrome on Mac
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            # Firefox
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/120.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0",
            # Safari
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15",
            # Edge
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
            "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
            # Mobile
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 14; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
        ]
        # Screen resolutions as (width, height) tuples.
        self.screen_resolutions = [
            (1920, 1080),  # most common
            (1366, 768),   # common on laptops
            (1440, 900),   # MacBook
            (1536, 864),   # high DPI
            (2560, 1440),  # 2K monitor
            (1600, 900),   # widescreen
            (1280, 720),   # HD
            (3840, 2160),  # 4K monitor
            (2560, 1600),  # 16:10
            (1920, 1200),  # 16:10
        ]
        # Operating systems with relative selection weights.
        self.operating_systems = [
            {"name": "Windows 10", "weight": 0.4},
            {"name": "Windows 11", "weight": 0.25},
            {"name": "macOS", "weight": 0.15},
            {"name": "Linux", "weight": 0.1},
            {"name": "Android", "weight": 0.07},
            {"name": "iOS", "weight": 0.03},
        ]
        # Browsers with relative selection weights.
        self.browser_distribution = [
            {"name": "Chrome", "weight": 0.65},
            {"name": "Firefox", "weight": 0.15},
            {"name": "Safari", "weight": 0.12},
            {"name": "Edge", "weight": 0.06},
            {"name": "Other", "weight": 0.02},
        ]
        # Accept-Language header values.
        self.languages = [
            "zh-CN,zh;q=0.9,en;q=0.8",  # Chinese-speaking users
            "en-US,en;q=0.9",           # English-speaking users
            "en-GB,en;q=0.9,en-US;q=0.8",
            "zh-TW,zh;q=0.9,en;q=0.8",
            "ja-JP,ja;q=0.9,en;q=0.8",
            "ko-KR,ko;q=0.9,en;q=0.8",
        ]
        # IANA time zone names.
        self.timezones = [
            "Asia/Shanghai",
            "Asia/Tokyo",
            "America/New_York",
            "Europe/London",
            "America/Los_Angeles",
            "Europe/Berlin",
            "Asia/Seoul",
            "Australia/Sydney",
        ]
        # Visit patterns keyed by time-of-day label. The keys are runtime
        # values (returned as "pattern_type"), so they are kept verbatim.
        # Hour 18 is not listed under any pattern.
        self.visit_patterns = {
            "工作时间": {
                "hours": list(range(9, 18)),
                "stay_time_multiplier": 0.8,  # shorter stays during work hours
                "scroll_frequency": 1.2,      # more frequent scrolling
            },
            "休闲时间": {
                "hours": list(range(19, 23)) + list(range(6, 9)),
                "stay_time_multiplier": 1.5,  # longer stays during leisure time
                "scroll_frequency": 0.8,      # slower scrolling
            },
            "深夜": {
                "hours": list(range(0, 6)) + [23],
                "stay_time_multiplier": 2.0,  # very long stays late at night
                "scroll_frequency": 0.6,      # very slow scrolling
            },
        }

    def get_random_user_profile(self):
        """Build a random profile dict sampled from the instance data.

        Returns:
            dict with keys ``user_agent``, ``operating_system``,
            ``screen_resolution``, ``viewport_size``, ``language``,
            ``timezone``, ``color_depth``, ``platform``, ``cookie_enabled``,
            ``java_enabled``, ``hardware_concurrency``, ``device_memory``
            and ``connection_type``.
        """
        os_choice = self._weighted_choice(self.operating_systems)

        # Substring that a user agent must contain to match the chosen OS.
        if "Windows" in os_choice:
            ua_marker = "Windows NT"
        elif "macOS" in os_choice:
            ua_marker = "Macintosh"
        elif "Android" in os_choice:
            ua_marker = "Android"
        elif "iOS" in os_choice:
            ua_marker = "iPhone"
        else:
            ua_marker = "X11; Linux"

        # NOTE(review): self.user_agents has no "X11; Linux" entry, so a
        # "Linux" pick always takes this fallback and can be paired with a
        # user agent (and derived platform) of a different OS.
        ua_candidates = [ua for ua in self.user_agents if ua_marker in ua]
        if not ua_candidates:
            ua_candidates = self.user_agents
        user_agent = random.choice(ua_candidates)

        resolution = random.choice(self.screen_resolutions)
        language = random.choice(self.languages)
        timezone = random.choice(self.timezones)

        return {
            "user_agent": user_agent,
            "operating_system": os_choice,
            "screen_resolution": resolution,
            # Viewport is the screen size minus a random margin.
            "viewport_size": (
                resolution[0] - random.randint(0, 100),
                resolution[1] - random.randint(100, 200),
            ),
            "language": language,
            "timezone": timezone,
            "color_depth": random.choice([24, 32]),
            "platform": self._extract_platform(user_agent),
            "cookie_enabled": True,
            "java_enabled": random.choice([True, False]),
            "hardware_concurrency": random.choice([2, 4, 8, 12, 16]),
            "device_memory": random.choice([2, 4, 8, 16, 32]),
            "connection_type": random.choice(["wifi", "ethernet", "cellular"]),
        }

    def get_realistic_headers(self, profile=None, referrer=None):
        """Build an HTTP request-header dict for a profile.

        Args:
            profile: dict providing ``user_agent``, ``language`` and (for
                Chrome-based user agents) ``platform``. A falsy value makes
                the method generate a fresh random profile.
            referrer: optional URL. When truthy it is sent as ``Referer`` and
                ``Sec-Fetch-Site`` becomes ``cross-site`` instead of ``none``.

        Returns:
            dict mapping header names to string values.
        """
        if not profile:
            profile = self.get_random_user_profile()

        user_agent = profile["user_agent"]
        headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": profile["language"],
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site" if referrer else "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
            "DNT": str(random.randint(0, 1)),  # Do Not Track
        }
        if referrer:
            # Sec-Fetch-Site was already set above; only Referer is missing.
            headers["Referer"] = referrer

        # Client-hint headers are added for any user agent containing "Chrome".
        if "Chrome" in user_agent:
            headers["sec-ch-ua"] = self._generate_chrome_sec_ch_ua(user_agent)
            headers["sec-ch-ua-mobile"] = "?1" if "Mobile" in user_agent else "?0"
            headers["sec-ch-ua-platform"] = f'"{profile["platform"]}"'
        return headers

    def get_visit_behavior(self, current_hour=None):
        """Return the visit-behavior parameters for an hour of the day.

        Args:
            current_hour: hour (0-23) used to pick the pattern. Defaults to
                the local hour from ``datetime.now()``.

        Returns:
            dict with ``pattern_type``, ``stay_time_multiplier``,
            ``scroll_frequency``, ``reading_speed`` (random, 200-400) and
            ``interaction_probability`` (random, 0.3-0.8).
        """
        if current_hour is None:
            current_hour = datetime.now().hour

        # Hours not listed under any pattern (e.g. 18) fall back to the
        # work-hours pattern.
        pattern_type = next(
            (
                name
                for name, data in self.visit_patterns.items()
                if current_hour in data["hours"]
            ),
            "工作时间",
        )
        pattern = self.visit_patterns[pattern_type]
        return {
            "pattern_type": pattern_type,
            "stay_time_multiplier": pattern["stay_time_multiplier"],
            "scroll_frequency": pattern["scroll_frequency"],
            "reading_speed": random.uniform(200, 400),  # words per minute
            "interaction_probability": random.uniform(0.3, 0.8),
        }

    def get_realistic_timing(self, base_time, behavior=None):
        """Scale ``base_time`` by the behavior multiplier and random jitter.

        Args:
            base_time: base duration in seconds.
            behavior: dict with a ``stay_time_multiplier`` entry; a falsy
                value makes the method call ``get_visit_behavior()``.

        Returns:
            The adjusted duration, never less than 1.0.
        """
        if not behavior:
            behavior = self.get_visit_behavior()
        adjusted_time = base_time * behavior["stay_time_multiplier"]
        # Random factor in [0.7, 1.5] so repeated calls do not repeat exactly.
        final_time = adjusted_time * random.uniform(0.7, 1.5)
        return max(final_time, 1.0)  # at least one second

    def simulate_human_delays(self, action_type="normal"):
        """Return a random delay in seconds for the given action type.

        Unknown action types use the ``"normal"`` range.
        """
        delays = {
            "thinking": (2, 8),     # thinking pause
            "reading": (3, 15),     # reading time
            "scrolling": (0.5, 2),  # interval between scrolls
            "clicking": (0.8, 3),   # interval between clicks
            "typing": (0.1, 0.5),   # interval between keystrokes
            "normal": (1, 4),       # any other action
        }
        min_delay, max_delay = delays.get(action_type, delays["normal"])
        return random.uniform(min_delay, max_delay)

    def _weighted_choice(self, choices):
        """Pick one ``name`` from dicts carrying ``name`` and ``weight`` keys.

        Entries with weight 0 are never selected.

        Raises:
            IndexError: if ``choices`` is empty.
            ValueError: if the weights sum to zero.
        """
        weights = [choice["weight"] for choice in choices]
        return random.choices(choices, weights=weights, k=1)[0]["name"]

    def _extract_platform(self, user_agent):
        """Map a user-agent string to a platform name.

        Returns one of ``"Windows"``, ``"macOS"``, ``"Linux"``,
        ``"Android"``, ``"iOS"`` or ``"Unknown"``.
        """
        # Checked in order; "Android" user agents contain "Linux" but not
        # "X11; Linux", and iPhone user agents do not contain "Macintosh".
        markers = (
            ("Windows NT", "Windows"),
            ("Macintosh", "macOS"),
            ("X11; Linux", "Linux"),
            ("Android", "Android"),
            ("iPhone", "iOS"),
        )
        for marker, platform in markers:
            if marker in user_agent:
                return platform
        return "Unknown"

    def _generate_chrome_sec_ch_ua(self, user_agent):
        """Build the ``sec-ch-ua`` header value for a Chrome user agent.

        The major version is taken from the text after ``Chrome/``; when the
        token is absent, version ``120`` is used.
        """
        version = "120"
        if "Chrome/" in user_agent:
            version = user_agent.split("Chrome/")[1].split(".")[0]
        return f'"Not_A Brand";v="8", "Chromium";v="{version}", "Google Chrome";v="{version}"'

    def generate_session_data(self):
        """Build a session record: profile, behavior, id, start time, fingerprint."""
        profile = self.get_random_user_profile()
        behavior = self.get_visit_behavior()
        return {
            "profile": profile,
            "behavior": behavior,
            # Epoch seconds plus a random 4-digit suffix.
            "session_id": f"session_{int(time.time())}_{random.randint(1000, 9999)}",
            "start_time": datetime.now().isoformat(),
            "fingerprint": self._generate_browser_fingerprint(profile),
        }

    def _generate_browser_fingerprint(self, profile):
        """Arrange profile fields into ``screen`` / ``navigator`` sections.

        Args:
            profile: dict as returned by ``get_random_user_profile``.

        Returns:
            dict with ``screen``, ``navigator``, ``timezone`` and a randomly
            chosen ``webgl_vendor``.
        """
        width, height = profile["screen_resolution"][0], profile["screen_resolution"][1]
        language_parts = profile["language"].split(",")
        return {
            "screen": {
                "width": width,
                "height": height,
                "colorDepth": profile["color_depth"],
                "pixelDepth": profile["color_depth"],
            },
            "navigator": {
                "userAgent": profile["user_agent"],
                "language": language_parts[0],
                "languages": language_parts,
                "platform": profile["platform"],
                "cookieEnabled": profile["cookie_enabled"],
                "javaEnabled": profile["java_enabled"],
                "hardwareConcurrency": profile["hardware_concurrency"],
                "deviceMemory": profile["device_memory"],
            },
            "timezone": profile["timezone"],
            "webgl_vendor": random.choice([
                "Google Inc. (Intel)",
                "Google Inc. (NVIDIA)",
                "Google Inc. (AMD)",
                "Apple Inc.",
            ]),
        }
def main():
    """Print a sample profile, headers, visit behavior and session record."""
    db = RealUserDatabase()

    # Generate a user profile.
    profile = db.get_random_user_profile()
    print("用户配置:")
    print(json.dumps(profile, indent=2, ensure_ascii=False))

    # Generate HTTP headers for that profile.
    headers = db.get_realistic_headers(profile)
    print("\nHTTP头部:")
    for key, value in headers.items():
        print(f"{key}: {value}")

    # Generate the time-of-day visit behavior.
    behavior = db.get_visit_behavior()
    print(f"\n访问行为: {behavior}")

    # Generate a complete session record.
    session = db.generate_session_data()
    print("\n会话数据:")
    print(json.dumps(session, indent=2, ensure_ascii=False))


# Usage example
if __name__ == "__main__":
    main()