添加邮件解码工具 decode_email.py，用于解析和显示.eml文件内容

2025-02-26 11:41:37 +08:00
parent e5ffc6310d
commit d523609b12
1 changed files with 265 additions and 0 deletions
--- a/decode_email.py
+++ b/decode_email.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+邮件解码工具
+用于解析.eml文件并显示可读的邮件内容
+"""
+
+import base64
+import sys
+import re
+import email
+from email import policy
+import os
+import argparse
+import html
+
+# Regexes tried in order against the HTML body; the first one that matches wins.
+_HTML_CODE_PATTERNS = (
+    r'letter-spacing:\s*\d+px[^>]*>([^<]+)<',  # codes are usually styled with letter-spacing
+    r'<div[^>]*>(\d{4,8})</div>',              # bare digits inside a <div>
+    r'验证码[：:]\s*([A-Z0-9]{4,8})',           # code introduced by the Chinese label
+    r'code[^\d]+(\d{4,8})',                    # code introduced by the English label
+    r'\b([A-Z0-9]{6})\b',                      # any 6-character upper-case/digit token
+)
+
+# Regexes tried in order against the plain-text body.
+_TEXT_CODE_PATTERNS = (
+    r'验证码[：:]\s*([A-Z0-9]{4,8})',           # Chinese label
+    r'code[^\d]+(\d{4,8})',                    # English label
+    r'\b(\d{6})\b',                            # any standalone 6-digit number
+)
+
+
+def _part_to_text(part, label=None):
+    """Return the content of one message part as ``str`` ("" if unavailable).
+
+    ``label`` is the name used in the error line printed when
+    ``get_content()`` fails; pass ``None`` to fall back silently.
+    """
+    try:
+        content = part.get_content()
+    except Exception as e:  # best effort: fall back to the raw payload below
+        if label:
+            print(f"解析{label}出错: {str(e)}")
+        content = part.get_payload(decode=True)
+
+    if isinstance(content, str):
+        return content
+    if isinstance(content, (bytes, bytearray)):
+        # Non-text parts come back as bytes; decode them so the regex
+        # searches further down always operate on str.
+        charset = part.get_content_charset() or 'utf-8'
+        try:
+            return content.decode(charset, errors='replace')
+        except LookupError:  # charset name unknown to Python
+            return content.decode('utf-8', errors='replace')
+    return ""
+
+
+def _extract_bodies(msg):
+    """Return ``(plain_text, html_text)`` for ``msg``; a missing body is ""."""
+    if not msg.is_multipart():
+        content_type = msg.get_content_type()
+        if content_type == "text/html":
+            return "", _part_to_text(msg, "邮件内容")
+        if content_type == "text/plain":
+            return _part_to_text(msg, "邮件内容"), ""
+        print(f"未知内容类型: {content_type}")
+        # Try to treat anything else as plain text.
+        return _part_to_text(msg), ""
+
+    body_text = ""
+    body_html = ""
+    # walk() descends into nested containers (e.g. multipart/alternative
+    # inside multipart/mixed), which a top-level-only scan would miss.
+    for part in msg.walk():
+        if part.is_multipart() or part.is_attachment():
+            continue
+        content_type = part.get_content_type()
+        # The first non-attachment part of each type is taken as the body.
+        if content_type == "text/plain" and not body_text:
+            body_text = _part_to_text(part, "纯文本内容")
+        elif content_type == "text/html" and not body_html:
+            body_html = _part_to_text(part, "HTML内容")
+    return body_text, body_html
+
+
+def _first_capture(patterns, text):
+    """Return the stripped first group of the first matching pattern, else None."""
+    for pattern in patterns:
+        match = re.search(pattern, text)
+        if match:
+            return match.group(1).strip()
+    return None
+
+
+def _find_verification_link(body_text, body_html):
+    """Return the first verify/confirm/activate link found, HTML body first."""
+    if body_html:
+        link_match = re.search(r'href=[\'"]([^\'"]*(?:verify|confirm|activate)[^\'"]*)[\'"]', body_html)
+        if link_match:
+            # Attribute values are HTML-escaped (e.g. "&amp;"); unescape so the
+            # printed URL is usable as-is.
+            return html.unescape(link_match.group(1))
+    if body_text:
+        link_match = re.search(r'https?://\S+?(?:verify|confirm|activate)\S+', body_text)
+        if link_match:
+            return link_match.group(0)
+    return None
+
+
+def decode_eml_file(filename):
+    """Parse an .eml file and print its headers, bodies and extracted data.
+
+    Prints the Subject/From/To/Date headers, the plain-text body, the first
+    500 characters of the HTML body, and any verification code or
+    verification link that the heuristics can find.
+
+    Args:
+        filename: Path of the .eml file to read.
+
+    Returns:
+        True if the file was read and parsed, False if it does not exist or
+        could not be read.
+    """
+    print(f"解析邮件文件: {filename}")
+
+    if not os.path.exists(filename):
+        print(f"错误: 文件不存在 {filename}")
+        return False
+
+    # Read the message as bytes: decoding the whole file as UTF-8 up front
+    # destroys 8-bit bodies that declare another charset (GBK, Latin-1, ...).
+    try:
+        with open(filename, 'rb') as f:
+            msg = email.message_from_binary_file(f, policy=policy.default)
+    except Exception as e:
+        print(f"读取文件错误: {str(e)}")
+        return False
+
+    # Header summary.
+    print("\n===== 邮件头信息 =====")
+    print(f"主题: {msg.get('Subject', '无主题')}")
+    print(f"发件人: {msg.get('From', '未知')}")
+    print(f"收件人: {msg.get('To', '未知')}")
+    print(f"日期: {msg.get('Date', '未知')}")
+
+    # Message bodies.
+    print("\n===== 邮件内容 =====")
+    body_text, body_html = _extract_bodies(msg)
+
+    if body_text:
+        print("\n----- 纯文本内容 -----")
+        print(body_text)
+
+    if body_html:
+        print("\n----- HTML内容摘要 -----")
+        # HTML is usually long, so only the first 500 characters are shown.
+        print((body_html[:500] + "...") if len(body_html) > 500 else body_html)
+
+    # Verification code: HTML body first, then the plain-text body.
+    verification_code = None
+    if body_html:
+        verification_code = _first_capture(_HTML_CODE_PATTERNS, body_html)
+    if not verification_code and body_text:
+        verification_code = _first_capture(_TEXT_CODE_PATTERNS, body_text)
+
+    if verification_code:
+        print("\n===== 提取结果 =====")
+        print(f"验证码: {verification_code}")
+
+    verification_link = _find_verification_link(body_text, body_html)
+    if verification_link:
+        print(f"验证链接: {verification_link}")
+
+    return True
+
+def decode_all_emails(directory, limit=5):
+    """Decode the most recently modified .eml files found under a directory.
+
+    The directory is searched recursively. Files are ordered newest first by
+    modification time and each one is passed to ``decode_eml_file``.
+
+    Args:
+        directory: Root directory to scan.
+        limit: Maximum number of files to decode (non-negative int), or
+            ``None`` to decode every file found. Defaults to 5.
+    """
+    print(f"扫描目录: {directory}")
+
+    # Collect (mtime, path) pairs so the timestamp is read exactly once.
+    dated_files = []
+    for root, _dirs, files in os.walk(directory):
+        for file in files:
+            # Match the extension case-insensitively so ".EML" is included.
+            if not file.lower().endswith(".eml"):
+                continue
+            path = os.path.join(root, file)
+            try:
+                dated_files.append((os.path.getmtime(path), path))
+            except OSError:
+                # Broken symlink or file removed during the scan: skip it
+                # instead of aborting the whole run.
+                continue
+
+    # Newest first.
+    dated_files.sort(key=lambda item: item[0], reverse=True)
+    email_files = [path for _mtime, path in dated_files]
+
+    print(f"找到 {len(email_files)} 个邮件文件")
+
+    # Too many files: keep only the newest ones.
+    if limit is not None and len(email_files) > limit:
+        print(f"只显示最新的{limit}封邮件")
+        email_files = email_files[:limit]
+
+    for i, email_file in enumerate(email_files, 1):
+        print(f"\n\n======= 邮件 {i}/{len(email_files)} =======")
+        print(f"文件: {email_file}")
+        decode_eml_file(email_file)
+
+def _email_data_dir():
+    """Return the ``email_data`` path to use.
+
+    The relative path is preferred when it exists; otherwise the path under
+    the current working directory is returned (whether or not it exists).
+    """
+    relative = 'email_data'
+    if os.path.exists(relative):
+        return relative
+    return os.path.join(os.getcwd(), 'email_data')
+
+
+def main():
+    """Command-line entry point; returns the process exit status (0 or 1).
+
+    Modes, in order of precedence:
+      * ``--all``: decode the files under the ``email_data`` directory.
+      * ``path`` given: decode that file, or the files under that directory.
+      * no arguments: decode the newest .eml file under ``email_data``.
+    """
+    cli = argparse.ArgumentParser(description='邮件解码工具')
+    cli.add_argument('path', nargs='?', help='邮件文件路径或目录路径')
+    cli.add_argument('--all', action='store_true', help='解码所有找到的邮件')
+    options = cli.parse_args()
+
+    target = options.path
+
+    # An explicit path is honoured only when --all is absent.
+    if not options.all and target:
+        if os.path.isfile(target):
+            return 0 if decode_eml_file(target) else 1
+        if os.path.isdir(target):
+            decode_all_emails(target)
+            return 0
+        print(f"错误: 路径不存在 {target}")
+        return 1
+
+    # Both remaining modes work on the email_data directory.
+    data_dir = _email_data_dir()
+    if not os.path.exists(data_dir):
+        print(f"错误: 找不到邮件数据目录 {data_dir}")
+        if not options.all:
+            print("请指定邮件文件路径: python decode_email.py <eml文件路径>")
+        return 1
+
+    if options.all:
+        decode_all_emails(data_dir)
+        return 0
+
+    # No arguments: pick the .eml file with the greatest modification time.
+    newest_path = None
+    newest_mtime = 0
+    for folder, _subdirs, names in os.walk(data_dir):
+        for name in names:
+            if not name.endswith(".eml"):
+                continue
+            candidate = os.path.join(folder, name)
+            stamp = os.path.getmtime(candidate)
+            if stamp > newest_mtime:
+                newest_path, newest_mtime = candidate, stamp
+
+    if not newest_path:
+        print(f"找不到任何.eml文件在 {data_dir}")
+        return 1
+
+    print(f"解析最新的邮件文件: {newest_path}")
+    return 0 if decode_eml_file(newest_path) else 1
+
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())