import requests
from bs4 import BeautifulSoup
import hashlib
import difflib

def get_page_text(url):
    """Fetch *url* and return its text content with whitespace normalized.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout, decoded as UTF-8, stripped of script/style/noscript/template/
    header/footer/nav elements, and flattened so that every run of
    whitespace becomes a single space.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned page text. If the request fails, the server answers
        with an HTTP error status, or parsing raises, a string of the form
        ``"Error: <details>"`` is returned instead of raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures so that an error page is never
        # mistaken for the real content and hashed/compared as such.
        response.raise_for_status()
        response.encoding = 'utf-8'  # force UTF-8 to avoid garbled characters
        soup = BeautifulSoup(response.text, 'html.parser')

        # Mimics the current _extract_text_content logic:
        # remove the basic noise tags.
        for tag in soup(['script', 'style', 'noscript', 'template', 'header', 'footer', 'nav']):
            tag.decompose()

        # Extract the plain text.
        text = soup.get_text(separator=' ')
        # Collapse redundant whitespace.
        clean_text = ' '.join(text.split())
        return clean_text
    except Exception as e:
        # Best-effort contract: failures are reported as a string, not raised.
        return f"Error: {e}"

def calculate_hash(text):
    """Return the SHA-256 hex digest of *text* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

# Test URLs
url1 = "https://alumni.ncku.edu.tw/p/405-1004-184341,c425.php?Lang=zh-tw"
url2 = "https://alumni.ncku.edu.tw/p/405-1004-184341,c15390.php?Lang=zh-tw"

print("正在抓取內容並比對...")
text1 = get_page_text(url1)
text2 = get_page_text(url2)

# Dump the extracted text of both pages together with its length.
print("-" * 50)
print(f"URL 1 text: {text1}... (長度: {len(text1)})")
print("-" * 50)
# The label names what is actually printed here: the text of URL 2.
print(f"URL 2 text: {text2}... (長度: {len(text2)})")
print("-" * 50)

hash1 = calculate_hash(text1)
hash2 = calculate_hash(text2)

# Only the first 16 hex characters of each digest are shown; the equality
# check below still compares the full digests.
print("-" * 50)
print(f"URL 1 Hash: {hash1[:16]}... (長度: {len(text1)})")
print(f"URL 2 Hash: {hash2[:16]}... (長度: {len(text2)})")
print("-" * 50)

if hash1 == hash2:
    print("✅ 恭喜！兩者內容完全一致。")
else:
    print("❌ Hash 不同！正在分析差異原因...\n")

    # Use difflib to locate the differences. The full texts are compared,
    # token by token (tokens are the whitespace-separated words).
    d = difflib.Differ()
    diff = d.compare(text1.split(), text2.split())

    print("找出不一致的文字 ( - 代表 URL1 有但 URL2 沒有, + 代表 URL2 有但 URL1 沒有):")
    max_diffs_shown = 20  # show at most this many differing tokens
    diff_count = 0
    for line in diff:
        # Skip unchanged tokens ('  ') and Differ hint lines ('? ').
        if not line.startswith(('+ ', '- ')):
            continue
        # Reaching another difference after the limit proves that more
        # remain, so the notice is only printed when it is true.
        if diff_count >= max_diffs_shown:
            print("... (還有更多差異)")
            break
        print(line)
        diff_count += 1

print("-" * 50)
