import time
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import difflib
import hashlib

def get_rendered_text(url):
    """Render *url* in headless Chromium and return its normalized text.

    The page is loaded with Playwright, waiting until the network is idle so
    that client-side JavaScript has had a chance to run. The rendered HTML is
    then parsed with BeautifulSoup, non-content elements are removed, and the
    remaining text is whitespace-normalized.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with every run of whitespace collapsed to one space.

    Raises:
        Any exception raised by Playwright during launch or navigation is
        propagated; the browser is closed first.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            print(f"正在抓取: {url}")
            # Wait for the network to go idle so scripts finish rendering.
            page.goto(url, wait_until="networkidle")

            # Full HTML of the page after rendering.
            html_content = page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            browser.close()

    # Parsing needs only the HTML string, so it runs after the browser
    # session has been released.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop elements that carry no page-specific content (scripts, styles,
    # and shared page chrome such as header/footer/nav).
    for tag in soup(['script', 'style', 'noscript', 'template', 'header', 'footer', 'nav']):
        tag.decompose()

    # Extract plain text and collapse all whitespace runs to single spaces.
    text = soup.get_text(separator=' ')
    return ' '.join(text.split())

def compare_texts(t1, t2):
    """比對兩段文字並顯示差異"""
    h1 = hashlib.sha256(t1.encode('utf-8')).hexdigest()
    h2 = hashlib.sha256(t2.encode('utf-8')).hexdigest()
    
    print("\n" + "="*50)
    print(f"URL 1 Hash: {h1[:16]}...")
    print(f"URL 2 Hash: {h2[:16]}...")
    print("="*50)
    
    if h1 == h2:
        print("✅ Playwright 渲染後內容完全一致！")
    else:
        print("❌ 內容仍然不一致！正在分析差異...\n")
        
        # 顯示差異點
        d = difflib.Differ()
        diff = d.compare(t1.split(), t2.split())
        
        print("差異明細 ( - 代表 URL1 特有, + 代表 URL2 特有):")
        diff_found = False
        for line in diff:
            if line.startswith('+ ') or line.startswith('- '):
                print(line)
                diff_found = True
        
        if not diff_found:
            print("警告：文字看似相同但 Hash 不同，可能是隱藏字元或編碼問題。")

# Target URLs for the test run: the script entry point fetches the first two
# entries and compares their rendered text. The two URLs differ only in the
# `c…` segment of the path.
target_urls = [
    "https://alumni.ncku.edu.tw/p/405-1004-184341,c425.php?Lang=zh-tw",
    "https://alumni.ncku.edu.tw/p/405-1004-184341,c15390.php?Lang=zh-tw"
]

if __name__ == "__main__":
    # Fetch both pages in order, dump their extracted text, then diff them.
    content1 = get_rendered_text(target_urls[0])
    content2 = get_rendered_text(target_urls[1])

    print(
        f"[content1]={content1}",
        "====================",
        f"[content2]={content2}",
        sep="\n",
    )

    compare_texts(content1, content2)
