#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
查找包含特定URL的源页面（使用Playwright支持JavaScript动态内容）
"""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib3
import asyncio

# Suppress the InsecureRequestWarning spam caused by verify=False below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Relative URL fragment to search for inside anchor hrefs.
TARGET_URL = "p/412-1213-20090.php"
# Page whose static and JS-rendered HTML is scanned for TARGET_URL.
BASE_URL = "https://cc.ncku.edu.tw/p/426-1213-3.php?Lang=zh-tw"

def find_url_in_static_html(url, target):
    """Fetch *url* without executing JavaScript and return matching anchors.

    Scans every ``<a href=...>`` in the static HTML for hrefs that contain
    *target* (case-insensitive substring match).

    Args:
        url: Page to download.
        target: URL fragment to look for in each href.

    Returns:
        List of dicts with keys ``'text'``, ``'href'`` and ``'absolute'``;
        an empty list when nothing matches or the request fails.
    """
    try:
        # verify=False: target site has TLS issues; the resulting warnings
        # are suppressed at module level.
        response = requests.get(url, timeout=10, verify=False)
        # Fail loudly on 4xx/5xx instead of silently scanning an error page
        # and reporting "not found" (the original skipped this check).
        response.raise_for_status()
    except Exception as e:
        print(f"[错误] {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    needle = target.lower()  # hoisted: loop-invariant
    found_links = []
    for link in soup.find_all('a', href=True):
        href = link.get('href', '')
        if needle in href.lower():
            found_links.append({
                'text': link.get_text().strip(),
                'href': href,
                'absolute': urljoin(url, href),
            })
    return found_links

async def find_url_in_javascript_rendered_html(url, target):
    """Render *url* with Playwright, then return matching anchors.

    Loads the page in headless Chromium so JavaScript-injected links are
    present, then scans every ``<a href=...>`` for hrefs containing
    *target* (case-insensitive substring match).

    Args:
        url: Page to render.
        target: URL fragment to look for in each href.

    Returns:
        List of dicts (``'text'``, ``'href'``, ``'absolute'``), possibly
        empty; ``None`` when Playwright is not installed or rendering fails.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError:
        print("[提示] 未安装Playwright，无法渲染JavaScript")
        print("       运行: pip install playwright")
        print("       然后: playwright install")
        return None

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                print(f"[Playwright] 正在加载: {url}")
                # Wait for network idle so dynamically injected links exist.
                await page.goto(url, wait_until='networkidle')
                rendered_html = await page.content()
            finally:
                # Always close the browser — the original leaked it when
                # goto()/content() raised (e.g. navigation timeout).
                await browser.close()

        # Parse the rendered DOM outside the Playwright context.
        soup = BeautifulSoup(rendered_html, 'html.parser')
        needle = target.lower()  # hoisted: loop-invariant
        found_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            if needle in href.lower():
                found_links.append({
                    'text': link.get_text().strip(),
                    'href': href,
                    'absolute': urljoin(url, href),
                })
        return found_links

    except Exception as e:
        print(f"[Playwright错误] {e}")
        return None

# 主程序
def _print_found_links(links):
    """Print the text/href/absolute triple for each matched link dict."""
    for link in links:
        print(f"  - 文本: {link['text']}")
        print(f"    href: {link['href']}")
        print(f"    完整URL: {link['absolute']}")


async def main():
    """Compare static vs. JS-rendered HTML of BASE_URL for TARGET_URL.

    Runs the static check, then the Playwright check, then prints a
    verdict on whether the target link is static, JS-generated, or absent.
    """
    print("=" * 80)
    print(f"检查URL: {BASE_URL}")
    print(f"查找: {TARGET_URL}")
    print("=" * 80)
    print()

    # Step 1: plain HTTP fetch, no JavaScript execution.
    print("[第一步] 检查静态HTML源代码...")
    print("-" * 80)
    static_links = find_url_in_static_html(BASE_URL, TARGET_URL)

    if static_links:
        print(f"✓ 在静态HTML中找到 {len(static_links)} 个链接：")
        _print_found_links(static_links)
    else:
        print(f"✗ 静态HTML中未找到包含 '{TARGET_URL}' 的链接")

    # Step 2: headless-browser render; js_links is None when Playwright
    # is unavailable or rendering failed (distinct from "no matches").
    print()
    print("[第二步] 检查JavaScript渲染后的内容...")
    print("-" * 80)
    js_links = await find_url_in_javascript_rendered_html(BASE_URL, TARGET_URL)

    if js_links is None:
        print("[跳过] 未安装Playwright，无法验证JavaScript内容")
    elif js_links:
        print(f"✓ 在JavaScript渲染后的HTML中找到 {len(js_links)} 个链接！")
        print("[结论] 这证实了该URL是通过JavaScript动态生成的！")
        _print_found_links(js_links)
    else:
        print(f"✗ 即使在JavaScript渲染后也未找到目标链接")

    # Final verdict.
    print()
    print("=" * 80)
    print("[结论]")
    print("=" * 80)
    if static_links:
        print("✓ 该URL在静态HTML中")
        print("  → 爬虫可以捕获，无需升级")
    elif js_links:
        print("✗ 该URL仅在JavaScript渲染后出现")
        print("  → 爬虫需要升级为支持JavaScript渲染（使用Playwright）")
    elif js_links is None:
        # Bug fix: the original fell into the "not found in either case"
        # branch here even though the JS check never actually ran.
        print("⚠ JavaScript渲染未执行，无法得出最终结论")
        print("  → 请安装Playwright后重新运行以完成验证")
    else:
        print("⚠ 该URL在两种情况下都未找到")
        print("  → 可能：")
        print("    1. URL格式或ID有误")
        print("    2. 链接在用户交互后才出现（需要点击等）")
        print("    3. 链接被iframe或其他隐藏元素包含")

if __name__ == "__main__":
    asyncio.run(main())


