#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
NCKU 爬蟲程式 - 取得 HTML 頁面和 PDF 文件的 URL
功能：
  - 從 seed_sites 資料表取得起始 URL
  - 遵循指定深度進行爬取（depth ≤ 3）
  - 過濾與正規化 URL
  - 收集 HTML 和 PDF URL
  - 存儲到 crawl_items 資料表
"""

import os
import re
import time
import hashlib
import logging
from urllib.parse import urljoin, urlparse, urldefrag, parse_qs
from urllib.parse import urlencode
from collections import defaultdict
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import mysql.connector
from mysql.connector import Error as MySQLError
import urllib3
# Playwright is optional; degrade gracefully if it is not installed
try:
    from playwright.sync_api import sync_playwright
except ImportError:
    sync_playwright = None

# Disable SSL warnings (requests below are made with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Suppress BeautifulSoup's XML-parsed-as-HTML warning (some pages return an XML Content-Type but are parsed as HTML)
from bs4 import XMLParsedAsHTMLWarning
import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Lower the urllib3 log level (silences connection-pool debug logs)
logging.getLogger('urllib3').setLevel(logging.WARNING)

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # INFO shows key events only; switch to DEBUG for verbose tracing
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load .env
load_dotenv()

DB_CONFIG = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PWD'),
    'database': os.getenv('DATABASE'),
    'charset': 'utf8mb4'
}
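
# The values above are read from a local .env file. A minimal sketch of the
# expected keys (placeholder values; adjust to your environment):
#   DB_HOST=localhost
#   DB_USER=crawler
#   DB_PWD=secret
#   DATABASE=ncku_crawl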

# Crawler configuration
MAX_DEPTH = 3
MAX_PAGES_PER_SEED = 300
TIMEOUT = 10
RATE_LIMIT = 0.5  # seconds between requests
MAX_REDIRECTS = 3
RECRAWL_DAYS = 7  # re-crawl a URL only after this many days
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Playwright configuration (hybrid crawling)
ENABLE_PLAYWRIGHT = True  # whether Playwright rendering is enabled
MIN_TEXT_LENGTH = 800     # re-render with Playwright when extracted text is shorter than this
PLAYWRIGHT_TIMEOUT = 30   # Playwright page-load timeout (seconds)
PLAYWRIGHT_MAX_DEPTH = 2  # use Playwright only at depths below this

# Language variants to exclude (compared case-insensitively)
EXCLUDE_LANGUAGES = {'en', 'ja', 'zh_cn', 'zh-cn', 'eng'}


class URLFilter:
    """URL 過濾和正規化"""
    
    def __init__(self, skip_url_file='skip_url.txt', 
                 skip_webpage_file='skip_webpage.txt',
                 skip_redirect_file='skip_redirect.txt',
                 skip_sites_file='skip_sites.txt'):
        self.skip_url_prefixes = self._load_file(skip_url_file)
        self.skip_webpages = self._load_file(skip_webpage_file)
        self.skip_redirects = self._load_file(skip_redirect_file)
        self.skip_sites = self._load_file(skip_sites_file)
        logger.info(f"已載入 {len(self.skip_url_prefixes)} 個排除 URL prefix")
        logger.info(f"已載入 {len(self.skip_webpages)} 個排除網頁")
        logger.info(f"已載入 {len(self.skip_redirects)} 個排除轉址")
        logger.info(f"已載入 {len(self.skip_sites)} 個排除網站")
    
    @staticmethod
    def _load_file(filename):
        """從文件載入排除列表"""
        if not os.path.exists(filename):
            logger.warning(f"找不到文件: {filename}")
            return set()
        
        with open(filename, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f if line.strip())
    
    @staticmethod
    def normalize_url(url):
        """
        正規化 URL：
        - 移除 fragment (#)
        - 統一協議為 https
        - 保留並排序 query string 參數
        - 對目錄型 URL 保留末尾斜杠
        """
        # Strip the fragment
        url = urldefrag(url)[0]
        
        # Parse the URL
        parsed = urlparse(url)
        
        # Force HTTPS
        scheme = 'https'
        netloc = parsed.netloc
        path = parsed.path if parsed.path else '/'
        
        # Decide whether this is a directory-style URL (no file extension);
        # e.g. /rule is treated as a directory, while /rule/index.php has an extension
        last_segment = path.split('/')[-1]  # last path segment
        has_extension = '.' in last_segment and not last_segment.startswith('.')
        
        # No extension and no trailing slash: append one to mark a directory,
        # unless the path is already the root
        if not has_extension and not path.endswith('/') and path != '/':
            path = path + '/'
        
        # Keep and canonicalize the query string (sort parameters so that
        # ordering differences do not produce distinct URLs)
        query_string = ''
        if parsed.query:
            # Parse the query parameters
            params = parse_qs(parsed.query, keep_blank_values=True)
            # Sort parameter names, and sort the values of each parameter
            sorted_params = []
            for key in sorted(params.keys()):
                for value in sorted(params[key]):
                    sorted_params.append((key, value))
            # Re-encode the query string
            if sorted_params:
                query_string = '?' + urlencode(sorted_params)
        
        normalized = f"{scheme}://{netloc}{path}{query_string}"
        return normalized
    
    def should_skip_url(self, url):
        """Check whether this URL should be skipped."""
        
        # Validate the URL shape: reject '@' in the host unless it is
        # user:password basic-auth syntax
        parsed = urlparse(url)
        if '@' in parsed.netloc and not (':' in parsed.netloc.split('@')[0]):
            # '@' without a user:password prefix means a malformed URL
            return True, "invalid URL format with @"
        
        # Excluded URL prefixes
        for skip_prefix in self.skip_url_prefixes:
            if url.startswith(skip_prefix):
                return True, f"matches skip_url prefix: {skip_prefix}"
        
        # Excluded pages
        if url in self.skip_webpages:
            return True, "in skip_webpage list"
        
        # Excluded sites (skip_sites.txt)
        for skip_site in self.skip_sites:
            if url.startswith(skip_site.rstrip('/')):
                return True, f"in skip_sites list: {skip_site}"
        
        # Restrict to the ncku.edu.tw domain (match the host, not the full URL string)
        if not parsed.netloc.endswith('ncku.edu.tw'):
            return True, "not ncku.edu.tw domain"
        
        # Excluded language variants (compared case-insensitively)
        url_lower = url.lower()
        for lang in EXCLUDE_LANGUAGES:
            lang = lang.lower()
            # Language markers in the path (e.g. /en/ or _en)
            if f"/{lang}/" in url_lower or f"_{lang}" in url_lower:
                return True, f"contains excluded language: {lang}"
            # Lang= markers in the query string (e.g. ?Lang=en, ?lang=EN)
            if f"lang={lang}" in url_lower:
                return True, f"contains excluded language in query: {lang}"
        
        # Skip SSO endpoints
        if url.endswith('sso.php'):
            return True, "ends with sso.php"
        
        return False, None
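
    # Illustrative outcomes of should_skip_url (hypothetical URLs, assuming the
    # skip lists are empty):
    #   'https://www.ncku.edu.tw/en/about/' -> (True, 'contains excluded language: en')
    #   'https://www.ncku.edu.tw/rule/'     -> (False, None)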
    
    def is_pdf_url(self, url):
        """檢查是否為 PDF URL"""
        path = urlparse(url).path.lower()
        return path.endswith('.pdf')
    
    def is_html_url(self, url):
        """檢查是否為 HTML URL"""
        path = urlparse(url).path.lower()
        # HTML, PHP, or extensionless paths (only the path is checked, not the query string)
        return path.endswith('.html') or path.endswith('.htm') or path.endswith('.php') or \
               (not path.endswith('.pdf') and '.' not in path.split('/')[-1])
    
    def should_follow_url(self, url):
        """檢查是否應該跟隨此 URL"""
        skip, reason = self.should_skip_url(url)
        return not skip, reason


class CrawlDatabase:
    """資料庫操作"""
    
    def __init__(self):
        self.conn = None
        self.connect()
    
    def connect(self):
        """連接資料庫"""
        try:
            self.conn = mysql.connector.connect(**DB_CONFIG)
            logger.info("資料庫連接成功")
        except MySQLError as e:
            logger.error(f"資料庫連接失敗: {e}")
            raise
    
    def get_seed_sites(self):
        """從 seed_sites 取得活躍的 URL"""
        cursor = self.conn.cursor(dictionary=True)
        try:
            cursor.execute("""
                SELECT id, site_url FROM seed_sites 
                WHERE is_active = 1
            """)
            sites = cursor.fetchall()
            logger.info(f"取得 {len(sites)} 個活躍的 Seed URL")
            return sites
        finally:
            cursor.close()
    
    def load_existing_urls(self):
        """從資料庫加載已爬過的 URL hash、最後爬取時間、is_active 狀態及 content_hash、file_hash"""
        cursor = self.conn.cursor()
        try:
            cursor.execute("""
                SELECT url_hash, last_crawled, is_active, content_hash, file_hash FROM crawl_items
            """)
            results = cursor.fetchall()
            # URL data: {url_hash: (last_crawled, is_active)}
            url_data = {row[0]: (row[1], row[2]) for row in results}
            # In-memory hash sets (avoid transaction-isolation surprises on duplicate checks)
            content_hashes = {row[3] for row in results if row[3]}
            file_hashes = {row[4] for row in results if row[4]}
            logger.info(f"Loaded {len(url_data)} existing URLs, {len(content_hashes)} content hashes, {len(file_hashes)} file hashes from the database")
            return url_data, content_hashes, file_hashes
        finally:
            cursor.close()
    
    def check_url_exists(self, url_hash):
        """檢查 URL 是否已存在"""
        cursor = self.conn.cursor()
        try:
            cursor.execute("""
                SELECT id FROM crawl_items WHERE url_hash = %s
            """, (url_hash,))
            return cursor.fetchone() is not None
        finally:
            cursor.close()
    
    def check_content_hash_exists(self, content_hash, seen_hashes=None):
        """檢查 content_hash 是否已存在（優先使用內存集合）"""
        if not content_hash or content_hash.strip() == '':
            return False
        # 優先使用內存集合（快速檢查，無隔離問題）
        if seen_hashes is not None:
            return content_hash in seen_hashes
        # 降級方案：查數據庫（不建議使用）
        cursor = self.conn.cursor()
        try:            
            cursor.execute("""
                SELECT id FROM crawl_items WHERE content_hash = %s
            """, (content_hash,))
            return cursor.fetchone() is not None
        finally:
            cursor.close()
    
    def check_file_hash_exists(self, file_hash, seen_hashes=None):
        """檢查 file_hash 是否已存在（優先使用內存集合）"""
        # 優先使用內存集合（快速檢查，無隔離問題）
        if seen_hashes is not None:
            return file_hash in seen_hashes
        # 降級方案：查數據庫（不建議使用）
        cursor = self.conn.cursor()
        try:
            cursor.execute("""
                SELECT id FROM crawl_items WHERE file_hash = %s AND file_hash IS NOT NULL
            """, (file_hash,))
            return cursor.fetchone() is not None
        finally:
            cursor.close()
    
    def insert_or_update_url(self, url, url_hash, seed_site_id, ext=None, is_active=1, 
                            content_hash=None, file_hash=None, row_content=None):
        """插入或更新 URL 到資料庫
        
        Args:
            url: URL 地址
            url_hash: URL 的 hash
            seed_site_id: Seed 站點 ID
            ext: 副檔名 (php, pdf) - 只在首次插入時使用，更新時保持不變
            is_active: 是否活躍（1=活躍, 0=失敗）
            content_hash: HTML 內容的 hash（僅用於網頁）
            file_hash: PDF 檔案的 hash（僅用於 PDF）
            row_content: 網頁純文字內容
        """
        cursor = self.conn.cursor()
        try:
            # Does the row already exist?
            cursor.execute("""
                SELECT id FROM crawl_items WHERE url_hash = %s
            """, (url_hash,))
            
            result = cursor.fetchone()
            now = datetime.now()
            
            if result:
                # Update: refresh last_crawled, is_active, content_hash, file_hash and row_content only
                
                if content_hash or file_hash:
                    cursor.execute("""
                        UPDATE crawl_items 
                        SET last_crawled = %s, is_active = %s, content_hash = %s, 
                            file_hash = %s, row_content = %s
                        WHERE url_hash = %s
                    """, (now, is_active, content_hash, file_hash, row_content, url_hash))
                else:
                    cursor.execute("""
                        UPDATE crawl_items 
                        SET last_crawled = %s, is_active = %s
                        WHERE url_hash = %s
                    """, (now, is_active, url_hash))
                
                # Choose the log label based on is_active
                if is_active == 0:
                    status = "[FAIL]"
                else:
                    status = "[UPDATE]"
                action = "UPDATE"
            else:
                # Insert (only happens on the first successful crawl)
                
                cursor.execute("""
                    INSERT INTO crawl_items 
                    (url, url_hash, ext, seed_site_id, is_active, first_seen, last_crawled, 
                     content_hash, file_hash, row_content)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (url, url_hash, ext, seed_site_id, is_active, now, now, 
                      content_hash, file_hash, row_content))
                
                status = "[新增]"
                action = "INSERT"
            
            self.conn.commit()
            
            # Log the result
            if ext:
                logger.info(f"{status} [{ext.upper()}] {url}")
            else:
                logger.info(f"{status} {url}")
            
            return action
        except MySQLError as e:
            logger.error(f"資料庫操作失敗: {e}")
            self.conn.rollback()
            return None
        finally:
            cursor.close()
    
    def close(self):
        """關閉資料庫連接"""
        if self.conn:
            self.conn.close()


class WebCrawler:
    """網頁爬蟲"""
    
    def __init__(self):
        self.filter = URLFilter()
        self.db = CrawlDatabase()
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})
        self.visited_urls = set()  # per-run de-dup (prevents infinite recursion within a page tree)
        self.html_urls = set()  # collected HTML URLs
        self.pdf_urls = set()  # collected PDF URLs
        self.stats = defaultdict(int)
        # Load previously crawled URLs (timestamps, content hashes, file hashes)
        # from the database, for incremental crawling and de-duplication
        url_data, content_hashes, file_hashes = self.db.load_existing_urls()
        self.url_timestamps = url_data
        self.seen_content_hashes = content_hashes
        self.seen_file_hashes = file_hashes
        
        # Initialize a long-lived synchronous Playwright browser instance
        # (kept open so the browser is not relaunched for every page)
        if sync_playwright is None:
            logger.warning("[Playwright] not installed; JavaScript rendering is disabled")
            self.p_instance = None
            self.browser = None
        else:
            try:
                self.p_instance = sync_playwright().start()
                self.browser = self.p_instance.chromium.launch(headless=True)
                logger.info("[Playwright] synchronous browser instance initialized")
            except Exception as e:
                logger.error(f"[Playwright] initialization failed: {e}")
                self.p_instance = None
                self.browser = None
    
    def _extract_domain(self, url):
        """提取 URL 的完整域名（包括子域名）"""
        parsed = urlparse(url)
        return parsed.netloc
    
    def _should_follow_domain(self, current_url, base_url):
        """檢查是否應該跟隨（限制在同一 domain 和 sub-domain）"""
        current_netloc = urlparse(current_url).netloc
        base_netloc = urlparse(base_url).netloc
        return current_netloc == base_netloc
    
    def _absolute_http_link(self, link, current_url):
        """將相對 URL 轉換為絕對 URL"""
        return urljoin(current_url, link) if link else None
    
    def _get_url_hash(self, url):
        """生成 URL 的 SHA256 hash"""
        return hashlib.sha256(url.encode('utf-8')).hexdigest()
    
    def _should_process_url(self, url, seed_url):
        """檢查 URL 是否應該處理（URL 應已在 crawl_page 中規範化）"""
        # 檢查是否已訪問（本次爬蟲運行中）- 防止同一頁面樹無限遞迴
        if url in self.visited_urls:
            logger.debug(f"[跳過] {url} - 本次運行已訪問")
            return False, "already visited in this crawl"
        
        # 檢查過濾規則
        should_follow, reason = self.filter.should_follow_url(url)
        if not should_follow:
            logger.debug(f"[跳過] {url} - {reason}")
            return False, reason
        
        # 檢查域名限制
        if not self._should_follow_domain(url, seed_url):
            logger.debug(f"[跳過] {url} - 不同域名")
            return False, "different domain"
        
        # 檢查是否已被爬過（查數據庫）
        url_hash = self._get_url_hash(url)
        
        # Seed URL 本身始終會被爬取（允許重新爬 seed 首頁以發現新鏈接）
        is_seed_url = (self.filter.normalize_url(url) == self.filter.normalize_url(seed_url))
        
        if url_hash in self.url_timestamps:
            last_crawled, is_active = self.url_timestamps[url_hash]
            
            # 如果標記為不活躍，永久跳過
            if is_active == 0:
                logger.info(f"[跳過] {url} - 已標記為失敗 (is_active=0)")
                return False, "is_active=0"
            
            # Seed URL 不受時間限制，始終可以重爬
            if is_seed_url:
                logger.debug(f"[重爬] {url} - Seed URL 始終允許重爬")
                return True, None
            
            # 檢查是否需要重爬（基於時間間隔）
            days_since = (datetime.now() - last_crawled).days
            
            if days_since < RECRAWL_DAYS:
                logger.debug(f"[跳過] {url} - 距離上次爬取 {days_since} 天（< {RECRAWL_DAYS} 天）")
                return False, f"crawled {days_since} days ago"
            else:
                logger.debug(f"[重爬] {url} - 距離上次爬取 {days_since} 天（≥ {RECRAWL_DAYS} 天）")
        
        return True, None
    
    def _fetch_page(self, url):
        """取得頁面內容"""
        try:
            response = self.session.get(
                url,
                timeout=TIMEOUT,
                allow_redirects=True,
                verify=False  # skip TLS certificate verification
            )
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            logger.warning(f"[timeout] {url}")
            self.stats['timeout'] += 1
            return None
        # SSLError must be caught before ConnectionError (it is a subclass)
        except requests.exceptions.SSLError:
            logger.warning(f"[SSL error] {url}")
            self.stats['ssl_error'] += 1
            return None
        except requests.exceptions.ConnectionError:
            logger.warning(f"[connection error] {url}")
            self.stats['connection_error'] += 1
            return None
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                logger.warning(f"[404] {url}")
                self.stats['http_404'] += 1
            elif e.response.status_code == 500:
                logger.warning(f"[500] {url}")
                self.stats['http_500'] += 1
            else:
                logger.warning(f"[HTTP error {e.response.status_code}] {url}")
                self.stats['http_error'] += 1
            return None
        except Exception as e:
            logger.warning(f"[error] {url}: {e}")
            self.stats['other_error'] += 1
            return None
    
    def _extract_links(self, html, current_url):
        """從 HTML 提取鏈接"""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            links = []
            
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if href:
                    absolute_url = self._absolute_http_link(href, current_url)
                    if absolute_url:
                        links.append(absolute_url)
            
            return links
        except Exception as e:
            logger.warning(f"HTML 解析錯誤 {current_url}: {e}")
            return []
    
    def _check_pdf_exists(self, url):
        """檢查 PDF 是否存在（先用 HEAD 請求，失敗則用 GET 流式）"""
        try:
            # 先嘗試 HEAD 請求（更快）
            response = self.session.head(
                url,
                timeout=TIMEOUT,
                allow_redirects=True,
                verify=False
            )
            if response.status_code < 400:
                return True
            
            # HEAD failed; fall back to a streamed GET (reads the headers only,
            # the body is not downloaded)
            response = self.session.get(
                url,
                timeout=TIMEOUT,
                allow_redirects=True,
                verify=False,
                stream=True
            )
            try:
                return response.status_code < 400
            finally:
                response.close()  # release the streamed connection
        except Exception:
            return False
    
    def _extract_pdfs(self, html, current_url):
        """從 HTML 提取 PDF 鏈接"""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            pdfs = []
            
            # <a> tags whose href contains .pdf
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if '.pdf' in href.lower():  # lowercase only for the check; keep the original URL
                    # Resolve to an absolute URL
                    absolute_url = self._absolute_http_link(href, current_url)
                    if absolute_url:
                        pdfs.append(absolute_url)
            
            # <embed> and <object> tags
            for tag in soup.find_all(['embed', 'object']):
                src = tag.get('src') or tag.get('data')
                if src and '.pdf' in src.lower():  # lowercase only for the check
                    absolute_url = self._absolute_http_link(src, current_url)
                    if absolute_url:
                        pdfs.append(absolute_url)
            
            return pdfs
        except Exception as e:
            logger.warning(f"PDF 提取錯誤 {current_url}: {e}")
            return []
    
    def _extract_title(self, html):
        """從 HTML 提取網頁標題"""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            title_tag = soup.find('title')
            if title_tag:
                return title_tag.get_text().strip()
            return ''
        except Exception as e:
            logger.warning(f"標題提取錯誤: {e}")
            return ''
    
    def _extract_text_content(self, html):
        """從 HTML 提取純文本內容（不含 HTML 標籤）"""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            
            # Remove more than script/style: hidden templates and site chrome
            # repeat across pages and would make otherwise-identical pages hash
            # differently
            #   noscript: text shown when JS is disabled
            #   template: preloaded markup
            #   header/footer/nav: site navigation (the usual culprit behind cross-page hash mismatches)
            for tag in soup(['script', 'style', 'noscript', 'template', 'header', 'footer', 'nav']):
                tag.decompose()

            # Plain text
            text = soup.get_text(separator=' ')
            
            # Collapse whitespace
            text = ' '.join(text.split())
            
            return text
        except Exception as e:
            logger.warning(f"文本提取錯誤: {e}")
            return ''
    
    def _calculate_content_hash(self, text):
        """計算文本內容的 hash"""
        return hashlib.sha256(text.encode('utf-8')).hexdigest()
    
    def _calculate_file_hash(self, file_content):
        """計算 PDF 檔案的 hash"""
        return hashlib.sha256(file_content).hexdigest()
    
    def _extract_content_with_playwright(self, url):
        """使用同步版 Playwright 渲染 JavaScript 後獲取 HTML（使用持久化瀏覽器實例）"""
        if self.browser is None:
            logger.warning(f"[Playwright] 瀏覽器未初始化，跳過: {url}")
            return None
        
        try:
            # 使用持久化瀏覽器實例建立新分頁
            page = self.browser.new_page(user_agent=USER_AGENT)
            
            try:
                page.goto(url, wait_until='networkidle', timeout=PLAYWRIGHT_TIMEOUT * 1000)
                
                # 獲取渲染後的 HTML
                rendered_html = page.content()
                
                logger.info(f"[Playwright] 成功渲染: {url}")
                return rendered_html
            finally:
                # 只關閉頁面，保持瀏覽器運行
                page.close()
        
        except Exception as e:
            logger.warning(f"[Playwright] 錯誤: {url} - {e}")
            return None

    def _should_use_playwright(self, text_content, depth):
        """判定是否應該使用Playwright渲染
        
        觸發條件：
        1. 啟用了Playwright
        2. 文字量少於MIN_TEXT_LENGTH
        3. 深度小於PLAYWRIGHT_MAX_DEPTH
        """
        if not ENABLE_PLAYWRIGHT:
            return False
        
        if depth >= PLAYWRIGHT_MAX_DEPTH:
            return False
        
        if len(text_content) < MIN_TEXT_LENGTH:
            return True
        
        return False
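
    # Worked example with the defaults above (ENABLE_PLAYWRIGHT=True,
    # MIN_TEXT_LENGTH=800, PLAYWRIGHT_MAX_DEPTH=2):
    #   depth=1, 300 chars of text  -> True  (re-render with Playwright)
    #   depth=2, 300 chars of text  -> False (too deep)
    #   depth=1, 1200 chars of text -> False (enough text already)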
    
    def crawl_page(self, url, seed_url, seed_site_id, depth=0):
        """爬蟲單一頁面"""
        if depth > MAX_DEPTH:
            return 0
        
        # 規範化 URL（移除 fragment 和 query string）
        url = self.filter.normalize_url(url)
        
        # Should this URL be processed at all?
        should_process, reason = self._should_process_url(url, seed_url)
        if not should_process:
            return 0
        
        # Mark as visited (within this run)
        self.visited_urls.add(url)
        url_hash = self._get_url_hash(url)
        
        # Is this a re-crawl?
        is_recrawl = url_hash in self.url_timestamps
        
        # Fetch the page
        response = self._fetch_page(url)
        if not response:
            # Fetch failed: on a re-crawl mark the URL as is_active=0; on a
            # first crawl just skip it
            if is_recrawl:
                logger.warning(f"[recrawl failed] {url} - marking is_active=0")
                self.db.insert_or_update_url(url, url_hash, seed_site_id, is_active=0)
                # Keep url_timestamps in sync
                self.url_timestamps[url_hash] = (datetime.now(), 0)
            return 0
        
        # Follow redirects to the final URL
        final_url = response.url
        final_url = self.filter.normalize_url(final_url)
        
        # Handle redirects (check the target against the exclusion lists)
        if final_url != url:
            # Redirect target already visited?
            if final_url in self.visited_urls:
                logger.debug(f"[redirect already visited] {url} -> {final_url}")
                return 0
            
            if final_url in self.filter.skip_redirects:
                logger.info(f"[redirect excluded] {url} -> {final_url}")
                self.stats['skip_redirect'] += 1
                return 0
            
            # Re-check the filter rules against the redirect target
            should_follow, reason = self.filter.should_follow_url(final_url)
            if not should_follow:
                logger.info(f"[redirect target excluded] {url} -> {final_url}")
                return 0
            
            # Continue with the final URL
            url = final_url
            url_hash = self._get_url_hash(url)
            self.visited_urls.add(url)  # record the post-redirect URL
            
            # Has the redirect target already been crawled? (prevents duplicate crawling)
            if url_hash in self.url_timestamps:
                last_crawled, is_active = self.url_timestamps[url_hash]
                if is_active == 0:
                    logger.info(f"[skip] {url} - redirect target marked as failed (is_active=0)")
                    return 0
                # Allow a re-crawl only when is_active=1 and RECRAWL_DAYS have passed
                days_since = (datetime.now() - last_crawled).days
                if days_since < RECRAWL_DAYS:
                    logger.debug(f"[skip] {url} - redirect target crawled recently ({days_since} days < {RECRAWL_DAYS} days)")
                    return 0
        
        # Determine the content type
        content_type = response.headers.get('content-type', '').lower()
        is_html = 'text/html' in content_type
        is_pdf = 'application/pdf' in content_type or url.endswith('.pdf')
        
        if not is_html and not is_pdf:
            logger.debug(f"[忽略] {url} - 不支援的內容類型")
            return 0
        
        # HTML pages
        if is_html:
            # Extract the page title
            title = self._extract_title(response.content)
            
            # Skip printer-friendly pages (their titles carry the Chinese label
            # '友善列印', "printer-friendly")
            if '友善列印' in title:
                logger.debug(f"[skip] {url} - title contains '友善列印': {title}")
                return 0
            
            # Extract the page's plain-text content
            text_content = self._extract_text_content(response.content)

            playwright_html = None  # holds Playwright-rendered HTML, if any
            
            # Re-render with Playwright when the page yields too little text
            if self._should_use_playwright(text_content, depth):
                logger.info(f"[Playwright trigger] only {len(text_content)} chars of text, re-rendering: {url}")
                try:
                    # Fetch the Playwright-rendered HTML
                    playwright_html = self._extract_content_with_playwright(url)
                    if playwright_html is not None:
                        # Extract text from the rendered HTML with BeautifulSoup
                        text_content = self._extract_text_content(playwright_html)
                        logger.info(f"[Playwright ok] {url} - new text length: {len(text_content)} chars")
                except Exception as e:
                    logger.warning(f"[Playwright failed] {url} - {e}, keeping the requests result")

            content_hash = self._calculate_content_hash(text_content)

            # Duplicate content? Check both the in-memory set and the database
            if content_hash in self.seen_content_hashes or self.db.check_content_hash_exists(content_hash):
                logger.debug(f"[skip] {url} - content hash already seen (content_hash: {content_hash[:8]}...)")
                self.stats['duplicate_content'] += 1
                return 0
            
            # Store the HTML URL (ext is recorded uniformly as 'php')
            self.html_urls.add(url)
            action = self.db.insert_or_update_url(
                url, url_hash, seed_site_id, ext='php',
                content_hash=content_hash, row_content=text_content, file_hash=''
            )
            # Update url_timestamps and seen_content_hashes only on success
            if action:
                self.url_timestamps[url_hash] = (datetime.now(), 1)
                self.seen_content_hashes.add(content_hash)
            
            # Extract PDF links
            try:
                pdfs = self._extract_pdfs(response.content, url)
                for pdf_url in pdfs:
                    pdf_url = self.filter.normalize_url(pdf_url)
                    
                    # If the PDF was crawled recently, skip both the check and the DB update
                    should_proc, _ = self._should_process_url(pdf_url, seed_url)
                    if not should_proc:
                        continue

                    # Filter rules
                    should_follow, _ = self.filter.should_follow_url(pdf_url)
                    if should_follow and self._should_follow_domain(pdf_url, seed_url):
                        # Already visited this PDF? (de-duplicates across seeds)
                        if pdf_url in self.visited_urls:
                            logger.debug(f"[PDF already visited] {pdf_url}")
                            continue
                        
                        pdf_hash = self._get_url_hash(pdf_url)
                        # Mark as visited
                        self.visited_urls.add(pdf_url)
                        
                        # Download the PDF directly (no HEAD pre-check) and compute its file_hash
                        pdf_response = self._fetch_page(pdf_url)
                        if pdf_response:
                            self.pdf_urls.add(pdf_url)
                            file_hash = self._calculate_file_hash(pdf_response.content)
                            
                            # Duplicate file? Check both the in-memory set and the database
                            if file_hash in self.seen_file_hashes or self.db.check_file_hash_exists(file_hash):
                                logger.debug(f"[skip] {pdf_url} - file hash already seen (file_hash: {file_hash[:8]}...)")
                                self.stats['duplicate_file'] += 1
                            else:
                                action = self.db.insert_or_update_url(
                                    pdf_url, pdf_hash, seed_site_id, ext='pdf',
                                    file_hash=file_hash, content_hash='', row_content=''
                                )
                                # Update url_timestamps and seen_file_hashes only on success
                                if action:
                                    self.url_timestamps[pdf_hash] = (datetime.now(), 1)
                                    self.seen_file_hashes.add(file_hash)
                                    self.stats['pdf_found'] += 1
                        else:
                            logger.debug(f"[PDF 下載失敗] {pdf_url}")
                            self.stats['pdf_notfound'] += 1
            except Exception as e:
                logger.warning(f"PDF 提取失敗 {url}: {e}")
            
            # Extract and crawl further HTML links (depth permitting)
            if depth < MAX_DEPTH:
                try:
                    # Prefer the Playwright-rendered HTML when available,
                    # otherwise use the HTML fetched via requests
                    html_for_links = playwright_html if playwright_html is not None else response.content
                    links = self._extract_links(html_for_links, url)
                    logger.debug(f"[links] extracted {len(links)} links (from {'Playwright' if playwright_html else 'requests'})")
                    
                    count = 0
                    for link in links:
                        if count >= MAX_PAGES_PER_SEED:
                            break
                        
                        link = self.filter.normalize_url(link)
                        
                        # Follow HTML links only
                        if self.filter.is_html_url(link):
                            time.sleep(RATE_LIMIT)  # rate limiting
                            count += self.crawl_page(
                                link, seed_url, seed_site_id, depth + 1
                            )
                            if count >= MAX_PAGES_PER_SEED:
                                break
                except Exception as e:
                    logger.warning(f"鏈接提取失敗 {url}: {e}")
            
            self.stats['html_crawled'] += 1
            return 1
        
        # PDF documents
        elif is_pdf:
            # Download the PDF to compute its file_hash
            try:
                pdf_response = self._fetch_page(url)
                if pdf_response:
                    file_hash = self._calculate_file_hash(pdf_response.content)
                    
                    # Duplicate file? Check both the in-memory set and the database
                    if file_hash in self.seen_file_hashes or self.db.check_file_hash_exists(file_hash):
                        logger.debug(f"[skip] {url} - file hash already seen (file_hash: {file_hash[:8]}...)")
                        self.stats['duplicate_file'] += 1
                        return 0
                    
                    # Mark as visited (within this run)
                    if url not in self.visited_urls:
                        self.visited_urls.add(url)
                    self.pdf_urls.add(url)
                    action = self.db.insert_or_update_url(
                        url, url_hash, seed_site_id, ext='pdf', 
                        file_hash=file_hash, content_hash='', row_content=''
                    )
                    # Update url_timestamps and seen_file_hashes only on success
                    if action:
                        self.url_timestamps[url_hash] = (datetime.now(), 1)
                        self.seen_file_hashes.add(file_hash)
                        self.stats['pdf_crawled'] += 1
                        return 1
                    return 0
                else:
                    # PDF download failed
                    logger.debug(f"[PDF download failed] {url}")
                    return 0
            except Exception as e:
                logger.warning(f"PDF 檔案 hash 計算失敗 {url}: {e}")
                return 0
    
    def crawl_seed(self, seed_site):
        """爬蟲單一 Seed 網站"""
        seed_id = seed_site['id']
        seed_url = seed_site['site_url']
        
        logger.info(f"\n{'='*80}")
        logger.info(f"Seed Site: {seed_url} (ID: {seed_id})")
        logger.info(f"{'='*80}")
        
        # Start crawling (visited_urls persists across all seed sites for the whole run)
        self.crawl_page(seed_url, seed_url, seed_id, depth=0)
    
    def crawl_all(self):
        """爬蟲所有 Seed URL"""
        seed_sites = self.db.get_seed_sites()
        # DEBUG: 只爬 https://donate.ncku.edu.tw        
        #seed_sites = [{'id': 57, 'site_url': 'https://alumni.ncku.edu.tw/p/405-1004-184341,c425.php?Lang=zh-tw'}] #fen
        #logger.info(f"DEBUG MODE: 只爬測試 URL - https://donate.ncku.edu.tw")

        total_stats = defaultdict(int)
        
        for seed_site in seed_sites:
            self.crawl_seed(seed_site)
            
            # Accumulate statistics (self.stats is cumulative for the whole run,
            # so snapshot and reset per seed to avoid double counting)
            for key, value in self.stats.items():
                total_stats[key] += value
            self.stats = defaultdict(int)
            
            # Rate limit between seeds
            time.sleep(1)
        
        # Show overall statistics
        logger.info(f"\n{'='*60}")
        cursor = self.db.conn.cursor()
        try:
            cursor.execute("SELECT COUNT(*) as count FROM crawl_items WHERE ext = 'php'")
            html_count = cursor.fetchone()[0]
            cursor.execute("SELECT COUNT(*) as count FROM crawl_items WHERE ext = 'pdf'")
            pdf_count = cursor.fetchone()[0]
            logger.info(f"爬蟲完成 - HTML: {html_count}, PDF: {pdf_count}")
        finally:
            cursor.close()
        logger.info(f"{'='*60}")
    
    def close(self):
        """關閉爬蟲"""
        # 關閉 Playwright 瀏覽器實例
        if self.browser is not None:
            try:
                self.browser.close()
                logger.info("[Playwright] 瀏覽器已關閉")
            except Exception as e:
                logger.warning(f"[Playwright] 關閉瀏覽器失敗: {e}")
        
        if self.p_instance is not None:
            try:
                self.p_instance.stop()
                logger.info("[Playwright] 實例已停止")
            except Exception as e:
                logger.warning(f"[Playwright] 停止實例失敗: {e}")
        
        self.db.close()
        self.session.close()


def main():
    """主程式"""
    crawlers = WebCrawler()
    try:
        crawlers.crawl_all()
    except KeyboardInterrupt:
        logger.info("\n爬蟲被中斷")
    finally:
        crawlers.close()


if __name__ == '__main__':
    main()
