import mysql.connector
import requests
import os
from datetime import datetime
from urllib.parse import urlparse
from dotenv import load_dotenv
import urllib3

# 抑制 HTTPS 警告
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# 加載 .env 檔案
load_dotenv()

# Database configuration — credentials come from the .env file loaded above.

# ipsystem database: read side (ws_sites / ws_dept query).
DB_CONFIG_IPSYSTEM = dict(
    host=os.getenv('DB_HOST'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PWD'),
    database=os.getenv('DATABASE_2'),  # ipsystem
    charset='utf8mb4',
)

# rag database: write side (seed_sites table).
DB_CONFIG_RAG = dict(
    host=os.getenv('DB_HOST'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PWD'),
    database=os.getenv('DATABASE'),  # rag
    charset='utf8mb4',
)

def load_skip_sites(filename='skip_sites.txt'):
    """Load the set of site substrings to skip from a text file.

    The file is resolved relative to this script's directory. Blank lines
    and lines starting with '#' are ignored; all other lines are stripped
    and collected.

    Args:
        filename: name of the skip-list file next to this script.

    Returns:
        set[str]: the loaded entries; empty when the file is missing or
        unreadable (this function never raises).
    """
    skip_sites = set()
    try:
        filepath = os.path.join(os.path.dirname(__file__), filename)
        if os.path.exists(filepath):
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    # Ignore blank lines and comment lines.
                    if line and not line.startswith('#'):
                        skip_sites.add(line)
            if skip_sites:
                print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 已載入 {len(skip_sites)} 個略過的網站')
        else:
            # BUG FIX: these messages previously printed the literal text
            # "(unknown)" instead of the actual filename.
            print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 警告：找不到 {filename} 檔案')
    except Exception as e:
        # Broad by design: a broken skip list must never abort the run.
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 讀取 {filename} 出錯: {str(e)}')
    return skip_sites

# Skip list, loaded once at import time from skip_sites.txt (one URL/domain
# substring per line); empty set when the file is absent.
SKIP_SITES = load_skip_sites()

def get_domain_from_url(url):
    """Extract the network location (host[:port]) from a URL.

    A URL without an 'http' prefix gets 'http://' prepended first so that
    urlparse places the host into netloc.

    Args:
        url: the URL or bare hostname to parse.

    Returns:
        str: the netloc, or the input unchanged when no netloc can be
        determined or parsing fails.
    """
    try:
        parsed = urlparse(url if url.startswith('http') else f'http://{url}')
        return parsed.netloc or url
    # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; urlparse raises ValueError on malformed input.
    except ValueError:
        return url

def should_skip_site(url, domain):
    """Return True when the site matches any entry in the skip list.

    Matching is a case-insensitive substring test of each SKIP_SITES entry
    against both the full URL and the bare domain.

    Args:
        url: the site's full URL.
        domain: the site's domain (host[:port]).

    Returns:
        bool: True if the site should be skipped, False otherwise.
    """
    if not SKIP_SITES:
        return False

    haystacks = (url.lower(), domain.lower())
    return any(
        entry.lower() in hay
        for entry in SKIP_SITES
        for hay in haystacks
    )

def check_website_exists(url):
    """Probe a website and report whether it responds with HTTP 200.

    Strategy: try a cheap HEAD request first; if the server rejects HEAD
    (403/405) or the request fails, fall back to a full GET. TLS
    certificate verification is disabled, and a GET-side SSLError is
    treated as "site exists" (certificate problem, not absence).

    Args:
        url: the URL to probe; 'http://' is prepended when missing.

    Returns:
        bool: True iff the site answered with status 200 (or raised an
        SSL error on GET); False on any other status, timeout, or error.
    """
    if not url.startswith('http'):
        url = f'http://{url}'

    # Full browser-like headers — some sites reject obvious bot requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    # BUG FIX: the session was never closed; the context manager releases
    # the underlying connection pool deterministically.
    with requests.Session() as session:
        # First attempt: HEAD request.
        try:
            response = session.head(url, timeout=5, allow_redirects=True, headers=headers, verify=False)
            # 403/405 often just means the server refuses HEAD — fall
            # through to GET; any other status is authoritative.
            if response.status_code not in (403, 405):
                if response.status_code != 200:
                    print(f'      [狀態碼: {response.status_code}]', end='')
                return response.status_code == 200
        # (was `except (requests.RequestException, Exception)` — redundant)
        except Exception:
            pass

        # HEAD failed or returned 403/405 — retry with GET.
        try:
            response = session.get(url, timeout=5, allow_redirects=True, headers=headers, verify=False)

            if response.status_code != 200:
                print(f'      [GET狀態碼: {response.status_code}]', end='')

            return response.status_code == 200
        except requests.Timeout:
            print('      [超時]', end='')
            return False
        except requests.exceptions.SSLError:
            # An SSL error usually means the site exists but has a
            # certificate problem — count it as alive.
            print('      [SSL忽略]', end='')
            return True
        except requests.RequestException as e:
            print(f'      [異常: {type(e).__name__}]', end='')
            return False
        except Exception as e:
            print(f'      [錯誤: {type(e).__name__}]', end='')
            return False

def retrieve_and_store_sites():
    """Sync the seed_sites table (rag DB) with active sites from ipsystem.

    Workflow:
      1. Query ws_sites/ws_dept in the ipsystem DB for in-service,
         externally reachable sites on 140.116.x.x.
      2. For each site (unless skip-listed), probe it over HTTP and
         upsert (site_name, site_url, domain, is_active, timestamps)
         into seed_sites, keyed on site_url.
      3. Mark seed_sites rows whose URL was not returned by this run's
         query as inactive.

    All writes go through one transaction on the rag connection; any
    error rolls it back. Errors are printed, never raised.
    """
    conn_ipsystem = None
    conn_rag = None
    try:
        # Connect to the ipsystem database (read side).
        conn_ipsystem = mysql.connector.connect(**DB_CONFIG_IPSYSTEM)
        cursor_query = conn_ipsystem.cursor(dictionary=True)

        # In-service external sites of non-deleted, type-1 departments.
        sql = """
        SELECT ws.site_name, ws.site_url 
        FROM ws_sites AS ws
        JOIN ws_dept AS wd ON ws.ws_dept = wd.dept_code
        WHERE wd.deleted = 0 
          AND wd.type = '1'
          AND ws.site_ext_ip LIKE '140.116.%' 
          AND ws.is_service = '1' 
          AND ws.is_external = '1'
        """

        cursor_query.execute(sql)
        sites = cursor_query.fetchall()
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 查詢完成，共取得 {len(sites)} 個網站')

        # Connect to the rag database (write side).
        conn_rag = mysql.connector.connect(**DB_CONFIG_RAG)
        cursor_insert = conn_rag.cursor(dictionary=True)

        # URLs seen in this run — used below to deactivate stale rows.
        current_urls = set(site['site_url'] for site in sites)

        insert_count = 0
        update_count = 0
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 開始處理網站資訊...')

        for idx, site in enumerate(sites, 1):
            site_name = site['site_name']
            site_url = site['site_url']

            domain = get_domain_from_url(site_url)

            # Skip-listed sites are neither probed nor written.
            if should_skip_site(site_url, domain):
                print(f'  [{idx}/{len(sites)}] 檢查網站: {site_name} ({site_url})... [略過]')
                continue

            # Probe the site; is_active mirrors HTTP reachability.
            print(f'  [{idx}/{len(sites)}] 檢查網站: {site_name} ({site_url})...', end=' ')
            is_active = 1 if check_website_exists(site_url) else 0
            print(f'[{"活躍" if is_active else "不活躍"}]')

            created_at = datetime.now()
            updated_at = datetime.now()

            # Upsert keyed on site_url: UPDATE when a row exists,
            # INSERT otherwise.
            check_sql = "SELECT id FROM seed_sites WHERE site_url = %s"
            cursor_insert.execute(check_sql, (site_url,))
            result = cursor_insert.fetchone()

            if result:
                update_sql = """
                UPDATE seed_sites 
                SET site_name = %s, domain = %s, is_active = %s, updated_at = %s
                WHERE site_url = %s
                """
                cursor_insert.execute(update_sql, (site_name, domain, is_active, updated_at, site_url))
                update_count += 1
                print(f'    → 已更新')
            else:
                insert_sql = """
                INSERT INTO seed_sites (site_name, site_url, domain, is_active, created_at, updated_at)
                VALUES (%s, %s, %s, %s, %s, %s)
                """
                cursor_insert.execute(insert_sql, (site_name, site_url, domain, is_active, created_at, updated_at))
                insert_count += 1
                print(f'    → 已插入')

        # Deactivate rows whose URL did not appear in this run's query.
        cleanup_count = 0
        if current_urls:
            # BUG FIX: this previously reused `created_at` from the loop
            # above, which is unbound (NameError) when every site was
            # skip-listed; use a fresh timestamp instead.
            cleanup_time = datetime.now()
            placeholders = ','.join(['%s'] * len(current_urls))
            cleanup_sql = f"""
            UPDATE seed_sites 
            SET is_active = 0, updated_at = %s
            WHERE site_url NOT IN ({placeholders})
            AND is_active = 1
            """
            cursor_insert.execute(cleanup_sql, [cleanup_time] + list(current_urls))
            cleanup_count = cursor_insert.rowcount
            if cleanup_count > 0:
                print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 已標記 {cleanup_count} 個不在列表中的網站為不活躍')

        # Commit the whole batch at once.
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 提交交易...')
        conn_rag.commit()
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 完成！ (新增: {insert_count}, 更新: {update_count}, 停用: {cleanup_count})')

    except mysql.connector.Error as e:
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 資料庫錯誤: {str(e)}')
        if conn_rag:
            conn_rag.rollback()
    except Exception as e:
        print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] 發生錯誤: {str(e)}')
        if conn_rag:
            conn_rag.rollback()
    finally:
        # Closing a connection also releases its cursors.
        if conn_ipsystem:
            conn_ipsystem.close()
        if conn_rag:
            conn_rag.close()

if __name__ == '__main__':
    # Flip to True to probe hardcoded URLs instead of running the full sync.
    DEBUG = False

    if not DEBUG:
        # Normal mode: run the complete site-list update.
        retrieve_and_store_sites()
    else:
        # URLs to test go here.
        test_urls = [
            'https://genaff.cc.ncku.edu.tw/cir/',
        ]

        print("[DEBUG 模式] 測試單一網址")
        print("=" * 60)

        for url in test_urls:
            print(f'\n測試: {url}')
            result = check_website_exists(url)
            print(f'結果: {"活躍" if result else "不活躍"}')

        print("\n" + "=" * 60)
