#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import json
import hashlib
import subprocess
from datetime import datetime
from pathlib import Path
try:
    # All third-party requirements are imported inside one guard so that a
    # missing package yields a friendly install hint instead of a raw
    # ImportError traceback (previously dotenv and mysql.connector were
    # imported before the guard and escaped it).
    from dotenv import load_dotenv
    import mysql.connector
    import requests
except ImportError:
    print("Error: Required packages not installed. Please install: requests mysql-connector-python python-dotenv")
    sys.exit(1)

# Database configuration (adjust these based on your connect.php settings)

# Credentials and schema names come from a .env file next to the script.
load_dotenv()
dbhost = os.getenv("DB_HOST")          # MySQL server host
database = os.getenv("DATABASE")       # schema holding the pdf_ncku table
database_2 = os.getenv("DATABASE_2")   # schema holding the ws_sites_urls_* table
dbuser = os.getenv("DB_USER")
dbpwd = os.getenv("DB_PWD")

# PDF cache directory (created on demand; downloads are removed after processing)
PDF_CACHE_DIR = 'pdf_cache_cli'

# Python parser configuration: interpreter inside the RAG venv and the
# parse script it runs; the script prints JSON on stdout.
PYTHON_PARSER = '/var/www/html/RAG/RAG_venv/bin/python'
PARSE_SCRIPT = '/var/www/html/RAG/parse_pdf.py'


def get_db_connection(db_name):
    """Open a MySQL connection to *db_name* using the .env credentials.

    Args:
        db_name: Name of the schema to connect to.

    Returns:
        An open ``mysql.connector`` connection.

    Raises:
        mysql.connector.Error: re-raised after printing a short
            diagnostic when the connection cannot be established.
    """
    try:
        connection = mysql.connector.connect(
            host=dbhost,
            user=dbuser,
            password=dbpwd,
            database=db_name,
        )
    except mysql.connector.Error as e:
        print(f"Database connection error: {e}")
        raise
    return connection


def starts_with(text, prefix):
    """Report whether *text* begins with *prefix*.

    Thin convenience wrapper around ``str.startswith`` (kept for API
    parity; callers may prefer the named helper for readability).
    """
    return text.startswith(prefix)


def download_pdf(url, save_dir=PDF_CACHE_DIR):
    """Download a PDF from *url* into the local cache directory.

    The file is stored under the MD5 of its content, so identical
    documents map to the same path and are never written twice (MD5 is
    used only for cache naming, not for security).

    Args:
        url: HTTP(S) URL expected to serve a PDF document.
        save_dir: Cache directory; created on demand.

    Returns:
        Local filesystem path of the cached PDF.

    Raises:
        Exception: on HTTP errors, non-PDF content type, empty
            responses, or network failures.
    """
    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Keep the try scope narrow: only the request itself can raise a
    # RequestException.  Chain the cause so network tracebacks stay
    # diagnosable (the previous version dropped the original exception).
    try:
        response = requests.get(url, headers=headers, timeout=30, allow_redirects=True)
    except requests.exceptions.RequestException as e:
        raise Exception(f"Download error: {str(e)}") from e

    # Check HTTP status code (404 gets a distinct message).
    if response.status_code == 404:
        raise Exception(f"URL 不存在 (404): {url}")

    if response.status_code >= 400:
        raise Exception(f"HTTP 錯誤 ({response.status_code}): {url}")

    # Check if it's actually a PDF
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' not in content_type:
        raise Exception(f"Not a PDF or content-type check failed: {url}")

    data = response.content
    if not data:
        raise Exception(f"Download failed or empty response: {url}")

    # Content-hash file name deduplicates identical downloads.
    file_hash = hashlib.md5(data).hexdigest()
    local_path = os.path.join(save_dir, f"{file_hash}.pdf")

    # If file already exists, skip the write.
    if os.path.exists(local_path):
        return local_path

    with open(local_path, 'wb') as f:
        f.write(data)

    return local_path


def parse_pdf_with_python(pdf_path):
    """Run the external parse_pdf.py script on *pdf_path* and return its result.

    The parser is expected to print a JSON document on stdout; a dict
    with ``skipped: true`` marks a blacklisted file.

    Args:
        pdf_path: Local path of the PDF to parse.

    Returns:
        The decoded JSON payload, or ``None`` when the parser marked the
        file as skipped.

    Raises:
        Exception: when the subprocess fails, times out, or emits empty
            or invalid output.

    Note: the previous version wrapped the whole body in a broad
    ``except Exception`` that re-caught its own deliberately-raised
    exceptions and double-wrapped the messages ("Parser error: Python
    parser failed: ..."); the try scopes are now narrow so each failure
    surfaces its original message.
    """
    cmd = [PYTHON_PARSER, PARSE_SCRIPT, pdf_path]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except subprocess.TimeoutExpired:
        raise Exception("Python parser timeout") from None

    if result.returncode != 0:
        raise Exception(f"Python parser failed: {result.stderr}")

    output = result.stdout
    if not output:
        raise Exception("Python parser returned empty output")

    # Parse JSON output
    try:
        data = json.loads(output)
    except json.JSONDecodeError as e:
        print(f"JSON parse error: {e}")
        raise Exception(f"Invalid JSON from Python parser: {str(e)}") from e

    # NOTE(review): only dicts are accepted here, yet the caller expects a
    # list of rows — presumably the parser wraps rows in a dict; confirm
    # against parse_pdf.py before changing this check.
    if not isinstance(data, dict):
        print(f"<div style='background-color:#ffebeb; border:1px solid red; padding:10px; margin:10px;'>")
        print(f"<strong>[DEBUG] Python Output Error:</strong><br>")
        print(f"<pre>{output}</pre>")
        print(f"</div>")
        raise Exception("Invalid JSON from Python parser. See DEBUG info above.")

    # Blacklisted file (e.g. meeting minutes): report and signal skip.
    if data.get('skipped') is True:
        reason = data.get('reason', 'Unknown')
        doc_title = data.get('doc_title', 'Unknown')
        print(f"[Skip] {reason}: {doc_title}<br>")
        return None

    return data


def process_pdf_files():
    """Main driver: download, hash, parse and store PDFs listed in the URL table.

    Workflow per URL row:
      1. download the PDF (rows whose download fails are skipped),
      2. skip re-processing when the SHA-256 content hash already exists
         in ``pdf_ncku`` (content unchanged),
      3. otherwise parse the PDF and insert/update one ``pdf_ncku`` row
         per parsed section.

    All ``pdf_ncku`` changes are committed at the end and rolled back on
    any error.  Output is HTML-formatted (``<br>``/``<hr>``), presumably
    because the script's output is rendered in a browser.
    """
    # Connect to databases
    conn_rag = get_db_connection(database)
    conn_ipsystem = get_db_connection(database_2)

    # Pre-bind so the finally block cannot hit a NameError when cursor
    # creation itself fails (previously that masked the real error).
    cursor_rag = None
    cursor_ipsystem = None
    try:
        cursor_rag = conn_rag.cursor(dictionary=True)
        cursor_ipsystem = conn_ipsystem.cursor(dictionary=True)

        # Query to get URLs to process
        sql = "SELECT id, url, url_redirect, status FROM ws_sites_urls_20250924 WHERE id = %s ORDER BY id ASC"

        # Test with specific ID
        test_id = 10526  # Adjust this ID as needed

        num = 0
        insert_num = 0
        update_num = 0

        cursor_ipsystem.execute(sql, (test_id,))
        pages = cursor_ipsystem.fetchall()

        for page in pages:
            num += 1

            # Use url_redirect if available, otherwise use url
            pdf_url = page.get('url_redirect') or page.get('url')

            try:
                local_pdf = download_pdf(pdf_url)
            except Exception as e:
                print(f"[Skip] {pdf_url} - Reason: {str(e)}<br>")
                continue

            # try/finally guarantees the cached download is removed on
            # every exit path (the previous version leaked the file on
            # the blacklist/empty-parse `continue` paths and on errors).
            try:
                # Calculate SHA256 hash of file content
                with open(local_pdf, 'rb') as f:
                    file_hash = hashlib.sha256(f.read()).hexdigest()

                # Unchanged content (same hash) needs no re-parse.
                sql_check_hash = "SELECT file_hash FROM pdf_ncku WHERE file_hash = %s LIMIT 1"
                cursor_rag.execute(sql_check_hash, (file_hash,))
                existing_hash_record = cursor_rag.fetchone()

                if existing_hash_record and existing_hash_record.get('file_hash'):
                    print(f"[Skip Processing] {pdf_url} (file_hash 相同，檔案內容未變: {file_hash})<br>")
                    continue

                # Hash is new or changed: re-parse the document.
                print(f"[開始處理] {pdf_url} - 檔案內容變更或首次處理<br>")
                rows = parse_pdf_with_python(local_pdf)

                # Case A: Blacklist file (e.g., meeting minutes)
                if rows is None:
                    print(f" -> 識別為黑名單<br>")
                    continue

                # Case B: File parsing result is empty
                if not rows or not isinstance(rows, list):
                    print(f" -> 檔案解析結果為空，異常/需檢查。<br>")
                    continue

                # Insert or update one row per parsed section.
                for row in rows:
                    # 'delete' status marks the URL as no longer available.
                    url_available = 0 if page.get('status') == 'delete' else 1
                    if _upsert_pdf_section(cursor_rag, page, pdf_url,
                                           file_hash, url_available, row):
                        insert_num += 1
                    else:
                        update_num += 1
            finally:
                # Clean up the downloaded PDF file.
                if os.path.exists(local_pdf):
                    os.remove(local_pdf)

        # Commit all changes (rollback below undoes everything on error).
        conn_rag.commit()

        # Print summary
        print("<hr>")
        print(f"Total: {num}<br>")
        print(f"Insert num: {insert_num}<br>")
        print(f"Update num: {update_num}<br>")

    except Exception as e:
        print(f"Error: {str(e)}")
        conn_rag.rollback()
        raise
    finally:
        if cursor_ipsystem is not None:
            cursor_ipsystem.close()
        if cursor_rag is not None:
            cursor_rag.close()
        conn_ipsystem.close()
        conn_rag.close()


def _upsert_pdf_section(cursor_rag, page, pdf_url, file_hash, url_available, row):
    """Insert or update a single parsed PDF section in ``pdf_ncku``.

    Matches an existing row on (url, page_num, section_title); inserts a
    new record when none exists, updates it otherwise.

    Returns:
        True when a new row was inserted, False when an existing row was
        updated.
    """
    # Extract parsed fields
    doc_title = row.get('doc_title', '')
    section_title = row.get('section_title', '')
    content = row.get('content', '')
    page_num = row.get('page_num', 1)
    doc_date = row.get('doc_date', '')  # currently only logged, not stored

    # Check if record exists
    sql_check = "SELECT id FROM pdf_ncku WHERE url = %s AND page_num = %s AND section_title = %s LIMIT 1"
    cursor_rag.execute(sql_check, (pdf_url, page_num, section_title))
    rec = cursor_rag.fetchone()

    print(f"[DEBUG] pdfUrl={pdf_url}<br>")
    print(f"[DEBUG] doc_title={doc_title}<br>")
    print(f"[DEBUG] doc_date={doc_date}<br>")
    print(f"[DEBUG] page_num={page_num}<br>")
    print(f"[DEBUG] section_title={section_title}<br>")
    print(f"[DEBUG] content={content}<br>")

    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    if not rec or not rec.get('id'):  # Record doesn't exist
        sql_insert = """
            INSERT INTO pdf_ncku
            (url, title, file_hash, page_num, section_title, content, url_available, embedding_status, last_fetched)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor_rag.execute(sql_insert, [
            pdf_url,
            doc_title,
            file_hash,
            page_num,
            section_title,
            content,
            url_available,
            'pending',
            now,
        ])
        print(f"[INSERT][{page['id']}] {pdf_url} (page={page_num}, url status={page.get('status')}, available={url_available})<br>")
        return True

    sql_update = """
        UPDATE pdf_ncku 
        SET title=%s, file_hash=%s, page_num=%s, section_title=%s, content=%s, 
            url_available=%s, embedding_status=%s, last_fetched=%s 
        WHERE id=%s
    """
    cursor_rag.execute(sql_update, [
        doc_title,
        file_hash,
        page_num,
        section_title,
        content,
        url_available,
        'pending',
        now,
        rec['id'],
    ])
    print(f"[UPDATE][{page['id']}] {pdf_url} (page={page_num}, url status={page.get('status')}, available={url_available})<br>")
    return False


def _main():
    """Script entry point: run the pipeline, mapping any failure to exit code 1."""
    try:
        process_pdf_files()
    except Exception as e:
        print(f"Fatal error: {str(e)}")
        sys.exit(1)


if __name__ == '__main__':
    _main()
