import sys
import json
import pdfplumber
import hashlib
import re
from collections import Counter
from typing import List
from ollama import Client
import os
from dotenv import load_dotenv


# Load variables from a local .env file (if present) into the environment
# before any of the settings below are read.
load_dotenv()

# Address of the Ollama server that the client connects to.
api_endpoint = os.environ.get("API_ENDPOINT")
# Name of the model passed to every chat request.
llm_model = os.environ.get("LLM_MODEL")
# Context window size forwarded to the model as the `num_ctx` option.
# NOTE(review): if CONTEXT_LENGTH is unset, int(None) raises TypeError at
# import time, before the script's JSON error handling can run — confirm the
# variable is always provided by the deployment.
num_ctx = int(os.environ.get("CONTEXT_LENGTH"))
# Shared client used for all LLM calls in this module.
client = Client(
    host=api_endpoint,
    headers={'Content-Type': 'application/json'},
)

# --- Prompt A: for pages that contain tables ---
# The table-parsing prompt asks the model to answer with a JSON array whose
# items have exactly two fields, `section_title` and `content`.
# The template is filled in with str.format(content=...), which is why every
# literal brace in the JSON examples is doubled ({{ and }}); `{content}` is
# the only real placeholder.
PROMPT_TABLE = """【角色】你是一個高階資料解析助理。
【任務】將輸入的文字內容（包含標題、表格、備註）轉換為 JSON Array。

【重要原則】
為了配合系統格式，所有內容（包含標題、表格數據、備註）都必須封裝在同一個 JSON Array 中。**嚴禁**在 JSON Array 之外輸出任何文字。

【JSON 欄位定義】
每一筆資料包含兩個欄位：
1. **section_title**: 章節或項目名稱。
2. **content**: 該項目的完整內容描述。

【處理邏輯 (請依序執行)】

**步驟 1：處理文件標題 (若存在)**
   - 檢查表格上方是否有標題文字。
   - 若有，請建立一個物件：
     {{ "section_title": "表單資訊", "content": "標題內容..." }}
   - 若無，則跳過此步驟。

**步驟 2：處理表格內容 (核心)**
   - 逐行解析 Markdown 表格。
   - **section_title**: 取自表格的第一欄（例如：婚假、事假）。
   - **content**: 將該列其餘欄位合併，並加上欄位名稱描述（例如：給假日數：8日；工資：照給）。
   - **合併規則**：若同一項目被切成多行，請合併為單一物件。

**步驟 3：處理備註 (若存在)**
   - 檢查表格下方是否有備註或說明文字。
   - 若有，請建立一個物件：
     {{ "section_title": "表單備註", "content": "備註內容..." }}
   - 若無，則跳過此步驟。

【JSON 輸出範例】
[
  {{
    "section_title": "表單資訊",
    "content": "國立成功大學教師請假一覽表"
  }},
  {{
    "section_title": "婚假",
    "content": "給假日數：8日，需檢附證明；工資給與：照給。"
  }},
  {{
    "section_title": "事假",
    "content": "給假日數：14日..."
  }},
  {{
    "section_title": "表單備註",
    "content": "1. 本表適用於編制內人員。 2. 請假需透過系統申請。"
  }}
]

【待處理內容】：
{content}

"""

# --- Prompt B: for plain-text regulation pages (pages without tables) ---
# Asks the model to drop administrative noise (amendment history, page
# numbers, file codes) and return the articles as a JSON array of
# `section_title` / `content` objects.
# Filled in with str.format(content=...); literal braces are doubled.
# NOTE(review): the worked example under 【範例對照】 shows a plain-text
# "correct output", while the rest of the prompt demands a JSON array — this
# may pull the model toward non-JSON answers; confirm whether it is intended.
PROMPT_TEXT = """【角色】你是一個法規結構化專家。
【任務】將輸入的法規文字去除行政雜訊並轉換為符合 RAG 資料庫格式的 JSON Array。

【輸入資料特性】
這是一份純文字法規文件，通常包含：
1. 法規標題。
2. **修法沿革** (一長串的日期與會議紀錄，如「xx年xx月xx日修正通過」) -> **這是雜訊，必須刪除**。
3. **現行條文** (以 一、二、... 或 第1條、第2條... 開頭)。
4. 頁碼或檔號 -> **這是雜訊，必須刪除**。

【欄位定義】
1. **section_title**: 條號或段落標題 (例如: "一、", "二、申請資格", "第四條")。
2. **content**: 該條文的完整內容 (包含項次如 (一)、1. 等細節)。

**處理邏輯 (過濾規則)：**
1. **標題保留**：保留最上方的法規名稱 (如「...作業要點」)。
2. **強力過濾 (重點)**：
   - **刪除** 所有「修正通過」的歷程紀錄 (無論有多少行，全部略過)。
   - **刪除** 頁碼 (如：Page 1, 第x頁, 1/5)。
   - **刪除** 頁首/頁尾的行政代碼 (如：檔號、保存年限)。
3. **條文整理**：
   - 保持「一、(一)、1.」的層次結構。
   - 去除文中不必要的斷行，將同一段落接續起來。
4. **通用性**：適用於任何條列式文件。   

【範例對照】
**原始輸入**：
國立成功大學XX要點
88年修正通過
99年修正通過
一、為提高...
(Page 1)

**正確輸出**：
國立成功大學XX要點
一、為提高...

【輸出格式範例】
[
  {{
    "section_title": "一、目的",
    "content": "為提高本校學術研究水準，依據..."
  }},
  {{
    "section_title": "二、申請資格",
    "content": "以在校服務滿二年以上者為原則。但有下列情形..."
  }}
]

【嚴禁】輸出 Markdown 代碼，只輸出純 JSON。
【待處理內容】：
{content}
"""

# === Helper: clean up and parse the JSON text returned by the LLM ===
def clean_and_parse_json(llm_output):
    """Parse the JSON payload of an LLM reply into a list of records.

    The reply is first stripped of Markdown code fences and parsed as a
    whole. If that fails (for example because the model wrapped the JSON in
    explanatory prose), the text is scanned for the first embedded JSON array
    or object made up of objects, and that value is used instead.

    Args:
        llm_output: Raw text returned by the model.

    Returns:
        A list. A top-level JSON array is returned as-is, a single JSON
        object is wrapped in a one-element list, and anything else
        (scalars, non-string input, undecodable text) yields ``[]``.
    """
    if not isinstance(llm_output, str):
        return []

    # 1. Drop Markdown code fences (```json ... ```).
    clean_text = re.sub(r'```json\s*', '', llm_output, flags=re.IGNORECASE)
    clean_text = clean_text.replace('```', '').strip()

    def as_list(value):
        # Callers iterate over the result, so always hand back a list.
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            return [value]
        return []

    # 2. Try the whole reply first.
    try:
        return as_list(json.loads(clean_text))
    except json.JSONDecodeError as e:
        # `e` is unbound once the except clause ends; keep it for the message.
        parse_error = e

    # 3. Fall back to the first embedded array/object that decodes cleanly.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r'[\[{]', clean_text):
        try:
            value, _ = decoder.raw_decode(clean_text, opener.start())
        except json.JSONDecodeError:
            continue
        records = as_list(value)
        # Skip bracketed fragments such as "[1]" that are not record data.
        if all(isinstance(record, dict) for record in records):
            return records

    # Diagnostics go to stderr so they never mix with JSON printed on stdout.
    print(f"JSON 解析失敗: {parse_error}", file=sys.stderr)
    return []
    
def detect_text_orientation(pdf_path, check_pages=3, min_chars_per_page=15):
    """Guess whether a PDF is typeset horizontally or vertically.

    Consecutive characters on the first few pages are compared pairwise: a
    pair that moves along X while staying level in Y votes "horizontal", a
    pair that moves along Y while staying level in X votes "vertical".

    Args:
        pdf_path: Path of the PDF file to inspect.
        check_pages: Number of leading pages to sample (default 3).
        min_chars_per_page: Pages with fewer characters than this are
            skipped (default 15).

    Returns:
        "vertical" or "horizontal". "horizontal" is also the answer when
        fewer than 10 pairs could be classified, when the vote is tied, or
        when anything goes wrong while reading the file.
    """

    def classify(first, second):
        """Return the vote of one adjacent character pair, or None."""
        # Font size scales every threshold; fall back to 12 when unusable.
        size = first.get("size", 12)
        if size <= 0:
            size = 12

        gap_x = abs(second["x0"] - first["x0"])
        gap_y = abs(second["top"] - first["top"])

        # Pairs that jump more than 2.5 font sizes are not adjacent glyphs:
        # both axes far apart, a column jump, or a line/paragraph break.
        jump = size * 2.5
        steady = size * 0.8
        far_x = gap_x > jump
        far_y = gap_y > jump
        if far_x and (far_y or gap_y < steady):
            return None
        if far_y and gap_x < steady:
            return None

        # 0.6 font sizes of drift is tolerated on the "still" axis, and at
        # least 0.3 font sizes of travel is required on the "moving" axis.
        slack = size * 0.6
        travel = size * 0.3
        if gap_y < slack and gap_x > travel:
            return "horizontal"
        if gap_x < slack and gap_y > travel:
            return "vertical"
        return None

    votes = {"horizontal": 0, "vertical": 0}

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages[:check_pages]:
                glyphs = page.chars
                if len(glyphs) < min_chars_per_page:
                    continue

                # Linear scan over neighbouring characters.
                for first, second in zip(glyphs, glyphs[1:]):
                    vote = classify(first, second)
                    if vote is not None:
                        votes[vote] += 1

        # Too few classified pairs (including none at all): default answer.
        if votes["horizontal"] + votes["vertical"] < 10:
            return "horizontal"

        # Mixed layouts are not reported; only the dominant direction wins.
        if votes["vertical"] > votes["horizontal"]:
            return "vertical"
        return "horizontal"

    except Exception:
        # Best-effort detection: any failure falls back to horizontal.
        return "horizontal"

def extract_vertical_text(pdf_path):
    """Extract the text of a vertically typeset PDF.

    Vertical text is read top to bottom, with columns running right to left,
    so the characters of each page are ordered by descending ``x0`` first and
    ascending ``top`` second. The sort uses the exact coordinates (no
    tolerance bucketing), which suits neatly aligned layouts.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages, each page followed by a newline.
    """
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Right-most column first, then downwards within the column.
            ordered = sorted(page.chars, key=lambda ch: (-ch['x0'], ch['top']))
            glyphs = "".join(ch.get('text', '') for ch in ordered)
            pages_text.append(glyphs + "\n")

    return "".join(pages_text)

# === Helpers: slice a page into alternating text / table segments ===
def _extract_band_text(page, top, bottom):
    """Return the stripped text of the full-width band between two Y values.

    Bands of height 1 or less (including negative heights) and bands without
    text yield an empty string.
    """
    if bottom - top <= 1:
        return ""
    text = page.crop((0, top, page.width, bottom)).extract_text()
    return text.strip() if text else ""


def _table_to_markdown(rows):
    """Render extracted table rows as a simple Markdown table.

    The header is a placeholder ("Col" per column, sized from the first
    row); line breaks inside cells are removed and None cells become "".
    """
    num_cols = len(rows[0])
    lines = [
        " | ".join(["Col"] * num_cols),
        " | ".join(["---"] * num_cols),
    ]
    for row in rows:
        cells = [
            str(cell).replace('\n', '').strip() if cell is not None else ""
            for cell in row
        ]
        lines.append(" | ".join(cells))
    return "\n".join(lines) + "\n"


def extract_mixed_content(page, valid_tables):
    """Extract a page that contains tables as text / table / text / ... slices.

    Tables are visited from top to bottom. For each one, the text between
    the previous slice and the table's top edge is extracted first, then the
    table itself is rendered as Markdown. Text below the last table is
    appended at the end.

    Args:
        page: Page object providing ``width``, ``height`` and ``crop(bbox)``.
        valid_tables: Table objects providing ``bbox`` and ``extract()``.
            The list is not modified.

    Returns:
        All segments joined with newlines; table segments are prefixed with
        the ``[表格資料]:`` marker.
    """
    segments = []
    current_y = 0

    # Work on a sorted copy so the caller's list keeps its order.
    for table in sorted(valid_tables, key=lambda t: t.bbox[1]):
        table_top = table.bbox[1]
        table_bottom = table.bbox[3]

        # 1. Text above the table.
        text = _extract_band_text(page, current_y, table_top)
        if text:
            segments.append(text)

        # 2. The table itself, as Markdown.
        data = table.extract()
        if data:
            segments.append(f"\n[表格資料]:\n{_table_to_markdown(data)}")

        # 3. Advance the cursor. It must never move back up: with tables that
        #    sit side by side or overlap, a shorter table would otherwise
        #    make the region of a taller one get extracted again as text.
        current_y = max(current_y, table_bottom)

    # 4. Text below the last table.
    text = _extract_band_text(page, current_y, page.height)
    if text:
        segments.append(text)

    return "\n".join(segments)

# === Main function: per-page router between table and plain-text handling ===
def _find_valid_tables(page, table_settings):
    """Return the tables on *page* that are big enough to be real tables.

    A table is kept when it has at least 2 columns and either at least 4
    columns or at least 2 rows; smaller detections are treated as noise.
    """
    valid_tables = []
    for table in page.find_tables(table_settings):
        data = table.extract()
        if not data:
            continue
        num_cols = len(data[0])
        num_rows = len(data)
        if num_cols >= 2 and (num_cols >= 4 or num_rows >= 2):
            valid_tables.append(table)
    return valid_tables


def process_pdf_auto_router(pdf_path):
    """Convert a PDF into a list of structured records, one page at a time.

    Each page is routed by content: pages with at least one valid table are
    sliced into text/table segments and sent to the model with PROMPT_TABLE;
    all other pages are sent as plain text with PROMPT_TEXT. Every record
    returned by the model is tagged with ``doc_title`` and ``page_num``.

    A failure on a single page (LLM call or response handling) is reported
    on stderr and that page is skipped. Diagnostics are written to stderr so
    that they never mix with JSON the script prints on stdout.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list of record dicts on success, or the string
        ``"PDF Error: <message>"`` when the document itself cannot be
        processed.
    """
    all_json_results = []

    table_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "snap_tolerance": 4,
        "join_tolerance": 4,
    }

    try:
        with pdfplumber.open(pdf_path) as pdf:
            doc_title = (pdf.metadata or {}).get("Title") or "未命名文件"

            for page_num, page in enumerate(pdf.pages, start=1):
                # --- Step 1: detect valid tables ---
                valid_tables = _find_valid_tables(page, table_settings)

                # --- Step 2: route ---
                if valid_tables:
                    # Table mode: interleave surrounding text with tables.
                    content_for_llm = extract_mixed_content(page, valid_tables)
                    selected_prompt = PROMPT_TABLE
                else:
                    # Plain-text mode. Guard against a None result so the
                    # debug line below cannot abort the whole document.
                    content_for_llm = page.extract_text() or ""
                    selected_prompt = PROMPT_TEXT
                    print("[debog]content_for_llm:" + content_for_llm, file=sys.stderr)

                if not content_for_llm.strip():
                    continue

                # --- Step 3: call the LLM for this page ---
                try:
                    response = client.chat(
                        model=llm_model,
                        options={'num_ctx': num_ctx, 'temperature': 0.1},
                        messages=[{'role': 'user', 'content': selected_prompt.format(content=content_for_llm)}]
                    )
                    result_text = response["message"]["content"]
                    print("[debug]llm return:" + result_text, file=sys.stderr)

                    # --- Step 4: parse the JSON answer ---
                    parsed_json = clean_and_parse_json(result_text)

                    # The model may answer with a single object instead of
                    # an array; anything that is not a list is dropped.
                    if isinstance(parsed_json, dict):
                        parsed_json = [parsed_json]
                    elif not isinstance(parsed_json, list):
                        parsed_json = []

                    # --- Step 5: inject document-level fields ---
                    for item in parsed_json:
                        if not isinstance(item, dict):
                            continue  # only objects can carry the extra keys
                        item['doc_title'] = doc_title
                        item['page_num'] = page_num
                        all_json_results.append(item)

                except json.JSONDecodeError:
                    print(f"[錯誤] Chunk {page_num} 解析 JSON 失敗，跳過。", file=sys.stderr)
                except Exception as e:
                    print(f"[錯誤] 發生未預期錯誤: {e}", file=sys.stderr)

    except Exception as e:
        return f"PDF Error: {str(e)}"

    return all_json_results

def main(argv):
    """Run the converter from the command line and return the exit code.

    On success the records are printed on stdout as a single JSON document
    (non-ASCII characters are kept readable) and 0 is returned. On failure a
    JSON object with ``error`` and ``type`` keys is printed and 1 is
    returned, so the calling process can tell the run failed.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the PDF path.

    Returns:
        0 on success, 1 on a usage error or a processing failure.
    """
    if len(argv) < 2:
        print("Usage: python parse_pdf.py <pdf_path>")
        return 1

    try:
        final_data = process_pdf_auto_router(argv[1])

        # The router reports document-level failures as a "PDF Error: ..."
        # string; treat that as a failure instead of a successful payload.
        if isinstance(final_data, str):
            error_msg = {"error": final_data, "type": "PDFError"}
            print(json.dumps(error_msg, ensure_ascii=False))
            return 1

        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX.
        print(json.dumps(final_data, ensure_ascii=False))
        return 0
    except Exception as e:
        # Report any other error in the same JSON shape.
        error_msg = {"error": str(e), "type": type(e).__name__}
        print(json.dumps(error_msg, ensure_ascii=False))
        return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))