import pdfplumber
import sys
import os
from ollama import Client

# Alternative endpoint/model settings, currently disabled:
#OLLAMA_SERVER = "http://140.116.240.181:45015/"
#STAGE2_MODEL = "mistral-small:24b-instruct-2501-q4_K_M"
# Host URL handed to the Ollama client below.
OLLAMA_SERVER="https://primehub.aic.ncku.edu.tw/console/apps/ollama-0-13-4-xa9c7"
# Model name passed as `model=` to client.chat() in process_pdf_auto_router().
STAGE2_MODEL = "gpt-oss:20b"
# Module-level Ollama client, constructed at import time with an explicit
# JSON Content-Type header; shared by every LLM call in this file.
client = Client(host=OLLAMA_SERVER, headers={'Content-Type': 'application/json'})

# --- Prompt A: for pages containing tables (kept as the original, best-performing version) ---
# Template used by process_pdf_auto_router() for pages where at least one
# table was kept. It is filled with str.format(content=...), so `{content}` is
# the only placeholder; any literal braces added to this text must be doubled.
# The `\n` sequences in the "scenario A" example are real escape sequences
# (this is not a raw string), so they reach the model as line breaks.
# NOTE(review): step 1 tells the model to output the title with no prefix, but
# the "scenario A" example and rule 1 of the output-format section both show a
# 「【文件標題】：」 prefix — confirm which form is intended.
PROMPT_TABLE = """【角色】你是一個高階資料解析助理，負責將 PDF 內容轉換為結構化資料。
【輸入資料特性】
輸入的文字是依據原始排版順序呈現，結構是不固定的。可能包含（也可能不包含）：
1. 標題或前言 (可能位於表格上方)
2. Markdown 表格 (核心資料)
3. 備註或補充說明 (可能位於表格下方)

【核心任務】
請掃描輸入內容，依據**實際存在的內容**進行結構化輸出。

**處理邏輯 (依序執行，若無該區塊則跳過)：**

**1. 偵測標題 (Optional)**
    - 檢查表格上方是否有文字段落。
    - 若有，請判斷是否為文件標題或法規名稱。
    - **若存在**：請**直接輸出該段文字**，保持原樣，**不需要**加上任何前綴。
    - **若不存在** (內容直接從表格開始)，則**完全不要**輸出此區塊。

**2. 解析表格 (Required)**
    - 這是核心資料，一定存在。請依據 Markdown 表格內容生成鍵值對。
    - **合併規則**：若多行內容屬於同一筆資料 (第一欄空白或重複)，請將內容合併為單一字串 (用「；」連接)。
    - 格式：依據表頭自動生成，如「欄位名：內容」。
    【範例對照 (抽象化範例)】
    **正確 (合併)**：
    假別：特別休假，給假日數：6個月以上1年未滿3日；1年以上2年未滿7日；2年以上3年未滿10日；3年以上5年未滿每年14日；5年以上10年未滿每年15日；10年以上每年加1日至30日，工資給與：工資照給。

**3. 偵測備註 (Optional)**
    - 檢查表格下方是否有文字段落。
    - **若存在**，請加上前綴「【其他資訊】：」並輸出。
    - **若不存在**，則**完全不要**輸出此區塊。

【範例對照】
情境 A (全都有)：
輸出：【文件標題】：XX表 \n 欄位：內容... \n 【其他資訊】：備註...

情境 B (只有表格)：
輸出：欄位：內容...

【輸出格式嚴格規範】
1. **嚴禁** 無中生有。如果沒有標題，就不要寫「【文件標題】：(空)」。
2. **嚴禁** 輸出 Markdown 代碼。
3. **表格合併** 必須準確，不要讓同一筆資料斷成兩行。

【待處理內容】：
{content}
"""

# --- Prompt B: for text-only pages ---
# Template used by process_pdf_auto_router() for pages where no table was
# kept. It asks the model to keep the regulation title and current articles
# while dropping amendment-history lines, page numbers and filing codes.
# Filled with str.format(content=...), so `{content}` is the only placeholder;
# any literal braces added to this text must be doubled.
PROMPT_TEXT = """【角色】你是一個法規文件結構化專家，擅長提取現行條文並去除行政雜訊。

【輸入資料特性】
這是一份純文字法規文件，通常包含：
1. 法規標題。
2. **修法沿革** (一長串的日期與會議紀錄，如「xx年xx月xx日修正通過」) -> **這是雜訊，必須刪除**。
3. **現行條文** (以 一、二、... 或 第1條、第2條... 開頭)。
4. 頁碼或檔號 -> **這是雜訊，必須刪除**。

【核心任務】
請去除雜訊，僅保留「法規標題」與「現行條文內容」。

**處理邏輯 (過濾規則)：**
1. **標題保留**：保留最上方的法規名稱 (如「...作業要點」)。
2. **強力過濾 (重點)**：
   - **刪除** 所有「修正通過」的歷程紀錄 (無論有多少行，全部略過)。
   - **刪除** 頁碼 (如：Page 1, 第x頁, 1/5)。
   - **刪除** 頁首/頁尾的行政代碼 (如：檔號、保存年限)。
3. **條文整理**：
   - 保持「一、(一)、1.」的層次結構。
   - 去除文中不必要的斷行，將同一段落接續起來。

【範例對照】
**原始輸入**：
國立成功大學XX要點
88年修正通過
99年修正通過
一、為提高...
(Page 1)

**正確輸出**：
國立成功大學XX要點
一、為提高...

【輸出格式嚴格規範】
1. **嚴禁** 輸出任何「修正通過」的日期行。
2. **嚴禁** 輸出 Markdown 代碼。
3. 直接輸出淨化後的內容。

【待處理內容】：
{content}
"""

# === Helpers: slicing logic (interleave page text and tables top to bottom) ===
def _extract_band_text(page, top, bottom):
    """Return the stripped text of the full-width band between top and bottom.

    Returns "" when the band is at most 1 unit tall (including inverted bands)
    or when the cropped region yields no text.
    """
    if bottom - top <= 1:
        return ""
    text = page.crop((0, top, page.width, bottom)).extract_text()
    return text.strip() if text else ""


def _rows_to_markdown(rows):
    """Render extracted table rows as a simple Markdown table.

    The header is a row of placeholder "Col" names sized from the first row;
    every extracted row (including the first) is emitted as a body row.
    Newlines inside a cell are removed and None cells become empty strings.
    """
    num_cols = len(rows[0])
    lines = [
        " | ".join(["Col"] * num_cols),
        " | ".join(["---"] * num_cols),
    ]
    for row in rows:
        lines.append(" | ".join(
            "" if cell is None else str(cell).replace('\n', '').strip()
            for cell in row
        ))
    return "\n".join(lines) + "\n"


def extract_mixed_content(page, valid_tables):
    """Serialize a page that contains tables as text -> table -> text -> ...

    The page is cut into horizontal bands using each table's vertical extent
    (``bbox[1]`` as top, ``bbox[3]`` as bottom): the text above a table is
    extracted, then the table itself is rendered as Markdown, and finally the
    text below the last table is appended.

    Args:
        page: Page-like object exposing ``width``, ``height`` and
            ``crop(bbox)``; the cropped result must expose ``extract_text()``.
        valid_tables: Table-like objects exposing ``bbox`` and ``extract()``
            (a list of rows of cells). The list is not modified.

    Returns:
        The collected pieces joined with newlines; each table piece is
        introduced by a "[表格資料]:" marker line.
    """
    page_content = []
    current_y = 0

    # Work on a sorted copy so the caller's list keeps its original order.
    for table in sorted(valid_tables, key=lambda t: t.bbox[1]):
        table_top, table_bottom = table.bbox[1], table.bbox[3]

        # 1. Text located above this table (and below the previous one).
        text = _extract_band_text(page, current_y, table_top)
        if text:
            page_content.append(text)

        # 2. The table itself, as Markdown.
        data = table.extract()
        if data:
            page_content.append(f"\n[表格資料]:\n{_rows_to_markdown(data)}")

        # 3. Advance the cursor. It must never move back up: with overlapping
        #    or nested tables a later table can end above an earlier one, and
        #    rewinding would re-extract table content as plain text.
        current_y = max(current_y, table_bottom)

    # 4. Text below the last table (or the whole page if there were none).
    text = _extract_band_text(page, current_y, page.height)
    if text:
        page_content.append(text)

    return "\n".join(page_content)

# === Main function: per-page automatic router ===
def process_pdf_auto_router(pdf_path):
    """Send every page of a PDF to the LLM and return the combined answers.

    For each page, line-based table detection is run and small tables are
    discarded. A page that keeps at least one table is serialized with
    extract_mixed_content() and sent with PROMPT_TABLE; any other page is sent
    as plain extracted text with PROMPT_TEXT. Pages that yield no text are
    skipped. The chosen mode and each LLM answer are printed as they occur.

    Args:
        pdf_path: Path handed to pdfplumber.open().

    Returns:
        The per-page LLM answers joined by blank lines. A page whose LLM call
        raises contributes the placeholder "LLM Error on page N" instead. If
        an exception escapes the PDF processing itself, the string
        "PDF Error: <message>" is returned.
    """
    finder_options = dict(
        vertical_strategy="lines",
        horizontal_strategy="lines",
        snap_tolerance=4,
        join_tolerance=4,
    )
    page_results = []

    try:
        with pdfplumber.open(pdf_path) as document:
            for page_index, page in enumerate(document.pages):
                # --- Step 1: keep only tables large enough to be real data ---
                kept_tables = []
                for candidate in page.find_tables(finder_options):
                    rows = candidate.extract()
                    if not rows:
                        continue
                    width = len(rows[0])
                    # Rule: at least 2 columns, and either 4+ columns or 2+ rows.
                    if width < 2:
                        continue
                    if width >= 4 or len(rows) >= 2:
                        kept_tables.append(candidate)

                # --- Step 2: route the page to the matching prompt ---
                if kept_tables:
                    print(f"模式: [表格修復模式] - 偵測到 {len(kept_tables)} 個表格")
                    template = PROMPT_TABLE
                    page_text = extract_mixed_content(page, kept_tables)
                else:
                    print("模式: [純文字模式] - 無表格，執行法規解析")
                    template = PROMPT_TEXT
                    page_text = page.extract_text()

                # Nothing extractable on this page: no LLM call.
                if not page_text or not page_text.strip():
                    continue

                # --- Step 3: one LLM call per page ---
                try:
                    reply = client.chat(
                        model=STAGE2_MODEL,
                        options={'num_ctx': 131072, 'temperature': 0.0},
                        messages=[{
                            'role': 'user',
                            'content': template.format(content=page_text),
                        }],
                    )
                    answer = reply["message"]["content"]
                    page_results.append(answer)

                    # Echo the answer immediately to ease debugging.
                    print("--- LLM 解析結果 ---")
                    print(answer)
                except Exception as llm_error:
                    # A failed page must not abort the remaining pages.
                    print(f"LLM Error: {llm_error}")
                    page_results.append(f"LLM Error on page {page_index + 1}")

    except Exception as pdf_error:
        return "PDF Error: " + str(pdf_error)

    return "\n\n".join(page_results)

# Script entry point
if __name__ == "__main__":
    # An optional first command-line argument selects the PDF to process;
    # without it the bundled sample below is used, as before.
    # Another sample that has been used here: "pdf_cache2/holiday.pdf"
    target_pdf = sys.argv[1] if len(sys.argv) > 1 else "pdf_cache2/reg09-03.pdf"
    # The return value is not used here; the function prints each page's
    # result as it is produced.
    process_pdf_auto_router(target_pdf)