import sys
import json
import pdfplumber
import hashlib
import re
from collections import Counter
from typing import List
from ollama import Client
import os

# ==========================================
# 1. Regex 過濾與標題規則
# ==========================================

# Regular expressions describing noise lines. clean_text_lines() drops a line
# as soon as re.search() finds any of these anywhere in it, so un-anchored
# patterns match mid-line as well.
NOISE_PATTERNS = [
    r'Page\s*\d+',  # "Page 3"
    r'-\s*\d+\s*-',  # "- 3 -"
    r'\d+\s*/\s*\d+',  # "3 / 10"; NOTE(review): also hits any "digits/digits" such as fractions
    r'第\s*\d+\s*頁.*',  # "第 3 頁" (page N) and whatever follows it
    r'中華民國\d+年.*(修正|訂定|公布)',  # "中華民國<year>年 ... 修正/訂定/公布" (amended/enacted/promulgated)
    r'\d{2,3}\.\d{1,2}\.\d{1,2}.*(修正|訂定)',  # dotted date such as "98.12.31" followed by 修正/訂定
    r'\d{1,3}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.*?(修正|訂定|公布)',  # "<y>年<m>月<d>日 ... 修正/訂定/公布"
    r'\d{1,3}\s*學年度.*?(修正|訂定|公布)',  # "<n>學年度 ... 修正/訂定/公布" (academic year)
    r'[-─＿_＊*]{5,}',  # runs of five or more dash/underscore/asterisk characters
    r'（本頁空白）',  # "(this page is blank)"
    r'^第[一二三四五六七八九十]+章$',  # a line consisting only of "第<Chinese numeral>章" (chapter N)
    r'^※.*',  # lines starting with the "※" mark
    r'^備註.*',  # lines starting with "備註" (remarks)
]

# Substrings marking watermark text; clean_text_lines() drops any line that
# contains one of them.
WATERMARK_KEYWORDS = ['僅供參考', '草案', 'DRAFT', 'CONFIDENTIAL']

# Compiled patterns for section headings. extract_section_title() applies
# them with .match(), i.e. anchored at the start of the stripped line.
SECTION_PATTERNS = [
    re.compile(r'^\s*[一二三四五六七八九十]+、.+'),  # "一、..." (Chinese numeral + 、)
    re.compile(r'^\s*[壹貳參肆伍陸柒捌玖拾]+、.+'),  # "壹、..." (formal Chinese numeral + 、)
    re.compile(r'^\s*[(（][一二三四五六七八九十]+[)）].+'),  # "(一)..." or "（一）..."
    re.compile(r'^\s*\d+[.、]\s*.+'),  # "1. ..." or "1、..."
    re.compile(r'^\s*[(（]\d+[)）].+'),  # "(1)..." or "（1）..."
    re.compile(r'^.+?(名單|一覽表|對照表|清冊|簡表|統計表)$'),  # lines ending in a list/table word
]

from dotenv import load_dotenv
# Runtime configuration is read from environment variables; load_dotenv() is
# called first (python-dotenv: presumably loads a local .env file — confirm
# the expected file location for deployments).
load_dotenv()
api_endpoint = os.getenv("API_ENDPOINT")  # Ollama server address, used as Client(host=...)
llm_model = os.getenv("LLM_MODEL")  # model name handed to client.generate()
# Value sent as the 'num_ctx' generation option.
# NOTE(review): os.getenv() returns None when CONTEXT_LENGTH is unset, so
# int() raises TypeError while the module is being imported.
num_ctx = int(os.getenv("CONTEXT_LENGTH"))

# ==========================================
# 2. 判斷與 LLM 介面
# ==========================================

def detect_complex_page(page, text_lines: List[str]) -> bool:
    """Decide from layout geometry alone whether a page is a table/complex page.

    No keywords are consulted. Three checks run in order and the first hit
    returns True:

    1. ``page.find_tables`` with the "lines" strategy finds at least one table.
    2. The page carries more than 5 line objects or more than 5 rect objects.
    3. More than 30% of the text rows contain two neighbouring words that are
       more than 20 units apart horizontally (borderless, column-aligned text).

    Args:
        page: Object exposing ``find_tables()``, ``lines``, ``rects`` and
            ``extract_words()`` (parse_pdf passes a pdfplumber page).
        text_lines: Accepted for interface compatibility; not read here.

    Returns:
        True when any check fires, otherwise False (including pages with no
        words at all).
    """
    # Check 1: explicit ruling lines forming a table.
    ruled_tables = page.find_tables({
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
    })
    if ruled_tables:
        return True

    # Check 2: many drawing primitives on the page.
    if len(page.lines) > 5 or len(page.rects) > 5:
        return True

    # Check 3: group words into rows by snapping "top" to a 3-unit grid.
    buckets = {}
    for word in page.extract_words():
        buckets.setdefault(round(word['top'] / 3) * 3, []).append(word)

    if not buckets:
        return False

    def _has_column_gap(words_in_row) -> bool:
        # Left-to-right order, then compare each word with its right neighbour.
        ordered = sorted(words_in_row, key=lambda item: item['x0'])
        return any(
            right['x0'] - left['x1'] > 20
            for left, right in zip(ordered, ordered[1:])
        )

    wide_rows = sum(
        1
        for words_in_row in buckets.values()
        if len(words_in_row) >= 2 and _has_column_gap(words_in_row)
    )

    return wide_rows / len(buckets) > 0.3

# ==========================================
# 3. 策略二核心：表格結構化提取
# ==========================================

# First-column values that mark a row as a continuation of the row above it.
_CONTINUATION_MARKERS = ('', '-', '- - -')

# A first-column label containing one of these is treated as header/level
# text rather than the start of a new item.
_NON_ITEM_KEYWORDS = ('(Cambridge', '(GEPT', 'ALTE', 'Level', '級', '型態')


def _split_markdown_row(line: str) -> List[str]:
    """Split one pipe-delimited row into stripped cells, keeping empty cells."""
    body = line.strip()
    # Only the outer delimiters are removed; interior empty cells must stay so
    # that every cell keeps its column position.
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|'):
        body = body[:-1]
    return [cell.strip() for cell in body.split('|')]


def _append_cell(existing: str, addition: str) -> str:
    """Join two cell values with "；", or return the addition if the cell is empty."""
    return existing + "；" + addition if existing.strip() else addition


def _merge_blank_leading_rows(rows: List[List[str]]) -> List[List[str]]:
    """Fold rows whose first cell is blank into the closest preceding main row."""
    merged_rows: List[List[str]] = []
    for row in rows:
        # A blank-first-cell row with nothing above it is kept as a main row
        # instead of being dropped.
        if row[0] or not merged_rows:
            merged_rows.append(list(row))
            continue

        parent = merged_rows[-1]
        # Cells beyond the parent's width have no column to go to and are ignored.
        for col, content in enumerate(row[:len(parent)]):
            if col == 0 or not content:
                continue
            if col == 1:
                # Second column accumulates every sub-item.
                parent[1] = _append_cell(parent[1], content)
            elif not parent[col].strip():
                # Other columns are only filled when the parent cell is empty.
                parent[col] = content
    return merged_rows


def _merge_exam_continuations(rows: List[List[str]]) -> List[List[str]]:
    """Fold continuation rows (blank / '-' / '- - -' first cell) into their item row."""
    merged_rows: List[List[str]] = []
    i = 0
    while i < len(rows):
        current = rows[i]
        # The item label is the first cell, or the second one if the first is blank.
        label = current[0] or (current[1] if len(current) > 1 else "")
        starts_item = (
            label not in _CONTINUATION_MARKERS
            and not any(kw in label for kw in _NON_ITEM_KEYWORDS)
        )
        if not starts_item:
            merged_rows.append(current)
            i += 1
            continue

        item = list(current)
        j = i + 1
        while j < len(rows) and rows[j][0] in _CONTINUATION_MARKERS:
            for col, content in enumerate(rows[j][:len(item)]):
                if col >= 1 and content:
                    item[col] = _append_cell(item[col], content)
            j += 1
        merged_rows.append(item)
        i = j
    return merged_rows


def normalize_table_structure(markdown_table: str) -> str:
    """Merge continuation rows of a pipe-delimited table into their main rows.

    Rows are expected in the ``| a | b | c |`` form. Empty cells are kept in
    place so columns stay aligned; rows whose cells are all empty and lines
    without any ``|`` are ignored.

    Two passes are applied:

    1. Vertical merge: a row whose first cell is blank is folded into the
       previous main row. Its second-column text is appended with "；";
       other columns only fill cells that are still empty in the main row.
    2. Item merge: only when the first row contains "test" (case-insensitive)
       or "測驗" and more than two rows remain. Rows whose first cell is
       blank, "-" or "- - -" are folded into the preceding item row, joining
       non-first columns with "；".

    Args:
        markdown_table: The table text, one row per line.

    Returns:
        The rebuilt table, one ``| ... |`` line per remaining row. The input
        is returned unchanged when it has fewer than two lines or fewer than
        two usable rows.
    """
    lines = markdown_table.strip().split('\n')
    if len(lines) < 2:
        return markdown_table

    parsed = (_split_markdown_row(line) for line in lines if '|' in line)
    rows = [cells for cells in parsed if any(cells)]
    if len(rows) < 2:
        return markdown_table

    header = rows[0]
    result_rows = _merge_blank_leading_rows(rows)

    has_exam_col = any('test' in cell.lower() or '測驗' in cell for cell in header)
    if has_exam_col and len(result_rows) > 2:
        result_rows = _merge_exam_continuations(result_rows)

    return "\n".join("| " + " | ".join(row) + " |" for row in result_rows)


def _flatten_cell(cell) -> str:
    """Collapse a multi-line cell into one line.

    Wrapped fragments are joined with a single space when both characters
    at the break are ASCII (so "Key English\\nTest" becomes
    "Key English Test"), and joined directly otherwise so CJK text is not
    split by a stray space.
    """
    fragments = [part.strip() for part in str(cell).split('\n')]
    fragments = [part for part in fragments if part]
    if not fragments:
        return ""

    flat = fragments[0]
    for part in fragments[1:]:
        if flat[-1].isascii() and part[0].isascii():
            flat += " "
        flat += part
    return flat


def extract_table_as_text(page) -> str:
    """Extract the tables of a page as pipe-delimited text.

    ``page.extract_tables`` is run with three settings (ruling lines, text
    alignment, and text alignment with larger tolerances). The result with
    the most rows in total is used; ties go to the earlier setting. Each
    table is flattened to ``| a | b |`` rows, rows that are entirely empty
    are skipped, and the table is passed through normalize_table_structure.

    Args:
        page: Object exposing ``extract_tables(settings)`` (parse_pdf passes
            a pdfplumber page).

    Returns:
        The tables separated by a blank line, or "" when nothing was found.
    """
    candidates = [
        # Strategy 1: ruling lines, for tables with visible borders.
        page.extract_tables({
            "vertical_strategy": "lines",
            "horizontal_strategy": "lines",
        }),
        # Strategy 2: text alignment, for borderless but aligned tables.
        page.extract_tables({
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 20,
            "snap_tolerance": 5,
        }),
        # Strategy 3: text alignment with larger tolerances.
        page.extract_tables({
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 30,
            "snap_tolerance": 10,
        }),
    ]

    # Keep the candidate with the most rows overall.
    best_tables = max(
        candidates,
        key=lambda tables: sum(len(table) for table in tables) if tables else 0,
    )
    if not best_tables:
        return ""

    blocks = []
    for table in best_tables:
        markdown_rows = []
        for row in table:
            # Empty cells stay as "" placeholders so columns remain aligned.
            cells = [_flatten_cell(cell) if cell else "" for cell in row]
            if any(cells):
                markdown_rows.append("| " + " | ".join(cells) + " |")

        if markdown_rows:
            blocks.append(normalize_table_structure("\n".join(markdown_rows)))

    return "\n\n".join(blocks).strip()

def call_llm_api(page_text: str) -> str:
    """Send table text to the configured model and return its textual answer.

    The prompt is the fixed instruction block below followed by ``page_text``
    wrapped between a start marker and an end marker. The request goes
    through an ollama ``Client`` built from the module-level ``api_endpoint``,
    using ``llm_model`` and the ``num_ctx`` option.

    Args:
        page_text: Pipe-delimited table text to be converted.

    Returns:
        The ``'response'`` field of the generate() result.
    """
    # NOTE(review): verify=False is forwarded to the client; presumably this
    # turns off TLS certificate verification — confirm that is intended.
    ollama_client = Client(
        host=api_endpoint,
        verify=False,
        headers={'Content-Type': 'application/json'},
    )

    begin_marker = '**PDF內容開始**'
    end_marker = '**PDF內容結束**'
    instructions = """
    【角色】你是一個資料庫建檔專家，擅長解析 Markdown 表格數據。

    【輸入說明】
    輸入內容為 **Markdown 格式表格 (使用 | 分隔欄位)**。
    程式已經預先將儲存格內的斷行修復，內容是連續的。

    【核心任務：多層表頭與資料對應】
    請依照以下邏輯解析表格：

    1. **識別多層表頭 (Multi-level Headers)**：
       - 表格的前 1~3 列通常是表頭。
       - **父子合併**：若第一列是「大標題」(如：托福)，第二列是「子標題」(如：紙筆型態)，請輸出 **「托福-紙筆型態」**。
       - **空欄位處理**：在表頭區域，若某格是空的 (`| |`)，通常代表它屬於**左側**或**上方**標題的延伸，請根據上下文填補。

    2. **修正標題錯位**：
       - **注意邊界**：請觀察分隔線 `|`。屬於「多益」下方的「舊制/新制」，不要錯誤對應到「托福」。
       - **標題完整性**：若標題為「本校公務人員陞任評分計分標準」，請完整保留，不可簡化為「分數」。

    3. **推論缺失標題**：
       - 若某欄位表頭完全空白，請看內容推斷。
       - 範例：若內容是 "A2(基礎級)"，表頭應標記為 "CEF等級"；若內容是 "S-1+"，表頭應為 "口試"。

    4. **處理複雜多行數據的關鍵規則**：
       - 若表格某個「假別」(主類別) 有多個子項目，請**按邏輯分組**，不要重複輸出「假別」欄位。
       - 格式：【假別】→【子項目1】；【子項目2】... (使用箭頭和分號區隔層級)
       - 例如：假別「特別休假」有多個工作年限條件，應改為：
         **假別：特別休假，給假日數：(1) 6個月以上1年未滿者，3日；(2) 1年以上2年未滿者，7日；...，工資給與：工資照給。**
       - 而不是重複輸出多次「假別：特別休假」。

    【輸出格式嚴格規範】
    1. **邏輯分組優先於行對應**：表格同一「主類別」的多行內容，應**合併為單一輸出行**，用分號或箭頭分隔子項目。
    2. 格式：**欄位名：內容，欄位名：內容。** (全形逗號分隔，句號結尾)
    3. **嚴禁**重複輸出相同欄位名，除非它們是不同的邏輯主體。
    4. **嚴禁**輸出 Markdown 代碼 (如 |---|)、JSON 或 XML。
    5. **嚴禁**閒聊。

    【待處理內容】：
    """

    # Instructions, then the payload fenced by the two markers.
    full_prompt = "".join((instructions, begin_marker, page_text, end_marker))

    reply = ollama_client.generate(
        model=llm_model,
        options={'num_ctx': num_ctx},
        prompt=full_prompt,
    )
    return reply['response']

# ==========================================
# 4. 基礎輔助函式
# ==========================================

def calc_file_hash(path: str) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in fixed-size chunks so that large files are never
    held in memory all at once; the digest is the same as hashing the whole
    content in one call.

    Args:
        path: Path of the file to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        while chunk := f.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()

def detect_repeated_lines(pages_lines: List[List[str]], threshold=0.5) -> set:
    """Find lines that recur across pages (typically headers and footers).

    Each page contributes a stripped, non-empty line at most once. A line is
    reported when it occurs on more than one page and on at least
    ``threshold`` of all pages.

    Args:
        pages_lines: One list of text lines per page.
        threshold: Minimum fraction of pages a line must appear on.

    Returns:
        The set of repeated (stripped) lines; empty when fewer than two
        pages are given.
    """
    page_count = len(pages_lines)
    if page_count < 2:
        return set()

    # Count pages per line; the inner set removes duplicates within a page.
    pages_seen = Counter(
        text
        for page in pages_lines
        for text in {raw.strip() for raw in page if raw.strip()}
    )

    return {
        text
        for text, seen in pages_seen.items()
        if seen > 1 and seen / page_count >= threshold
    }

def clean_text_lines(lines: List[str]) -> List[str]:
    """Strip each line and drop blanks, noise lines and watermark lines.

    A line is discarded when, after stripping, it is empty, any pattern in
    NOISE_PATTERNS is found in it via ``re.search``, or it contains any
    entry of WATERMARK_KEYWORDS.

    Args:
        lines: Raw text lines of one page.

    Returns:
        The surviving lines, stripped, in their original order.
    """
    def _is_unwanted(text: str) -> bool:
        return (
            not text
            or any(re.search(noise_rx, text) for noise_rx in NOISE_PATTERNS)
            or any(mark in text for mark in WATERMARK_KEYWORDS)
        )

    stripped_lines = (raw.strip() for raw in lines)
    return [text for text in stripped_lines if not _is_unwanted(text)]

def extract_section_title(lines: List[str]) -> str | None:
    """Return the first line that looks like a section heading.

    Lines are stripped and tested in order against every pattern in
    SECTION_PATTERNS using ``match``.

    Args:
        lines: Text lines of one page.

    Returns:
        The first stripped line matching any pattern, or None if no line
        matches.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return next(
        (
            text
            for text in stripped_lines
            if any(heading_rx.match(text) for heading_rx in SECTION_PATTERNS)
        ),
        None,
    )

# ==========================================
# 5. 主程式邏輯
# ==========================================

def parse_pdf(pdf_path: str):
    """Parse a PDF into one record per page that yields content.

    Processing steps:

    1. Extract the text of every page once and find lines repeated across
       pages (headers/footers).
    2. For each page, clean the lines, drop the repeated ones and update
       the running section title when a heading is found.
    3. Pages flagged by detect_complex_page have their tables extracted with
       extract_table_as_text and converted by call_llm_api. If no table
       text comes back, the cleaned lines are used instead. Other pages use
       the cleaned lines directly.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of dicts with the keys ``title``, ``file_hash``, ``page_num``
        (1-based), ``section_title``, ``page_content``, ``type`` ("text",
        "table_text_strategy" or "text_fallback") and ``structured_text``
        (the extracted table text, "" for pages without one). Pages with no
        content are omitted.
    """
    file_hash = calc_file_hash(pdf_path)
    results = []

    with pdfplumber.open(pdf_path) as pdf:
        # Fallback title literal means "untitled document".
        doc_title = (pdf.metadata or {}).get("Title") or "未命名文件"

        # Pass 1: raw lines per page, kept so pass 2 does not extract again.
        raw_pages = [(page.extract_text() or "").splitlines() for page in pdf.pages]
        repeated_lines = detect_repeated_lines(raw_pages)

        current_section_title = doc_title

        # Pass 2: build the per-page records.
        for page_num, (page, raw_lines) in enumerate(zip(pdf.pages, raw_pages), start=1):
            lines = [
                line for line in clean_text_lines(raw_lines)
                if line not in repeated_lines
            ]

            # A heading on this page becomes the section title from here on.
            found_title = extract_section_title(lines)
            if found_title:
                current_section_title = found_title

            page_content = ""
            final_type = "text"
            # Reset for every page so a text page never reports the table
            # text of an earlier page.
            structured_text = ""

            if detect_complex_page(page, lines):
                structured_text = extract_table_as_text(page)

                if structured_text:
                    # Table text was extracted: let the LLM rewrite it.
                    page_content = call_llm_api(structured_text)
                    final_type = "table_text_strategy"
                else:
                    # Flagged as complex but no table text: use plain lines.
                    page_content = "\n".join(lines)
                    final_type = "text_fallback"
            else:
                # Plain text page: join the cleaned lines.
                page_content = "\n".join(lines)

            if not page_content:
                continue

            results.append({
                "title": doc_title,
                "file_hash": file_hash,
                "page_num": page_num,
                "section_title": current_section_title,
                "page_content": page_content,
                "type": final_type,
                "structured_text": structured_text,
            })

    return results

if __name__ == "__main__":
    # Command-line entry: the first argument is the PDF path; the parsed
    # records are printed to stdout as indented JSON.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python parse_pdf.py <pdf_path>")
        sys.exit(1)
    rows = parse_pdf(cli_args[0])
    print(json.dumps(rows, ensure_ascii=False, indent=2))