import pdfplumber

def _char_x_center(char):
    """Return the horizontal centre of a char dict (``x0`` when ``x1`` is absent)."""
    x0 = char["x0"]
    return (x0 + char.get("x1", x0)) / 2


def _default_column_tolerance(chars):
    """Return half the median positive ``size`` (else ``width``) of *chars*, or 0."""
    for key in ("size", "width"):
        values = sorted(
            v for v in (c.get(key) for c in chars) if v is not None and v > 0
        )
        if values:
            return values[len(values) // 2] / 2
    # No usable metric: fall back to exact matching of centres.
    return 0


def _order_vertical_chars(chars, column_tolerance=None):
    """Return *chars* in vertical reading order (columns right-to-left, top-down).

    A character joins the current column when its centre lies within
    ``column_tolerance`` of the column's rightmost centre; otherwise it opens
    a new column further to the left.
    """
    if not chars:
        return []

    if column_tolerance is None:
        tolerance = _default_column_tolerance(chars)
    else:
        tolerance = column_tolerance

    # Stable sort, so characters with equal centres keep their original order.
    right_to_left = sorted(chars, key=_char_x_center, reverse=True)

    columns = []
    anchor = None  # centre of the rightmost character of the current column
    for char in right_to_left:
        center = _char_x_center(char)
        # Comparing against a fixed anchor (not the previous character)
        # prevents small offsets from chaining neighbouring columns together.
        if anchor is None or anchor - center > tolerance:
            columns.append([])
            anchor = center
        columns[-1].append(char)

    ordered = []
    for column in columns:
        ordered.extend(sorted(column, key=lambda c: c["top"]))
    return ordered


def extract_vertical_text(pdf_path, column_tolerance=None):
    """Extract text from a PDF typeset vertically (top-to-bottom, right-to-left).

    On every page the characters are grouped into columns by horizontal
    position, the columns are read from right to left, and the characters
    inside each column are read from top to bottom. Grouping with a tolerance
    (instead of sorting on the exact ``x0``) keeps glyphs whose left edge
    differs slightly from their neighbours inside their own column.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        column_tolerance: Largest horizontal distance between a character's
            centre and the rightmost centre of a column for the character to
            belong to that column. ``None`` (the default) derives the value
            per page as half the median ``size`` of the page's characters.

    Returns:
        The text of all pages concatenated, each page followed by ``"\\n"``.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            ordered = _order_vertical_chars(page.chars, column_tolerance)
            page_texts.append("".join(c.get("text", "") for c in ordered))

    return "".join(f"{text}\n" for text in page_texts)

def _classify_char_pair(c1, c2):
    """Classify two consecutive chars as "horizontal", "vertical" or ``None``.

    All thresholds are multiples of the first character's ``size``
    (12 is used when the entry is missing or not positive).
    """
    font_size = c1.get("size", 12)
    if font_size <= 0:
        font_size = 12

    dx = abs(c2["x0"] - c1["x0"])
    dy = abs(c2["top"] - c1["top"])

    # Pairs that are too far apart are not consecutive text (paragraph break,
    # column switch, line wrap) and carry no orientation information.
    dist_limit = font_size * 2.5
    if dx > dist_limit and dy > dist_limit:
        return None
    # Horizontal jump: large X gap while Y stays put.
    if dx > dist_limit and dy < font_size * 0.8:
        return None
    # Vertical jump: large Y gap while X stays put.
    if dy > dist_limit and dx < font_size * 0.8:
        return None

    # Allowed drift on the "steady" axis; deliberately generous.
    tol = font_size * 0.6
    min_step = font_size * 0.3

    # Horizontal writing: Y barely moves, X clearly advances.
    if dy < tol and dx > min_step:
        return "horizontal"
    # Vertical writing: X barely moves, Y clearly advances.
    if dx < tol and dy > min_step:
        return "vertical"
    return None


def detect_text_orientation(pdf_path, check_pages=3, min_chars_per_page=15):
    """Detect whether a PDF's text runs vertically or horizontally.

    Consecutive entries of ``page.chars`` are compared on the first
    ``check_pages`` pages; each pair votes "horizontal" or "vertical"
    depending on which axis the position advances along. Diagnostic lines
    are printed to stdout.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        check_pages: Number of leading pages to inspect (default 3).
        min_chars_per_page: Pages with fewer characters than this are
            skipped (default 15).

    Returns:
        ``"vertical"`` when vertical pairs strictly outnumber horizontal
        ones, otherwise ``"horizontal"``. ``"horizontal"`` is also returned
        when fewer than 10 pairs could be classified or when any exception
        occurs while reading the file.
    """
    vertical_score = 0
    horizontal_score = 0

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages[:check_pages]:
                chars = page.chars
                if len(chars) < min_chars_per_page:
                    continue

                # Linear scan over adjacent characters in extraction order.
                for c1, c2 in zip(chars, chars[1:]):
                    direction = _classify_char_pair(c1, c2)
                    if direction == "horizontal":
                        horizontal_score += 1
                    elif direction == "vertical":
                        vertical_score += 1

        # Every classified pair contributes to exactly one of the two scores.
        valid_pairs = vertical_score + horizontal_score

        print(f"[檢測結果] 檢查字對數: {valid_pairs}, 直排: {vertical_score}, 橫排: {horizontal_score}")

        # Too few samples to decide (this also covers "no pairs at all"):
        # default to horizontal.
        if valid_pairs < 10:
            print(f"[警告] 樣本過少 ({valid_pairs} 對)，可能判斷不准確")
            return "horizontal"

        # Only the dominant direction is reported; mixed layouts are flagged
        # in the diagnostics but not returned as a separate value.
        result = "vertical" if vertical_score > horizontal_score else "horizontal"

        # valid_pairs >= 10 here, so the larger score is at least 5 and both
        # divisions are safe.
        dominant = max(vertical_score, horizontal_score)
        confidence = dominant / valid_pairs
        ratio = min(vertical_score, horizontal_score) / dominant
        if ratio > 0.3:
            print(f"[注意] 檢測到可能的混合排版（比例: {ratio:.1%}），但只判定主要方向")

        print(f"[可信度] {confidence:.1%}")
        return result

    except Exception as e:
        # Best-effort detector: any failure falls back to horizontal.
        print(f"[檢測失敗] {e}，預設使用橫排")
        return "horizontal"


# ==================== Command-line demo ====================
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # No path supplied: show how to invoke the script.
        print("使用方式: python pdf_test.py <pdf_path>")
        print("例如: python pdf_test.py holiday.pdf")
    else:
        target = cli_args[0]
        print(f"正在檢測: {target}")
        layout = detect_text_orientation(target)
        print(f"最終結果: {layout}")

        if layout != "vertical":
            print("\n[提示] 此檔案為橫排文字，不進行直排提取")
        else:
            # Vertical layout detected: dump the text in vertical reading order.
            banner = "=" * 50
            print("\n" + banner)
            print("[直排文字內容]")
            print(banner)
            print(extract_vertical_text(target))