o
    XTiS                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZ ddlmZ eejdkr9ejd ded	ee fd
dZdee d	e jfddZdedeeee	f  d	dfddZejdd fdee d	dfddZedkr|e  dS dS )    N)defaultdictdeque)chain)AnyDefaultDictDictList   )PDFz--helpp_strreturnc                 C   s8   d| v rt t| d\}}tt||d S t| gS )N-r	   )mapintsplitlistrange)r   startend r   I/var/www/html/RAG/RAG_venv/lib/python3.10/site-packages/pdfplumber/cli.pyparse_page_spec   s   
r   args_rawc                 C   s   t d}|jddt dd | }|jdddd	 |jd
ddd	 |jdg ddd |jddd |jdddd |jdddd |jdtjd |jdtd |jddtd |jdtdd |	| }|j
d urvtt|j
 |_
|S )N
pdfplumberinfile?rb)nargstypez--structurezoWrite the structure tree as JSON.  All other arguments except --pages, --laparams, and --indent will be ignored
store_true)helpactionz--structure-textzWrite the structure tree as JSON including text contents.  All other arguments except --pages, --laparams, and --indent will be ignoredz--format)csvjsontextr"   )choicesdefaultz--types+)r   z--include-attrsz1Include *only* these object attributes in output.)r   r    z--exclude-attrsz,Exclude these object attributes from output.z
--laparams)r   z--precisionz--pagesz--indentz&Indent level for JSON pretty-printing.)r   r    )argparseArgumentParseradd_argumentFileTypeadd_mutually_exclusive_groupr#   loadsr   r   
parse_argspagesr   r   )r   parsergroupargsr   r   r   r.      sF   


r.   pdfdatac           	         s   t dd }| jD ]!}||j  |jD ]}|d}|d u rq |  |d 7  < qq	t|}|rc| }d|v r@||d  |d}|d u rJq/||  d|v r_ fdd	|d D |d< |s1d S d S )
Nc                   S   s   t tS )N)r   strr   r   r   r   <lambda>J   s    z#add_text_to_mcids.<locals>.<lambda>mcidr$   childrenpage_numbermcidsc                    s   g | ]} | qS r   r   ).0r7   text_contentsr   r   
<listcomp>\   s    z%add_text_to_mcids.<locals>.<listcomp>)r   r/   r9   charsgetr   popleftextend)	r3   r4   page_contentspagecr7   delpagenor   r<   r   add_text_to_mcidsI   s*   




rI   c              	   C   sD  t | }tj|j|j|jd}|jrttj	|j
|jd nY|jr6|j
}t|| ttj	||jdd nJ|jdkrK|jtj|j|j|j|jd n=|jdkr_|jD ]
}t|jdd	 qSn1|jtj|j|j|j|j|jd
 W d    d S W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )N)r/   laparams)indentF)rK   ensure_asciir"   )	precisioninclude_attrsexclude_attrsr$   T)layout)rM   rN   rO   rK   )r.   r
   openr   r/   rJ   	structureprintr#   dumpsstructure_treerK   structure_textrI   formatto_csvsysstdouttypesrM   rN   rO   extract_textto_json)r   r2   r3   treerD   r   r   r   main_   sJ   



"r_   __main__)r(   r#   rY   collectionsr   r   	itertoolsr   typingr   r   r   r   r3   r
   lenargvappendr5   r   r   	Namespacer.   rI   r_   __name__r   r   r   r   <module>   s    "2$ 
