o
    ´^i_  ã                   @   sl   d dl Z d dlmZ d dlmZ ddd„Ze jdd„ ƒZe jddej	fdd	„ƒZ
e jddej	fd
d„ƒZdS )é    N)ÚTensorDescriptorc                    sì   t |ƒ}t ˆ jƒ}t|ƒ}|dk r||7 }d|  kr$|d k s)J dƒ‚ J dƒ‚|dks1J dƒ‚t|ƒ|ks;J dƒ‚d}d}|| |ksIJ d	ƒ‚|||< ˆ  |¡}d
| |g‡ fdd„t|ƒD ƒ }||g| }	ddg| }
tˆ |	||
ƒS )a­  
    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
    which behaves like a concatenation (along the first axis) of subarrays
    of potentially unequal size.

    The load_ragged and store_ragged device functions can be used to read
    and write from subarrays T[batch_offset : batch_offset + batch_size]
    with hardware bounds-checking preventing any sort of leakage outside
    the subarray.
    r   é   zlast dimension cannot be raggedé   z<read-write ragged descriptors must have at most 3 dimensionsz1block shape must have same length as tensor shapei  ÿé   @z#number of rows may not exceed 2**30l        c                    s   g | ]}ˆ   |¡‘qS © )Ústride)Ú.0Úi©ÚTr   úR/var/www/html/RAG/RAG_venv/lib/python3.10/site-packages/triton/tools/ragged_tma.pyÚ
<listcomp>)   s    z,create_ragged_descriptor.<locals>.<listcomp>)ÚlistÚshapeÚlenr   Úranger   )r   Úblock_shapeÚ
ragged_dimÚtensor_shapeÚrankÚmax_intÚbillionÚragged_strideÚ
tma_strideÚ	tma_shapeÚ	box_shaper   r
   r   Úcreate_ragged_descriptor   s"   
(
"r   c                 C   s"   d}|| | }| | }|||fS )z;
    Helper function for load_ragged and store_ragged.
    r   r   )Úbatch_offsetÚ
batch_sizeÚrowr   ÚxÚyr   r   r   Úto_ragged_indices0   s   
r"   r   c           	      C   s~   t  t| jƒt|ƒd kd¡ t|||| ƒ\}}}|  ||g|d|…  |g ||d d…  ¡}t  ||jdd… ¡}|S )zù
    Read from a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where reading outside the subarray gives zeros.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.load().
    é   z*TMA must be a read-write ragged descriptorNr   )ÚtlÚstatic_assertr   r   r"   ÚloadÚreshape)	ÚTMAr   r   Úcoordsr   Úc0Úc1Úc2Údatar   r   r   Úload_ragged=   s
   
0r.   c           	      C   sb   t |||| ƒ\}}}t |ddg|j ¡}|  ||g|d|…  |g ||d d…  |¡ dS )a  
    Write to a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where writes outside the subarray are masked
    correctly.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.store().
    r   N)r"   r$   r'   r   Ústore)	r(   r   r   r)   r-   r   r*   r+   r,   r   r   r   Ústore_raggedO   s   6r0   )r   )ÚtritonÚtriton.languageÚlanguager$   Útriton.tools.tensor_descriptorr   r   Újitr"   Ú	constexprr.   r0   r   r   r   r   Ú<module>   s    
(
