o
    ^iQ                     @   s   d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlmZ d	efd
dZdd Zdd ZeddG dd dZG dd deZdS )    )BaseBackend	GPUTargetLanguage)irpassesllvmamd)knobs)	dataclass)AnyDictTuple)
ModuleTypeN)Pathtargetc                 C   s   dd S )Nc                 S   s   dS )N   r   r    )lhs_typerhs_typer   r   W/var/www/html/RAG/RAG_venv/lib/python3.10/site-packages/triton/backends/amd/compiler.py<lambda>   s    z"get_min_dot_size.<locals>.<lambda>r   r   r   r   r   get_min_dot_size   s   r   c                 C   s,   t jjd u r| dkp| dko|du S t jjS )Ngfx942gfx950T)r	   r   use_block_pingpong)archuse_async_copyr   r   r   is_pingpong_schedule_enabled   s   r   c                 C   s   t jjd u r
| dkS t jjS )Nr   )r	   r   use_in_thread_transposer   r   r   r   is_in_thread_transpose_enabled   s   r"   T)frozenc                   @   s2  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	e
ed	< d
Zeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d Zeed!< d"Zeed#< d$Zeed%< d&d' Z d(d) Z!dS )*
HIPOptions   	num_warpsr   waves_per_eu   
num_stagesnum_ctasNextern_libsr   cluster_dimsFdebugTsanitize_overflowr   )fp8e4nvfp8e5fp8e5b16fp8e4b8supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypesieeedefault_dot_input_precision)r5   allowed_dot_input_precisionsenable_fp_fusionlaunch_cooperative_gridr   matrix_instr_nonkdimkpackallow_flush_denormmax_num_imprecise_acc_defaulthipbackend_name instrumentation_modenoneschedule_hintc                 C   s   t | jdd }|dkrdnd}t| d| | jdkr'| j| jd @ dks+J d	| jd
krF| jdkrFtd| j d t| dd tt	j
d }| jd u rTi nt| j}dD ]}t|| d ||< q[t| dt|  d S )N   
       @   	warp_sizer   r   znum_warps must be a power of 2r   zckpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = z7 will be overwritten to 1 to make transitioning easier.r;   lib)ocmlocklz.bcr+   )intr   object__setattr__r&   r;   warningswarnr   __file__parentr+   dictstrtupleitems)self	gfx_majorrI   default_libdirr+   rJ   r   r   r   __post_init__G   s     zHIPOptions.__post_init__c                 C   s.   d dd | j D }t|d S )N_c                 S   s   g | ]\}}| d | qS )-r   ).0namevalr   r   r   
<listcomp>[   s    z#HIPOptions.hash.<locals>.<listcomp>zutf-8)join__dict__rW   hashlibsha256encode	hexdigest)rX   keyr   r   r   hashZ   s   zHIPOptions.hash)"__name__
__module____qualname__r&   rM   __annotations__r'   r)   r*   r+   rT   r,   rV   r-   boolr.   r   rU   r3   r   r4   r6   r7   r8   r9   r:   r;   r<   r=   r?   rA   rC   r[   ri   r   r   r   r   r$      s2   
 r$   c                       s
  e Zd ZdZedefddZdeddf fddZdefdd	Z	de
fd
dZdd Zdd Zdeeef fddZdd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zd&d' Ze d(d) Z  ZS )*
HIPBackendNr   c                 C   s
   | j dkS )Nr>   )backendr   r   r   r   supports_targetb   s   
zHIPBackend.supports_targetreturnc                    s&   t  | t|jtsJ d| _d S )Nhsaco)super__init__
isinstancer   rU   
binary_ext)rX   r   	__class__r   r   ru   f   s   
zHIPBackend.__init__c                 C   s   d|j  S )Nhip:r!   rX   optionsr   r   r   get_target_namek   s   zHIPBackend.get_target_namec                    s   dt jjp| jji} dddkrtd| jjdkr/ttj	}|
dh tt||d< d vr<tttj|d< | jjd	krVttj}|
d
dh tt||d< d vr`t jj|d< |
 fddtj D  tdi |S )Nr   r*   r   z'num_ctas > 1 not supported for AMD GPUsr   tf32r7   r3   r   r1   r2   r4   r8   c                    s*   i | ]}| v r | d ur| | qS Nr   )r^   koptsr   r   
<dictcomp>   s    z,HIPBackend.parse_options.<locals>.<dictcomp>r   )r	   runtimeoverride_archr   r   get
ValueErrorsetr$   r7   updaterV   sortedr3   r4   languagedefault_fp_fusion__dataclass_fields__keys)rX   r   argsr7   r4   r   r   r   parse_optionsn   s"   

zHIPBackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r   r(   )r&   r*   sharedr,   )rX   metadatar   r   r   pack_metadata   s   zHIPBackend.pack_metadatac                 C   s   dt | jiS )Nmin_dot_size)r   r   r{   r   r   r   get_codegen_implementation   s   z%HIPBackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )	libdeviceztriton.language.extra.libdevice)triton.language.extra.hipr   )rX   r   r   r   r   get_module_map   s   zHIPBackend.get_module_mapc                 C   s$   t | tjrtj| d S d S r   )r   load_dialectsro   instrumentation)rX   ctxr   r   r   r      s   
zHIPBackend.load_dialectsc                 C   sL   dd l }d}t| dr|  |kS t| |jr$t| dr$|   |kS dS )Nr   i	ptr_rangeuntyped_storageF)torchhasattrr   rv   Tensorr   size)argr   
MAX_INT_32r   r   r   is_within_2gb   s   
zHIPBackend.is_within_2gbc                 C   s$   t | }d| v r|ddgg7 }|S )NSztt.pointer_rangerG   )r   
parse_attr)descretr   r   r   r      s   
zHIPBackend.parse_attrc                 K   s:   t j| |fi |}tjjr|dkrt| r|d7 }|S )Ntensorr   )r   get_arg_specializationr	   r   use_buffer_opsro   r   )r   tykwargsr   r   r   r   r      s   z!HIPBackend.get_arg_specializationc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| tj| tj| ||  | S r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_triton_licmadd_symbol_dceadd_loop_unrollrun)modr   r|   pmr   r   r   	make_ttir   s   
zHIPBackend.make_ttirc                 C   s  t | j}|  tj|d|j |j|j	|j
 ||  t | j}|  tj| tj| tj| tjj||j|j|j tj| tjj| tj|d tjj| tj| tj| tj| tj| tjj}tjj}tjj}t|j|}tjj ||j!|||| |rtjj"||j tj| |j#$ dkrtjj%||j# tj|d tj| tj&| t'|jrtjj(| tj| tjj)| |r|j!dkrtjj*||j! tjj+rtjj,| tj| tjj-||jtjj. tjj/| tj| tj0| tj1| |r;tjj2||j ||  | S )Nrz   TrB   r   )3r   r   r   r   r   r   add_convert_to_ttgpuirr   r&   rI   r*   r   ttgpuiradd_coalesceadd_remove_layout_conversionsadd_optimize_thread_localityr   add_accelerate_matmulr:   r;   add_optimize_epilogueadd_optimize_dot_operandsadd_hoist_layout_conversionsadd_fuse_nested_loopsr   r   r   r	   global_prefetchlocal_prefetchr   r   add_stream_pipeliner)   add_coalesce_async_copyrC   lowerinsert_instruction_sched_hintsadd_reduce_data_duplicationr"   add_in_thread_transposeadd_reorder_instructionsadd_block_pingpongr   add_canonicalize_pointersadd_convert_to_buffer_opsuse_buffer_atomicsadd_fold_true_cmpir   r   add_update_async_wait_count)r   r   r|   r   r   r   r   r   r   r   r   
make_ttgir   sj   



zHIPBackend.make_ttgirc                 C   sn   | }t |j}|  tj| tj| tj	| tj
| tj| tj| || |S r   )r   r   r   r   r   gluonr   add_resolve_auto_encodingsr   add_sccpr   add_loop_aware_cser   r    add_combine_tensor_select_and_ifr   )srcr   r|   r   r   r   r   r   gluon_to_ttgir  s   
zHIPBackend.gluon_to_ttgirc                    s  | }t |j}|  d}tjj||j| tj	
| tj	| tjj| tjr7tjd||j d}tjj||j| tj| tj| tj	| tj	| tj| tj| tj| |j dkrtjj||j|j tjrtjd||j tjjstj| tjj || |!| t"#  t" }t"$|| t%  d}tjj&rd}t"' tj(|j| t) |j t* d t+ d	d
 t+ dd t+ dd
 t+ d|j,dk dd  - D }	|	d .tj/ |	d 0dd|j1|j,   |	d 0d|j2  |j3r#dnd}
|	d 0d|
 tjj&r?|	d 4d |	d 5  t6|	d  tjj&rjt7t8j9d }t:|d t:|d t:|d g}t"; | n|j<r fdd|j<D }t=|dkrt"; | t"> t"j?|jdg |j@ tA|jr|	d Bd |	d Bd |	d Bd tjjCrtD|	d  | Ed|d < | Ed!pd|d"< | Ed#pd$|d%< tF  tG  t: S )&Nr   ttgpuir_to_llvmirTrB   llvmir_to_llvmr@   +xnacki  __oclc_finite_only_optF__oclc_correctly_rounded_sqrt32__oclc_unsafe_math_opt__oclc_wavefrontsize64rH   c                 S   s   g | ]}|  s|qS r   )is_declaration)r^   fnr   r   r   ra   ]  s    z(HIPBackend.make_llir.<locals>.<listcomp>zamdgpu-flat-work-group-sizez1,zamdgpu-waves-per-euzpreserve-signr5   zdenormal-fp-math-f32rJ   z
asanrtl.bczocml.bczockl.bcc                    s    g | ]\}}t  |r|qS r   )r   need_extern_lib)r^   r_   pathllvm_modr   r   ra   }  s     zamdgpu-no-workgroup-id-xzamdgpu-no-workgroup-id-yzamdgpu-no-workgroup-id-zz
ttg.sharedr   zttg.profile_scratch_memory_sizeprofile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)Hr   r   r   r   r   r   r   add_optimize_lds_usager   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryro   r   patchadd_to_llvmirr   r   r   add_cf_to_llvmiradd_arith_to_llvmirr   rC   r   lower_instruction_sched_hintsr)   r	   compilationdisable_line_infollvmiradd_di_scopeadd_builtin_func_to_llvmirr   r   init_targets	to_moduleattach_target_tripleenable_asanattach_datalayoutTARGET_TRIPLEset_isa_versionset_abi_versionset_bool_control_constantrI   get_functionsset_calling_convCALLING_CONV_AMDGPU_KERNELadd_fn_attrr&   r'   r<   add_fn_target_featureadd_fn_asan_attrset_all_fn_arg_inregr   rR   rS   rU   link_extern_libsr+   lenoptimize_moduleOPTIMIZE_O3r8   has_architected_sgprsremove_fn_attrscalarize_packed_fops#add_scalarize_packed_fops_llvm_passget_int_attrcleanup_bitcode_metadatadisable_print_inline)r   r   r|   r   r   custom_lds_size_HIPBackend__HIP_FTZr   target_featuresfnsdenormal_moderZ   pathsr   r   r   	make_llir  s   









zHIPBackend.make_llirc              	   C   s   t d| }t|dksJ |d |d< g }|jdkr |d d|jv r'dnd	}t| tj	|j|||j
d
}tjjrCtd t| |S )Nz3define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)r   r   r_   	attentionzsink-insts-to-avoid-spillsgfx11z-real-true16r@   Fz!// -----// AMDGCN Dump //----- //)refindallr  rC   appendr   r   translate_to_asmr   r  r8   r	   dump_amdgcnprint)r   r   r|   namesflagsfeaturesamdgcnr   r   r   make_amdgcn  s   

zHIPBackend.make_amdgcnc           
   
   C   s   d}t jjrd}t| |j|}t b}t +}t|j	d}|
| W d    n1 s0w   Y  t|j	|j	 W d    n1 sGw   Y  t|j	d}| }	W d    n1 saw   Y  W d    |	S W d    |	S 1 syw   Y  |	S )Nr@   r   wbrb)r	   r   r  r   assemble_amdgcnr   tempfileNamedTemporaryFileopenr_   write
link_hsacoread)
r   r   r|   r  rs   tmp_outtmp_infd_infd_outr   r   r   r   
make_hsaco  s*   




zHIPBackend.make_hsacoc                    s   |t jkr fdd|d<  fdd|d< n|t jkr& fdd|d<  fdd|d<  fd	d|d
<  fdd|d< d S )Nc                        | | S r   )r   r   r   r|   rX   r   r   r         z'HIPBackend.add_stages.<locals>.<lambda>r   c                    r=  r   )r   r>  r?  r   r   r     r@  ttgirc                    r=  r   )r   r>  r?  r   r   r     r@  c                    r=  r   )r!  r>  r?  r   r   r     r@  llirc                    r=  r   )r.  r>  r?  r   r   r     r@  r-  c                    r=  r   )r<  r>  r?  r   r   r     r@  rs   )r   TRITONGLUON)rX   stagesr|   r   r   r?  r   
add_stages  s   

zHIPBackend.add_stagesc                 C   s   | j  S r   r   )rX   r   r   r   ri     s   zHIPBackend.hash) rj   rk   rl   r   staticmethodr   rq   ru   rU   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!  r.  r<  rF  	functools	lru_cacheri   __classcell__r   r   rx   r   ro   _   sB    






:

 


ro   )triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r   tritonr	   dataclassesr
   typingr   r   r   typesr   rd   r2  r$  rH  rP   pathlibr   r   r   r"   r$   ro   r   r   r   r   <module>   s$    @