# triton/backends/nvidia/compiler.py -- structural reconstruction.
# The text of this file was the CPython 3.10 bytecode cache
# (__pycache__/compiler.cpython-310.pyc) rendered as text; only identifiers,
# annotations, docstrings, and string constants are legible in it. Function
# bodies are therefore elided below, and signatures are reproduced only where
# the argument names are clearly readable in the bytecode.

from triton.backends.compiler import BaseBackend, GPUTarget, Language
from triton._C.libtriton import ir, passes, llvm, nvidia
from triton import knobs
from triton.runtime.errors import PTXASError

from dataclasses import dataclass
import functools
from typing import Any, Dict, Tuple, Optional
from types import ModuleType
import hashlib
import re
import tempfile
import signal
import os
import subprocess
from pathlib import Path


def min_dot_size(target: GPUTarget):
    # Returns a nested check_dot_compatibility(lhs_type, rhs_type) helper that
    # asserts "lhs and rhs bitwidth must be the same" and yields a minimum
    # (m, n, k) dot shape that depends on the operand bitwidth.
    ...


def get_ptxas():
    # Returns knobs.nvidia.ptxas (the NvidiaTool handle for the ptxas binary).
    ...


@functools.lru_cache()
def get_ptxas_version():
    # Honors knobs.nvidia.mock_ptx_version; otherwise runs `ptxas --version`
    # via subprocess.check_output and decodes the utf-8 output.
    ...


@functools.lru_cache()
def ptx_get_version(cuda_version) -> int:
    """Get the highest PTX version supported by the current CUDA driver."""
    # Maps a "major.minor" CUDA version string to a PTX ISA version and raises
    # RuntimeError("Triton only support CUDA 10.0 or higher, but got CUDA
    # version: ...") for anything older.
    ...


def get_ptx_version_from_options(options, arch: int):
    # Uses options.ptx_version if set, otherwise derives it from the installed
    # ptxas version.
    ...


@functools.lru_cache()
def get_features(options, arch: int):
    # Builds the LLVM feature string; the readable constant is "+ptx".
    ...


@functools.lru_cache()
def file_hash(path):
    # sha256 hex digest of the file contents.
    ...


def sm_arch_from_capability(capability: int):
    # Maps a compute capability to an "sm_<capability>" architecture name,
    # with an "a" suffix for newer capabilities.
    ...
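# --- Illustrative sketch (not part of the original module) -------------------
# How the structure summarized in this file can be recovered from a CPython
# 3.7+ .pyc file using only the standard library: skip the 16-byte header,
# unmarshal the module code object, and walk its nested code objects. The path
# in the usage comment below is hypothetical.
import dis
import marshal


def dump_pyc_structure(pyc_path: str) -> None:
    with open(pyc_path, "rb") as f:
        f.read(16)              # header: magic, bit field, mtime/hash, size
        code = marshal.load(f)  # module-level code object
    # Each top-level function and class body is a nested code object stored in
    # co_consts; its co_name is the readable identifier.
    for const in code.co_consts:
        if hasattr(const, "co_name"):
            print(const.co_name)  # e.g. min_dot_size, CUDAOptions, CUDABackend
    dis.dis(code)                 # full disassembly of the module-level bytecode


# Usage (hypothetical path):
#   dump_pyc_structure("__pycache__/compiler.cpython-310.pyc")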
@dataclass(frozen=True)
class CUDAOptions:
    # Option fields readable in the bytecode (type annotations and most default
    # values live in the compiled constant pool and are not reproduced here):
    #   num_warps, num_ctas, num_stages, warp_size, maxnreg, cluster_dims,
    #   ptx_version, ptx_options, ir_override, enable_fp_fusion,
    #   launch_cooperative_grid, launch_pdl, supported_fp8_dtypes,
    #   deprecated_fp8_dot_operand_dtypes, default_dot_input_precision,
    #   allowed_dot_input_precisions, max_num_imprecise_acc_default,
    #   extern_libs, debug, backend_name, sanitize_overflow, arch,
    #   instrumentation_mode
    # Readable string defaults include "tf32" (default_dot_input_precision),
    # ("tf32", "tf32x3", "ieee") (allowed_dot_input_precisions),
    # ("fp8e5", "fp8e4b15") (supported_fp8_dtypes) and "cuda" (backend_name).

    def __post_init__(self):
        # Fills in the default "libdevice" entry of extern_libs from
        # "lib/libdevice.10.bc" next to this file (or knobs.nvidia.libdevice_path),
        # normalizes extern_libs to a tuple of items via object.__setattr__, and
        # asserts "num_warps must be a power of 2".
        ...

    def hash(self):
        # sha256 over the sorted "name-value" pairs of __dict__, with the
        # extern_libs entry replaced by per-file hashes (see file_hash above).
        ...
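# --- Illustrative sketch (not part of the original module) -------------------
# The readable constants in CUDAOptions.hash ("-", "_", "utf-8", sha256)
# suggest the common option-hashing pattern below: join sorted "name-value"
# pairs and digest them, so identical option sets give identical cache keys.
# ExampleOptions and its fields are made up for the sketch; this is not
# Triton's actual implementation.
@dataclass(frozen=True)
class ExampleOptions:
    num_warps: int = 4
    num_stages: int = 3
    debug: bool = False

    def hash(self) -> str:
        key = "_".join(f"{name}-{val}" for name, val in sorted(self.__dict__.items()))
        return hashlib.sha256(key.encode("utf-8")).hexdigest()


# Usage: ExampleOptions(num_warps=8).hash() -> a 64-character hex digest.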
dZ
defddZdd Zdd Zdeeef fddZdd Zedd Zedd Zdd Zdd Zdd Zd d! Zd"d# Ze d$d% Z  ZS )&CUDABackendNr   c                 C   s
   | j dkS )Nru   )backend)r   r   r   r   supports_target   s   
zCUDABackend.supports_targetc                 C   s0   d}t ||}|std| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r   )re	fullmatch
ValueErrorr!   group)r   rC   patternmatchr   r   r   _parse_arch   s
   zCUDABackend._parse_archr   c                 C   s   |  |j}d| S )Ncuda:)r   rC   )r   rE   rT   r   r   r   get_target_name      
zCUDABackend.get_target_namec                    s   t  | d| _d S )Ncubin)super__init__
binary_ext)r   r   	__class__r   r   r      r   zCUDABackend.__init__c                    s   dt jjpd| jj i}| fddtj D  t	| 
|d }|dddkr9|dk r9td| d	d
|vrSttj}|dkrK|d tt||d
< d|vr_|dkr_d|d< d|vrit jj|d< |dkrodnd|d< tdi |S )NrC   smc                    s*   i | ]}| v r | d ur| | qS r#   r   )r   r   optsr   r   
<dictcomp>   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>r^   r   r8   zBnum_ctas > 1 requires NVIDIA SM90+ (Hopper). Current target is sm_zM. This configuration will fail. Please set num_ctas=1 or target an SM90+ GPU.rk   Y   fp8e4nvrl   )rj   rf   i   @r   rr   r   )r	   runtimeoverride_archr   rC   updater[   __dataclass_fields__keysr!   r   r~   r   setrk   addr   r   languagedefault_fp_fusion)r   r   argsrT   rk   r   r   r   parse_options   s&   


zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r      )r]   r^   sharedrc   )r   metadatar   r   r   pack_metadata   s   zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr|jn|jt	| j
d}|S )Nr   r2   )convert_custom_typesr"   )triton.language.extra.cudar   extraru   r!   r   rC   convert_custom_float8_sm80convert_custom_float8_sm70r"   r   )r   rE   ru   rT   codegen_fnsr   r   r   get_codegen_implementation   s   z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )rz   ztriton.language.extra.libdevice)r   rz   )r   rz   r   r   r   get_module_map   s   zCUDABackend.get_module_mapc                 C   s$   t | tjrtj| d S d S r#   )r   load_dialectsr   instrumentation)r   ctxr   r   r   r      s   
zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| |d dk r"tj	| tj
| tj| tj| tj| tj| tj| ||  | S )Nr5   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optrT   pmr   r   r   	make_ttir   s   
zCUDABackend.make_ttirc                 C   s  |j d ur| dt| j|j  t }|jd ur.|jd |_	|jd |_
|jd |_t| j}| }tj|d| |jd|j tj| |d dkrYtj| tjj|| tj| tj| tj| tj| tj||d	k tjj| tj| |d d
v rtj| tj | tj!| tj | tj"| tjj#$||j%| tj&||j% tj'| tj(||j%| ng|d dkr6tj| tj | tj!| tj)| tj*|d tjj+| tj&||j% tj'| tj,||j% tj(||j%| tj"| tj*|d tjj-| ntj!| tj | tj| tj.| tj||d	k tj/| tjj0| tj| tjj1| tj2| tj3| tj| tj4| |d dkrtjj5| tjj6|| tjj7| tj8| tj9| tj | |:|  |j	|j
|jf|d< | ; }||d< | S )Nzttg.maxnregr   r   r   r   r   r5   r   r2   )r   r   FTr   rc   tensordesc_meta)<rb   set_attrr   builderr   get_int32_attrr   ClusterInforc   clusterDimXclusterDimYclusterDimZr   r   r   r   add_convert_to_ttgpuirr]   r^   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecr`   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_tma_loweringadd_fence_insertionadd_lower_mmaadd_sccpr   r   get_tensordesc_metadata)r   r   r   rT   cluster_infor   dump_enabledr   r   r   r   
make_ttgir   s   


zCUDABackend.make_ttgirc                 C   sz   |}t |j}|  tj| tj| tj	| tj
| tj| tj| || | |d< |S )Nr   )r   r   r   r   r   gluonr   add_resolve_auto_encodingsr   r  r   r   r   r   r   r   r  )r   srcr   rE   rT   r   r   r   r   r   gluon_to_ttgirE  s   
zCUDABackend.gluon_to_ttgirc                 C   s  t || jj}|}t|j}|  tj	| tj
| tj| tjj||| tjj| tjjr?tj| tj| tjj|| tjrYtjd||j tjj||| tj| tj| tjj| tjj| tj| tj| tj | tj!| tjj"stj#$| tjrtjd||j |%| t&'  t& }tjj(rt)dt&*||}	t+|}
t,|| jj}d}t-  t&.|	||
| t/|	 |j0rt1|	rdd |j0D }t&2|	| t&3|	t&j4 |5d}|d ur||d< |5d	|d
< |5d|d< |5d|d< |5d|d< |5dp1d|d< |5dp;d|d< t6|	}~	~|S )Nttgpuir_to_llvmirllvmir_to_llvmzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )r   r   r*   r   r   r   r         z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsr]   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_alignzttg.profile_scratch_memory_sizer   profile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)7rF   r   rC   r   r   r   r   r   r   r   add_allocate_warp_groupsconvertadd_scf_to_cfr   add_allocate_shared_memory_nvr   add_allocate_tensor_memoryr	   compilationenable_experimental_consanadd_concurrency_sanitizer"add_allocate_global_scratch_memoryadd_proxy_fence_insertionr   r   patchadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   add_nvvm_to_llvmdisable_line_infollvmiradd_di_scoper   r   init_targetsenable_asanr=   	to_modulerY   rK   set_short_ptrattach_datalayoutset_nvvm_reflect_ftzrs   has_extern_depslink_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr:   )r   r  r   rE   rT   rD   r   r   r   llvm_modprocrJ   triplepathstotal_num_warpsretr   r   r   	make_llirU  sv   



zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}t||||g |jd}	t	d|	}
t
|
dks.J |
d |d< |d  d|d  }tjd	d
| |	tjd}	tjdd| |	tjd}	tdd|	}	tjjrltd t|	 |	S )Nr  Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   r   r5   r/   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rV   z // -----// NVPTX Dump //----- //)rF   r   rC   rY   rK   r   translate_to_asmrf   r   findalllensub	MULTILINEr	   r   
dump_nvptxprint)r   r  r   r   rT   rD   r>  r=  rJ   rA  namesr   r   r   make_ptx  s    zCUDABackend.make_ptxc                 C   s  t  j}tjdddde}tjddddB}|| |  |jd }g }	tjj	r3|	dd	g7 }	ntj
jr=|	d
g7 }	n|	dg7 }	|jrGg ndg}
t|}tj
jrVddgng }|jra|jdng }|g|	|
d||d| |jd|}zDtj|dd|d tj
jrt|j}t|  W d    n1 sw   Y  tj|jrt|j tj|jrt|j W nk tjy+ } z]t|j}| }W d    n1 sw   Y  tj|jrt|j |jdkrd}n|jdtj krd}nd|j }| d| dd| d}td| d| d t|d }~ww t|d}| }W d    n	1 sAw   Y  tj|rRt| W d    n1 s]w   Y  W d    |S W d    |S 1 svw   Y  |S ) NFwz.ptx)deletemoderX   rz.logz.oz	-lineinfoz-suppress-debug-infoz-gz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
zC

================================================================
z

zy
================================================================
please share the reproducer above with Triton project.
rL   )r%   r*   tempfileNamedTemporaryFilewriteflushr   r	   r$  r.  r   disable_ptxas_optrf   rY   rd   r<   r(   r   dump_ptxas_logrM   rJ  rP   osexistsremoveCalledProcessError
returncodesignalSIGSEGVr   r
   )r   r  r   r   rT   r$   fsrcflogfbin
debug_infofmadrC   disable_optptx_extra_options	ptxas_cmdlog_fileelogerrorrR   r   r   r   r   
make_cubin  s   





	

*HHHzCUDABackend.make_cubinc                    s    j |tjkr  fdd|d<  fdd|d< n|tjkr/ fdd|d<  fdd|d< fd	d|d
< fdd|d< d S )Nc                        | | S r#   )r   r  r   rT   rE   r   r   r   <lambda>      z(CUDABackend.add_stages.<locals>.<lambda>r   c                    rs  r#   )r  rt  ru  r   r   rv    rw  ttgirc                    rs  r#   )r  rt  ru  r   r   rv    rw  c                    rs  r#   )rB  rt  ru  r   r   rv    rw  llirc                        | | jjS r#   )rL  r   rC   rt  rE   r   r   r   rv    r  ptxc                    rz  r#   )rr  r   rC   rt  r{  r   r   rv    r  r   )r   rC   r   TRITONGLUON)r   stagesrE   r   r   ru  r   
add_stages  s   

zCUDABackend.add_stagesc                 C   s   t  }| d| jj S )Nr   )r.   r   rC   )r   r-   r   r   r   r   
  s   zCUDABackend.hash)r   r   r   r   staticmethodr   r   r   r:   r   r   r   r   r   r   r   r   r   r   r   r  r  rB  rL  rr  r  	functools	lru_cacher   __classcell__r   r   r   r   r      s.    



OHLr   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r   tritonr	   triton.runtime.errorsr