from typing import cast, Optional, Union

import torch
from torch import Tensor

from .optimizer import (
    _default_to_fused_or_foreach, _device_dtype_check_for_fused, _differentiable_doc,
    _foreach_doc, _get_scalar_dtype, _get_value, _maximize_doc, _params_doc, _to_scalar,
    _use_grad_for_differentiable, _view_as_real, Optimizer, ParamsT,
)

__all__ = ["Adagrad", "adagrad"]


class Adagrad(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: Union[float, Tensor] = 1e-2,
        lr_decay: float = 0,
        weight_decay: float = 0,
        initial_accumulator_value: float = 0,
        eps: float = 1e-10,
        foreach: Optional[bool] = None,
        *,
        maximize: bool = False,
        differentiable: bool = False,
        fused: Optional[bool] = None,
    ):
        if isinstance(lr, Tensor) and lr.numel() != 1:
            raise ValueError("Tensor lr must be 1-element")
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= lr_decay:
            raise ValueError(f"Invalid lr_decay value: {lr_decay}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                f"Invalid initial_accumulator_value value: {initial_accumulator_value}"
            )
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")

        defaults = dict(
            lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value, foreach=foreach,
            maximize=maximize, differentiable=differentiable, fused=fused,
        )
        super().__init__(params, defaults)

        if fused:
            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
            self._need_device_dtype_check_for_fused = True

        # State is created eagerly: a scalar step counter plus a per-parameter
        # accumulator of squared gradients, pre-filled with initial_accumulator_value.
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["step"] = (
                    torch.zeros((), dtype=_get_scalar_dtype(is_fused=group["fused"]), device=p.device)
                    if group["fused"]
                    else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                init_value = (
                    complex(initial_accumulator_value, initial_accumulator_value)
                    if torch.is_complex(p)
                    else initial_accumulator_value
                )
                state["sum"] = torch.full_like(p, init_value, memory_format=torch.preserve_format)

    def __setstate__(self, state):
        super().__setstate__(state)
        fused = None
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)
            fused = group.setdefault("fused", None)

        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]["step"])
        if not step_is_tensor:
            # Older checkpoints stored `step` as a plain number; promote it to a tensor.
            for s in state_values:
                s["step"] = torch.tensor(float(s["step"]), dtype=_get_scalar_dtype(fused))

    def share_memory(self):
        """Calls tensor.share_memory_() on the state sum tensors."""
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["sum"].share_memory_()

    def _init_group(self, group, params_with_grad, grads, state_sums, state_steps):
        has_sparse_grad, has_complex = False, False
        for p in group["params"]:
            if p.grad is not None:
                if group["fused"] and getattr(self, "_need_device_dtype_check_for_fused", True):
                    _device_dtype_check_for_fused(p, cuda_unsupported=True)
                    self._need_device_dtype_check_for_fused = False
                has_sparse_grad |= p.grad.is_sparse
                has_complex |= torch.is_complex(p)
                params_with_grad.append(p)
                grads.append(p.grad)
                state = self.state[p]
                state_sums.append(state["sum"])
                state_steps.append(state["step"])
        return has_sparse_grad, has_complex
      C   s   d}|durt   | }W d   n1 sw   Y  | jD ]A}g }g }g }g }| |||||\}}	t|||||d |d |d |d ||d |d |d |	|d	 t| d
dt| ddd q |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r    r!   r   r   r   
grad_scale	found_inf)r   r   r   r    rX   r!   r   r   rY   r   r[   r\   )r4   enable_gradr2   rZ   r   rQ   )
r;   closurelossr=   rT   rU   rV   rW   rX   rY   r"   r"   rB   r(      s@   




zAdagrad.step)r   r   r   r   r   NN)__name__


Adagrad.__doc__ = (
    r"""Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    """
    + rf"""
    Args:
        {_params_doc}
        lr (float, Tensor, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        {_foreach_doc}
        {_maximize_doc}
        {_differentiable_doc}
        fused (bool, optional): whether the fused implementation (CPU only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementation does not
            support sparse or complex gradients.
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

    """
)


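# Typical usage sketch (illustrative; ``model``, ``loss_fn`` and ``data`` are assumed to be
# defined by the caller):
#
#     optimizer = Adagrad(model.parameters(), lr=1e-2, lr_decay=0.0, weight_decay=0.0)
#     for inputs, targets in data:
#         optimizer.zero_grad()
#         loss = loss_fn(model(inputs), targets)
#         loss.backward()
#         optimizer.step()
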
def adagrad(
    params: list[Tensor],
    grads: list[Tensor],
    state_sums: list[Tensor],
    state_steps: list[Tensor],
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    # Keyword-only arguments with defaults are not supported by torchscript, so these
    # stay as ordinary keyword arguments for the scripted distributed-optim path.
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    differentiable: bool = False,
    has_complex: bool = False,
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    """
    if not all(isinstance(t, torch.Tensor) for t in state_steps):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    # Pick an implementation when the user did not specify one; the foreach path is the
    # default candidate, the fused path must be requested explicitly.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(params, differentiable, use_fused=False)
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    if fused and not torch.jit.is_scripting():
        func = _fused_adagrad
    elif foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adagrad
    else:
        func = _single_tensor_adagrad

    func(
        params, grads, state_sums, state_steps,
        lr=lr, weight_decay=weight_decay, lr_decay=lr_decay, eps=eps,
        has_sparse_grad=has_sparse_grad, maximize=maximize,
        differentiable=differentiable, has_complex=has_complex,
        grad_scale=grad_scale, found_inf=found_inf,
    )
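
# Functional-API sketch (illustrative): the tensor lists would normally be collected by
# ``Adagrad._init_group``; here two tiny parameters are built by hand.
#
#     params = [torch.zeros(3) for _ in range(2)]
#     grads = [torch.ones_like(p) for p in params]            # stand-in gradients
#     state_sums = [torch.zeros_like(p) for p in params]
#     state_steps = [torch.tensor(0.0) for _ in params]
#     adagrad(params, grads, state_sums, state_steps,
#             lr=1e-2, weight_decay=0.0, lr_decay=0.0, eps=1e-10, maximize=False)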


def _make_sparse(grad, grad_indices, values):
    size = grad.size()
    return torch.sparse_coo_tensor(grad_indices, values, size)


def _single_tensor_adagrad(
    params: list[Tensor], grads: list[Tensor], state_sums: list[Tensor], state_steps: list[Tensor],
    grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *,
    lr: float, weight_decay: float, lr_decay: float, eps: float,
    has_sparse_grad: bool, maximize: bool, differentiable: bool, has_complex: bool,
):
    assert grad_scale is None and found_inf is None
    if not torch.jit.is_scripting():
        lr = _to_scalar(lr)

    for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps):
        step_t += 1
        step = _get_value(step_t)
        grad = grad if not maximize else -grad

        if weight_decay != 0:
            if grad.is_sparse:
                raise RuntimeError("weight_decay option is not compatible with sparse gradients")
            grad = grad.add(param, alpha=weight_decay)

        clr = lr / (1 + (step - 1) * lr_decay)

        if grad.is_sparse:
            # Update only the rows actually touched by the sparse gradient.
            grad = grad.coalesce()
            grad_indices = grad._indices()
            grad_values = grad._values()
            state_sum.add_(_make_sparse(grad, grad_indices, grad_values.pow(2)))
            std = state_sum.sparse_mask(grad)
            std_values = std._values().sqrt_().add_(eps)
            param.add_(_make_sparse(grad, grad_indices, grad_values / std_values), alpha=-clr)
        else:
            is_complex = torch.is_complex(param)
            if is_complex:
                param = torch.view_as_real(param)
                state_sum = torch.view_as_real(state_sum)
                grad = torch.view_as_real(grad)
            state_sum.addcmul_(grad, grad, value=1)
            if differentiable:
                std = state_sum.sqrt() + eps
            else:
                std = state_sum.sqrt().add_(eps)
            param.addcdiv_(grad, std, value=-clr)
            if is_complex:
                param = torch.view_as_complex(param)
                state_sum = torch.view_as_complex(state_sum)
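
# The multi-tensor path below relies on ``torch._foreach_*`` ops, which apply a single
# fused kernel across a whole list of tensors. Illustrative sketch of the pattern
# (toy tensors, not part of the optimizer):
#
#     xs = [torch.ones(2), torch.ones(3)]
#     ys = [torch.full((2,), 2.0), torch.full((3,), 2.0)]
#     torch._foreach_addcmul_(xs, ys, ys, value=1.0)   # xs[i] += ys[i] * ys[i], in one call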


def _multi_tensor_adagrad(
    params: list[Tensor], grads: list[Tensor], state_sums: list[Tensor], state_steps: list[Tensor],
    grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *,
    lr: float, weight_decay: float, lr_decay: float, eps: float,
    has_sparse_grad: bool, maximize: bool, differentiable: bool, has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"
    assert grad_scale is None and found_inf is None

    if len(params) == 0:
        return

    lr = _to_scalar(lr)
    # Group by device and dtype so each group can be handled with _foreach_* kernels.
    grouped = Optimizer._group_tensors_by_device_and_dtype([params, grads, state_sums, state_steps])
    for (device_params_, device_grads_, device_state_sums_, device_state_steps_), _ in grouped.values():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_state_sums = cast(list[Tensor], device_state_sums_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        if has_sparse_grad and any(grad.is_sparse for grad in device_grads):
            # Sparse gradients fall back to the per-tensor reference implementation.
            _single_tensor_adagrad(
                device_params, device_grads, device_state_sums, device_state_steps,
                grad_scale=grad_scale, found_inf=found_inf,
                lr=lr, weight_decay=weight_decay, lr_decay=lr_decay, eps=eps,
                has_sparse_grad=True, maximize=maximize,
                differentiable=differentiable, has_complex=has_complex,
            )
            continue

        if has_complex:
            _view_as_real(device_params, device_grads, device_state_sums)
        if maximize:
            device_grads = torch._foreach_neg(device_grads)

        # Update steps; a 1-element CPU tensor avoids a slow scalar path for CPU state steps.
        if not torch.compiler.is_compiling() and device_state_steps[0].is_cpu:
            torch._foreach_add_(device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0)
        else:
            torch._foreach_add_(device_state_steps, 1)

        if weight_decay != 0:
            # Re-use the intermediate memory (device_grads) already allocated for maximize.
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(device_grads, device_params, alpha=weight_decay)

        minus_clr = [-lr / (1 + (_get_value(step) - 1) * lr_decay) for step in device_state_steps]

        torch._foreach_addcmul_(device_state_sums, device_grads, device_grads, value=1.0)
        std = torch._foreach_sqrt(device_state_sums)
        torch._foreach_add_(std, eps)

        if weight_decay != 0 or maximize:
            torch._foreach_mul_(device_grads, minus_clr)
            numerator = device_grads
        else:
            numerator = torch._foreach_mul(device_grads, minus_clr)

        torch._foreach_addcdiv_(device_params, numerator, std)


def _fused_adagrad(
    params: list[Tensor], grads: list[Tensor], state_sums: list[Tensor], state_steps: list[Tensor],
    grad_scale: Optional[Tensor], found_inf: Optional[Tensor], *,
    lr: float, weight_decay: float, lr_decay: float, eps: float,
    has_sparse_grad: bool, maximize: bool, differentiable: bool, has_complex: bool,
) -> None:
    if not params:
        return
    if has_sparse_grad or has_complex:
        raise RuntimeError("`fused` does not support sparse grad or complex param")
    if differentiable:
        raise RuntimeError("adagrad with fused=True does not support differentiable=True")

    grad_scale_dict = {grad_scale.device: grad_scale} if grad_scale is not None else None
    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, state_sums, state_steps]
    )
    for (device, _), (
        (device_params_, device_grads_, device_state_sums_, device_state_steps_),
        _,
    ) in grouped_tensors.items():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_state_sums = cast(list[Tensor], device_state_sums_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        device_grad_scale, device_found_inf = None, None
        if grad_scale is not None and grad_scale_dict is not None:
            if device not in grad_scale_dict:
                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
            device_grad_scale = grad_scale_dict[device]
        if found_inf is not None and found_inf_dict is not None:
            if device not in found_inf_dict:
                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
            device_found_inf = found_inf_dict[device]

        torch._foreach_add_(device_state_steps, 1)
        torch._fused_adagrad_(
            device_params, device_grads, device_state_sums, device_state_steps,
            lr=lr, lr_decay=lr_decay, weight_decay=weight_decay, eps=eps,
            maximize=maximize, grad_scale=device_grad_scale, found_inf=device_found_inf,
        )
        if device_found_inf is not None:
            # If the GradScaler found an inf/nan, undo the step increment for this group.
            torch._foreach_sub_(device_state_steps, [device_found_inf] * len(device_state_steps))