o
    î^iál  ã                   @  s~  U d dl mZ d dlZd dlZd dlmZmZmZmZmZ d dl	m
Z
mZmZmZ d dlZd dlmZ er;d dlmZ ddlmZ g d	¢Zed
ƒZe
dƒZeejdƒskedƒejjd< edƒejjd< edƒejjd< d dlmZmZmZ d2dd„Zd3dd„ZG dd„ dejjƒZ G dd„ dƒZ!edede"f f Z#de$d< e	 	!	d4d5d*d+„ƒZ%e	 	!	d4d6d.d+„ƒZ%	 	!	d4d7d1d+„Z%dS )8é    )ÚannotationsN)ÚCallableÚOptionalÚoverloadÚTYPE_CHECKINGÚUnion)Ú	ParamSpecÚSelfÚ	TypeAliasÚTypeVar)ÚTensor)Ú_POOL_HANDLEé   )Ú_dummy_type)Úis_current_stream_capturingÚgraph_pool_handleÚ	CUDAGraphÚgraphÚmake_graphed_callablesÚ_RÚ_PÚ_CudaStreamBaseÚ
_CUDAGraphÚ_graph_pool_handleÚ_cuda_isCurrentStreamCapturing)r   r   r   ÚreturnÚboolc                   C  s   t ƒ S )zÌReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r   © r   r   úL/var/www/html/RAG/RAG_venv/lib/python3.10/site-packages/torch/cuda/graphs.pyr   /   s   r   r   c                   C  s   t j tƒ ¡S )zÚReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )ÚtorchÚcudar   r   r   r   r   r   r   8   s   r   c                      s´   e Zd ZdZd'd(‡ fdd„Z	
d)d*‡ fdd„Zd+‡ fdd„Zd+‡ fdd„Zd+‡ fdd„Zd+‡ fdd„Z	d,‡ fdd„Z
d+‡ fdd„Zd-‡ fd d!„Zd.‡ fd#d$„Zd.‡ fd%d&„Z‡  ZS )/r   a-  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it will be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    FÚ
keep_graphr   r   r	   c                   s   t ƒ  | |¡S ©N)ÚsuperÚ__new__)Úclsr!   ©Ú	__class__r   r   r$   _   s   zCUDAGraph.__new__NÚglobalÚpoolúOptional[_POOL_HANDLE]Úcapture_error_modeÚstrÚNonec                   s   t ƒ j||d dS )að  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )r)   r+   N)r#   Úcapture_begin)Úselfr)   r+   r&   r   r   r.   b   s   zCUDAGraph.capture_beginc                   ó   t ƒ  ¡  dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r#   Úcapture_end©r/   r&   r   r   r1   w   s   	zCUDAGraph.capture_endc                   r0   )a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r#   Úinstantiater2   r&   r   r   r3   ‚   s   zCUDAGraph.instantiatec                   r0   )z,Replay the CUDA work captured by this graph.N)r#   Úreplayr2   r&   r   r   r4   ‹   ó   zCUDAGraph.replayc                   r0   )z1Delete the graph currently held by this instance.N)r#   Úresetr2   r&   r   r   r6      r5   zCUDAGraph.resetr   c                   ó
   t ƒ  ¡ S )zäReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r#   r)   r2   r&   r   r   r)   “   s   
zCUDAGraph.poolc                   r7   )z/Enable debugging mode for CUDAGraph.debug_dump.)r#   Úenable_debug_moder2   r&   r   r   r8   ›   s   
zCUDAGraph.enable_debug_modeÚ
debug_pathc                   s   t ƒ  |¡S )zÖ
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r#   Ú
debug_dump)r/   r9   r&   r   r   r:   Ÿ   s   zCUDAGraph.debug_dumpÚintc                   r7   )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r#   Úraw_cuda_graphr2   r&   r   r   r<   ©   ó   
zCUDAGraph.raw_cuda_graphc                   r7   )aª  Returns the underlying cudaGraphExec_t. ``instantiate`` must have been called if ``keep_graph`` is True, or ``capture_end`` must have been called if ``keep_graph`` is False. If you call ``instantiate()`` after ``raw_cuda_graph_exec()``, the previously returned cudaGraphExec_t will be destroyed. It is your responsibility not to use this object after destruction.

        See the following for APIs for how to manipulate this object: `Graph Execution <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH__EXEC.html>`_ and `cuda-python Graph Execution bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-execution>`_
        )r#   Úraw_cuda_graph_execr2   r&   r   r   r>   °   r=   zCUDAGraph.raw_cuda_graph_exec)F)r!   r   r   r	   )Nr(   )r)   r*   r+   r,   r   r-   ©r   r-   ©r   r   )r9   r,   r   r-   )r   r;   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r$   r.   r1   r3   r4   r6   r)   r8   r:   r<   r>   Ú__classcell__r   r   r&   r   r   D   s    ÿ	
r   c                   @  sD   e Zd ZU dZdZded< 			dddd„Zddd„Zddd„ZdS )r   aÅ  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    NúOptional[torch.cuda.Stream]Údefault_capture_streamr(   Ú
cuda_graphr   r)   r*   Ústreamr+   r,   c                 C  sr   | j jd u rtj ¡ | j _|d u rdn|f| _|d ur|n| j j| _| jd us)J ‚tj | j¡| _|| _	|| _
d S )Nr   )r'   rG   r   r    ÚStreamr)   Úcapture_streamrI   Ú
stream_ctxrH   r+   )r/   rH   r)   rI   r+   r   r   r   Ú__init__Ø   s   
ÿÿ
zgraph.__init__r   r-   c                 C  sJ   t j ¡  t jjjrt ¡  t j ¡  | j	 
¡  | jj| jd| jiŽ d S )Nr+   )r   r    ÚsynchronizeÚcompilerÚconfigÚforce_cudagraph_gcÚgcÚcollectÚempty_cacherL   Ú	__enter__rH   r.   r)   r+   r2   r   r   r   rU   ð   s   



þ
ýzgraph.__enter__ÚargsÚobjectc                 G  s   | j  ¡  | jj|Ž  d S r"   )rH   r1   rL   Ú__exit__)r/   rV   r   r   r   rX     s   
zgraph.__exit__)NNr(   )rH   r   r)   r*   rI   rF   r+   r,   r?   )rV   rW   r   r-   )	rA   rB   rC   rD   rG   Ú__annotations__rM   rU   rX   r   r   r   r   r   ¸   s   
 û
r   útorch.nn.Module.r
   Ú_ModuleOrCallableé   FÚ	callablesÚsample_argsútuple[Tensor, ...]Únum_warmup_itersr;   Úallow_unused_inputr)   r*   c                 C  ó   d S r"   r   ©r]   r^   r`   ra   r)   r   r   r   r     ó   r   útuple[_ModuleOrCallable, ...]útuple[tuple[Tensor, ...], ...]c                 C  rb   r"   r   rc   r   r   r   r     rd   ú7Union[_ModuleOrCallable, tuple[_ModuleOrCallable, ...]]ú9Union[tuple[Tensor, ...], tuple[tuple[Tensor, ...], ...]]c           )        sô  t  ¡ rt  ¡ rtdƒ‚d}t| tƒs$d}| f} t ttdf |¡f}nt tttdf df |¡}g ‰ t	| |ƒD ]N\}}t|t j
jƒrlt|jƒdkrYt|jƒdkrYt|jƒdks]J dƒ‚tdd„ | ¡ D ƒƒslJ d	ƒ‚t jjj|Ž }	ˆ  t|	ƒ¡ td
d„ |	D ƒƒs‡J dƒ‚q9dd„ ˆ D ƒ}
dd„ | D ƒ‰‡ ‡fdd„tt| ƒƒD ƒ}dd„ tt| ƒƒD ƒ}dd„ tt| ƒƒD ƒ}|du rÁtƒ n|}t j ¡  t j t j ¡ ¡\ t	| ||ƒD ]M\}}}d\}}}t|ƒD ]4}t jj ||Ž ¡}tdd„ |D ƒƒ}t|ƒdkrt jj|tdd„ |D ƒƒtdd„ |D ƒƒd|d}qæ|||fD ]}~q qØW d  ƒ n	1 s1w   Y  t j ¡  g }g }t	| ||ƒD ]8\}}}t jj||d ||Ž }W d  ƒ n	1 sbw   Y  t jj  |¡\}}| t|ƒ¡ | |¡ qEg }g }t	t!|ƒt!|ƒt!|ƒƒD ]†\}}}tdd„ |D ƒƒ} tdd„ |D ƒƒ}d}t|ƒdkràt jj||d! t jj|tdd„ |D ƒƒtdd„ | D ƒƒd|d}W d  ƒ n	1 sÛw   Y  g }!d}"|D ]}#|#j"rþ|durþ|! ||" ¡ |"d7 }"qæ|! d¡ qæt|!ƒ}!| | ¡ | |!¡ qŽ| #¡  | #¡  d:d/d0„}$g }%t$| ƒD ]F\}&}|$||& ||& ˆ|& |
|& ||& ||& ||& ||& ||& ƒ	}'t|t j
jƒrhd;d8d9„}(|(||j%|'|j&ƒ|_&|% |¡ q(|% |'¡ q(|rv|%d S t|%ƒS )<aØ  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FT.r   z§Modules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s  s    | ]}|j d u V  qdS )FN©Úrequires_grad©Ú.0Úbr   r   r   Ú	<genexpr>‰  ó   € z)make_graphed_callables.<locals>.<genexpr>zœIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s  s    | ]	}t |tjƒV  qd S r"   )Ú
isinstancer   r   )rl   Úargr   r   r   rn     ó   € zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S  s   g | ]}t |ƒ‘qS r   )Úlen)rl   rV   r   r   r   Ú
<listcomp>—  s    z*make_graphed_callables.<locals>.<listcomp>c                 S  s*   g | ]}t |tjjƒrt| ¡ ƒnd ‘qS )r   )rp   r   ÚnnÚModuleÚtupleÚ
parameters)rl   Úcr   r   r   rt   ˜  s    ÿÿc                   s   g | ]
}ˆ | ˆ|  ‘qS r   r   ©rl   Úi©Úflatten_sample_argsÚper_callable_module_paramsr   r   rt   œ  s    ÿÿc                 S  ó   g | ]}t j ¡ ‘qS r   ©r   r    r   ©rl   Ú_r   r   r   rt   ¡  ó    c                 S  r   r   r€   r   r   r   r   rt   ¢  rƒ   N)NNNc                 s  ó    | ]}|j r|V  qd S r"   ri   ©rl   Úor   r   r   rn   ±  ro   c                 s  r„   r"   ri   rz   r   r   r   rn   µ  s   € ÿ
ÿc                 s  s     | ]}|j rt |¡V  qd S r"   ©rj   r   Ú
empty_liker…   r   r   r   rn   ¸  s   € ÿ
ÿ)ÚoutputsÚinputsÚgrad_outputsÚonly_inputsÚallow_unused)r)   c                 s  s$    | ]}|j rt |¡nd V  qd S r"   r‡   r…   r   r   r   rn   Ü  ó   € 
ÿc                 s  r„   r"   ri   r…   r   r   r   rn   à  ro   c                 s  r„   r"   ri   rz   r   r   r   rn   æ  ro   c                 s  s    | ]	}|d ur|V  qd S r"   r   r…   r   r   r   rn   ç  rr   é   Ú	fwd_graphr   Ú	bwd_graphÚmodule_paramsútuple[torch.nn.Parameter, ...]Úlen_user_argsr;   Úoutput_unflatten_specútorch.utils._pytree.TreeSpecÚstatic_input_surfacer_   Ústatic_outputsÚstatic_grad_outputsútuple[Optional[Tensor], ...]Ústatic_grad_inputsr   úCallable[..., object]c	           
        s:   G ‡‡‡‡‡‡‡	fdd„dt jjƒ‰ d‡ ‡‡fdd„}	|	S )	Nc                      sD   e Zd Zed‡‡‡‡fdd„ƒZeejjjd‡ ‡‡fd
d„ƒƒZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.GraphedÚctxrW   rŠ   r   r   r_   c                   s`   t ˆƒD ]}ˆ|  ¡ ||  ¡ krˆ|  || ¡ qˆ  ¡  tˆtƒs'J ‚tdd„ ˆD ƒƒS )Nc                 s  s    | ]}|  ¡ V  qd S r"   ©Údetachr…   r   r   r   rn     s   € zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)ÚrangeÚdata_ptrÚcopy_r4   rp   rw   )r   rŠ   r{   )r   r”   r—   r˜   r   r   Úforward  s   €zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardÚgradsc                   sr   t |ƒt ˆƒks
J ‚tˆ|ƒD ]\}}|d ur$| ¡ | ¡ kr$| |¡ qˆ  ¡  tˆtƒs0J ‚tdd„ ˆD ƒƒS )Nc                 s  s$    | ]}|d ur|  ¡ n|V  qd S r"   rž   rk   r   r   r   rn   %  rŽ   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)rs   Úzipr¡   r¢   r4   rp   rw   )r   r¤   ÚgÚgrad)r‘   r›   r™   r   r   Úbackward  s   
€ÿzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)r   rW   rŠ   r   r   r_   )r   rW   r¤   r   r   r_   )
rA   rB   rC   Ústaticmethodr£   r   ÚautogradÚfunctionÚonce_differentiabler¨   r   )r‘   r   r”   r›   r™   r—   r˜   r   r   ÚGraphed  s    	r­   Ú	user_argsrW   r   c                    s0   t jjj| Ž }ˆ jt|ƒˆ Ž }t jj |ˆ¡S r"   )r   ÚutilsÚ_pytreeÚarg_tree_leavesÚapplyrw   Útree_unflatten)r®   Úflatten_user_argsÚout)r­   r’   r•   r   r   Úfunctionalized)  s   zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r®   rW   r   rW   )r   rª   ÚFunction)
r   r‘   r’   r”   r•   r—   r˜   r™   r›   r¶   r   )
r­   r‘   r   r”   r’   r•   r›   r™   r—   r˜   r   Úmake_graphed_autograd_function  s   $z>make_graphed_callables.<locals>.make_graphed_autograd_functionÚfuncrZ   Úgraph_training_stater   ÚgraphedúCallable[_P, _R]Úorig_fwdc                   s   d	‡ ‡‡‡fdd„}|S )
Nr®   ú_P.argsÚuser_kwargsú	_P.kwargsr   r   c                    s&   ˆ j ˆkrˆ| i |¤ŽS ˆ| i |¤ŽS r"   )Útraining)r®   r¿   ©r¹   rº   r»   r½   r   r   Únew_fwdJ  s   
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwd)r®   r¾   r¿   rÀ   r   r   r   )r¹   rº   r»   r½   rÃ   r   rÂ   r   Úmake_graphed_forwardD  s   z4make_graphed_callables.<locals>.make_graphed_forward)r   r   r‘   r   r’   r“   r”   r;   r•   r–   r—   r_   r˜   r_   r™   rš   r›   r_   r   rœ   )
r¹   rZ   rº   r   r»   r¼   r½   r¼   r   r¼   )'r   Úis_autocast_enabledÚis_autocast_cache_enabledÚRuntimeErrorrp   rw   ÚtypingÚcastr   r¥   ru   rv   rs   Ú_backward_hooksÚ_forward_hooksÚ_forward_pre_hooksÚallÚbuffersr¯   r°   r±   Úappendr    r   r    rN   rI   rJ   Útree_leavesrª   r§   r   Útree_flattenÚreversedrj   ÚreverseÚ	enumeraterÁ   r£   ))r]   r^   r`   ra   r)   Újust_one_callableÚ_sample_argsry   rV   Úflatten_argÚper_callable_len_user_argsÚ"per_callable_static_input_surfacesÚ
fwd_graphsÚ
bwd_graphsÚmempoolr¹   r—   Úgrad_inputsr‰   Úoutputs_gradr‚   ÚvÚper_callable_static_outputsÚ"per_callable_output_unflatten_specr   Úfunc_outputsÚflatten_outputsÚspecÚ per_callable_static_grad_outputsÚper_callable_static_grad_inputsr˜   r‘   r™   r›   Úgrad_idxrq   r¸   Úretr{   r»   rÄ   r   r|   r   r   %  s  Iÿ
þÿÿþ
þ
ÿ
ÿÿ÷€ÿíÿ

ÿýÿûÿ

3÷
ÿ)r   r   r@   )r\   FN)r]   r[   r^   r_   r`   r;   ra   r   r)   r*   r   r[   )r]   re   r^   rf   r`   r;   ra   r   r)   r*   r   re   )r]   rg   r^   rh   r`   r;   ra   r   r)   r*   r   rg   )&Ú
__future__r   rR   rÈ   r   r   r   r   r   Útyping_extensionsr   r	   r
   r   r   r   Ú
torch.cudar   Ú_utilsr   Ú__all__r   r   ÚhasattrÚ_CÚ__dict__Útorch._Cr   r   r   r   r   r   r   rW   r[   rY   r   r   r   r   r   Ú<module>   sN    	ÿ

	tVû	ûû