from typing import Callable

import torch
import torch.overrides

from torch.fx import GraphModule

from torch._prims.utils import TensorMeta, getnvFuserDtype
from torch._prims.context import PrimContext

if torch.cuda.is_available():
    from torch._C._nvfuser import Fusion, FusionDefinition  # type: ignore[import]


def execute(ctx: PrimContext, *args, executor: str = "aten", **kwargs):
    """
    Prototype ATen executor.

    Just executes the context's graph.
    """

    if executor == "aten":
        # Runs the traced graph directly as an FX GraphModule
        gm = GraphModule({}, ctx.graph)
        return gm.forward(*args, **kwargs)
    elif executor == "nvfuser":
        if not torch.cuda.is_available():
            raise RuntimeError(
                "Attempting to use nvFuser trace executor but CUDA is not available!"
            )

        # The prototype nvFuser path does not support kwargs
        assert len(kwargs) == 0

        # Adds a placeholder for the fusion definition and moves it to the
        # front of the graph, so it becomes the graph module's first argument
        graph_fd = ctx.graph.placeholder("fd")
        ctx.graph._root.append(graph_fd)

        fusion = Fusion()
        with FusionDefinition(fusion) as fd:
            # Registers tensor args as nvFuser inputs; other args pass through
            nv_args = [fd]
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    x = fd.define_tensor(
                        arg.size(), arg.stride(), getnvFuserDtype(arg.dtype)
                    )
                    fd.add_input(x)
                    nv_args.append(x)
                else:
                    nv_args.append(arg)

            # Rewrites call_function nodes to their nvFuser implementations,
            # passing the fusion definition placeholder as the first argument
            for node in ctx.graph.nodes:
                if node.op == "call_function":
                    node.target = node.target.impl_nvfuser
                    node.args = (graph_fd,) + node.args

            gm = GraphModule({}, ctx.graph)
            out = gm.forward(*nv_args)
            fd.add_output(out)

            return fusion.execute(
                tuple(arg for arg in args if isinstance(arg, torch.Tensor))
            )[0]

    msg = "Received unexpected value for 'executor': {0}. Allowed values are: aten, nvfuser.".format(
        executor
    )
    raise ValueError(msg)


def make_traced(fn: Callable):
    """
    Returns a function that, when called, will
    trace its torch operations to prims and then
    execute those prims on the requested trace executor
    (possibly lowering them to that trace executor first).

    Only supports the torch operations defined in _torch_to_reference_map
    in context.py and operations with positional args. All args must
    be tensors and the function must return a single tensor. In the
    near future all these restrictions will be lifted.

    Example usage:

    def foo(a, b):
      return torch.add(a, b)

    traced_foo = make_traced(foo)

    a = torch.randn((1, 2, 3, 4, 5), device='cuda')
    b = torch.randn((1, 2, 3, 4, 5), device='cuda')
    result = traced_foo(a, b, executor='nvfuser')

    Executor may be either 'aten' or 'nvfuser'.
    """

    def _traced(*args, executor="aten"):
        # Traces fn by running it under the prim tracing torch function mode
        with torch.overrides.push_torch_function_mode(PrimContext) as ctx:
            placeholders = []
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    placeholders.append(ctx.placeholder(TensorMeta(arg)))
                else:
                    placeholders.append(ctx.placeholder(arg))

            result = fn(*placeholders)
            ctx.output(result)

        return execute(ctx, *args, executor=executor)

    return _traced