U
    dw                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&Z&e'e(Z)G dd dZ*eddG dd dZ+eddG dd de,Z-eddeG dd dZ.eddG dd deZ/eddej0j1ee	 ee2 ee2e	f dddZ3G dd dZ4dS )    N)defaultdict)	dataclass)
NamedTupleSequenceIterableAnyListDictOptionalTuple)get_size_of_node)map_arg)compatibility   )get_node_targetOperatorSupportBase)FxGraphDrawer)	ShapeProp)split_by_tags)FxNetAccFusionsFinderCALLABLE_NODE_OPSTensorsNodeListNodeSetis_node_output_tensorc                   @   s   e Zd Zdd ZdS )_SplitterSettingBasec                 C   sf   t  }|jddtdd |jddddd	 |jd
dddd	 | \}}|j| _|j| _|j| _d S )Nz--min_acc_module_sizer   z.Minimum size limit of an accelerator subgraph.)defaulttypehelpz--skip_fusionF
store_truezIf true then no fusion groups. Fusion group is used to enforce no non-tensor data flow between submodules. If we don't have this constrain, setting this to false is recommended as it can reduce overhead.)r   actionr   z--allow_non_tensora  For some backends non-tensor data flow between cpu and them are not allowed. Therefore, if a node supported by accelerator but it has non-tensor inputs or outputs to a cpu node we would want to consider it as a cpu node during splitting. However, for some backends we might not care about non-tensor data flow and we can set this option to true to disable the functionality that prevent non-tensor data flow.)argparseArgumentParseradd_argumentintparse_known_argsmin_acc_module_sizeskip_fusionallow_non_tensor)selfparserargsunknown r-   A/tmp/pip-unpacked-wheel-ua33x9lu/torch/fx/passes/splitter_base.py__init__!   s.    	z_SplitterSettingBase.__init__N)__name__
__module____qualname__r/   r-   r-   r-   r.   r       s   r   F)Zis_backward_compatiblec                   @   sR   e Zd ZdZejjeedddZ	e
dddZdd	 Zd
d ZedddZdS )FxNetAccNodesFindera  
    Finds a set of nodes that can be supported on ACC, excluding nodes that have non-tensor
    input/output to cpu nodes to prevent non-tensor data flow between backends and cpu.

    I.e. if we have a chain:

    ACC_NODE_1 -> ACC_NODE_2 -> ACC_NODE_3 -> CPU_NODE_1

    where every ACC node produces non-tensor output, then they all should be treated as CPU nodes.

    This behavior can be turned off by passing allow_non_tensor=True.
    moduleoperator_supportr(   c                 C   s   || _ || _|| _d S Nr4   )r)   r5   r6   r(   r-   r-   r.   r/   S   s    zFxNetAccNodesFinder.__init__)cpu_worklistc                 C   sH   |rD| d}|jD ],}|| jkr| j| t|s|| qq dS )a!  
        Transitively excludes nodes from ACC supported set.
        For every node in the worklist:
        - removes its downstream ACC nodes from ACC supported set,
        - if any downstream ACC node produces non-tensor output,
          then it gets added into the worklist.
        r   N)popusers	acc_nodesremover   append)r)   r8   nodeuserr-   r-   r.   (reduce_acc_nodes_non_tensor_input_helper]   s    



z<FxNetAccNodesFinder.reduce_acc_nodes_non_tensor_input_helperc                 C   sN   g }| j jjD ]0}|jtkrq|| jkr*qt|r4q|| q| | dS )z
        Excludes nodes from ACC supported set that have direct
        upstream CPU nodes that produce non-tensor outputs.
        N)	r5   graphnodesopr   r;   r   r=   r@   )r)   Znon_tensor_cpu_nodesr>   r-   r-   r.   !reduce_acc_nodes_non_tensor_inputp   s    

z5FxNetAccNodesFinder.reduce_acc_nodes_non_tensor_inputc                 C   sj   g }| j D ]2}t|rq
|jD ]}|| j kr||  q
qq
|sDqf|D ]}| j | qH| | q dS )z~
        Excludes nodes from ACC supported set that produce non-tensor
        outputs and have downstream CPU nodes.
        N)r;   r   r:   r=   r<   r@   )r)   Znew_cpu_nodesZacc_noder?   Znew_cpu_noder-   r-   r.   "reduce_acc_nodes_non_tensor_output   s    



z6FxNetAccNodesFinder.reduce_acc_nodes_non_tensor_outputreturnc                    sF   t  j  fdd jjjD  _ js@       jS )Nc                    s(   h | ] }|j tkr j|r|qS r-   )rC   r   r6   is_node_supported.0nr)   
submodulesr-   r.   	<setcomp>   s   
z/FxNetAccNodesFinder.__call__.<locals>.<setcomp>)	dictr5   named_modulesrA   rB   r;   r(   rD   rE   )r)   r-   rL   r.   __call__   s    zFxNetAccNodesFinder.__call__N)r0   r1   r2   __doc__torchfxGraphModuler   boolr/   r   r@   rD   rE   r   rQ   r-   r-   r-   r.   r3   D   s   r3   c                   @   s   e Zd ZdS )FxNetSplitterInternalErrorN)r0   r1   r2   r-   r-   r-   r.   rW      s   rW   c                   @   s   e Zd ZU eed< eed< dS )Subgraphis_accrB   N)r0   r1   r2   rV   __annotations__r   r-   r-   r-   r.   rX      s   
rX   c                   @   s6   e Zd ZU dZejjed< ee	e
f ed< e	ed< dS )SplitResultaB  
    Stores the results of the splitter.

    Attributes:
        split_module: root module after splitting.
        submodule_inputs: a dict that maps submodule name to its inputs.
        non_acc_submodule_prefix: the prefix for non acc submodules. For
            acc submodule the prefix is alwasy "_run_on_acc_".
    split_modulesubmodule_inputsZnon_acc_submodule_prefixN)r0   r1   r2   rR   rS   rT   rU   rZ   r	   strr   r-   r-   r-   r.   r[      s   

r[   )modelinputstarget_submodulesrG   c           	   
      s   g }i  t dd |  D  fdd}znz6|  D ] \}}||kr8||| q8| |  W n2 tk
r } ztd|  W 5 d}~X Y nX W 5 |D ]}|  qX  S )a~  
    Generate inputs for targeting submdoules in the given model. Note that if two submodules refer to the same obj, this
    function doesn't work.

    Args:
        model: root model.
        inputs: inputs to the root model.
        target_submodules: submodules that we want to generate inputs for.

    Returns:
        A dict that maps from submodule name to its inputs.
    c                 s   s   | ]\}}||fV  qd S r7   r-   )rJ   namemodr-   r-   r.   	<genexpr>   s     z1generate_inputs_for_submodules.<locals>.<genexpr>c                    s   | |  < d S r7   r-   )r5   Zmodule_inputsresultsZsubmodule_to_namesr-   r.   pre_forward   s    z3generate_inputs_for_submodules.<locals>.pre_forwardzDFailed to generate submodule inputs because of the following error:
N)rO   rP   r<   r=   register_forward_pre_hook	Exceptionwarningswarn)	r_   r`   ra   Zhandlesrg   hrb   rc   er-   re   r.   generate_inputs_for_submodules   s    &rn   c                   @   s  e Zd ZdZdZd8ejjee	 e
eedddZeejjef ddd	Zd
d ZejjeejjdddZejjeedddZejjedddZd9edddZd:edddZd;ee eejjef dddZeejjef ddd Z eed!d"d#Z!ed$d%d&Z"e#eef dd'd(Z$e%e& dd)d*Z'e%e& e%e& d+d,d-Z(e%e& d.d/d0Z)d<eejjd1d2d3Z*ejjdd4d5Z+e,dd6d7Z-dS )=_SplitterBasea  
    Splits a GraphModule into sub-GraphModules for execution on CPU or the accelerator.
    Output is a GraphModule with supported and unsupported operators grouped into as few sub-GraphModules as possible.
    Assumes that only "call_module", "call_function" and "call_method" from FX IR can potentially be executed on the accelerator.

    Given the following graph:
          ==> b ==>
        //         \
       a             d
        \         //
          ==> c ==>

    class SimpleModule(torch.nn.Module):
        def forward(self, a):
            b = torch.sin(a)
            c = torch.cos(a)
            d = b + c
            return d

    and providing "operator_support" that indicates that 'b' and 'c' can be executed on the accelerator,
    we will get the following split result:

    main:
    def forward(self, a):
        run_on_acc_0_0 = self._run_on_acc_0_0(a)
        getitem = run_on_acc_0_0[0]
        getitem_1 = run_on_acc_0_0[1]
        run_on_cpu_1_1 = self._run_on_cpu_1_1(getitem, getitem_1)
        return run_on_cpu_1_1

    _run_on_acc_0_0:
    def forward(self, a):
        sin_1 = torch.sin(a)
        cos_1 = torch.cos(a)
        return (sin_1, cos_1)

    _run_on_cpu_1_1:
    def forward(self, sin_1, cos_1):
        add_1 = sin_1 + cos_1
        return add_1
    l       d _run_on_cpu_)r5   sample_inputr6   settingsnon_acc_submodule_namec                 C   s   t |tjjst|| _t| jj|  || _|| _	|| _
t| j| j	| jj | _| jjrbi | _nt|| j | _|  | _|   || _dS )aP  
        Preprocesses graph before splitting:
        - finds nodes supported by ACC,
        - finds fusion groups for ACC nodes having non-tensor IO,
        - builds a graph of direct dependencies,
        - builds a map of fused nodes to their fusions.
        As a result we get self.acc_nodes, self.deps and self.fusions.
        N)
isinstancerS   rT   rU   AssertionErrorr5   r   	propagaterr   r6   rq   r3   r(   r;   r'   fusionsr   	find_depsdepsupdate_deps_for_fusionsrs   )r)   r5   rq   r6   rr   rs   r-   r-   r.   r/     s    
z_SplitterBase.__init__rF   c                 C   sL   t t}| jjjD ]4}|jtkr"q|jD ]}|jdkr(|| | q(q|S )a	  
        Builds a graph of node dependencies. Leaf nodes don't have any
        dependencies and the "output" node doesn't have nodes depending on it.

        Resulting graph has only direct dependencies, i.e. there are no
        transitive dependencies.
        output)	r   setr5   rA   rB   rC   r   r:   add)r)   ry   r>   r?   r-   r-   r.   rx   B  s    


z_SplitterBase.find_depsc                 C   sb   | j D ]V}| j | }|D ]B}| j| | j| |  |jD ]}||kr<| j| | q<qqdS )z
        Updates graph of dependencies so that:
        - nodes from the same fusion depend on the same set of outer nodes,
        - outer nodes depending on a fusion depend on all nodes in that fusion.
        N)rw   ry   updater:   r}   )r)   r>   fusionZfused_neighborr?   r-   r-   r.   rz   T  s    


z%_SplitterBase.update_deps_for_fusions)rc   r`   rG   c                 C   s   |S )z/
        Lower the model to a backend.
        r-   r)   rc   r`   r-   r-   r.   _lower_model_to_backendg  s    z%_SplitterBase._lower_model_to_backendc                 C   s   dS )z
        When an error occurs during lowering or running the lowered mod, we use this
        function to find culprits in the `mod` that causes the error.
        zMUnable to find a culprit because _find_culprit() function is not implemented.r-   r   r-   r-   r.   _find_culpritp  s    z_SplitterBase._find_culprit)rc   supported_nodesc                    sF   dddd G  fdddt }||ddd	}| }|d
 d S )NZ	AliceBlueZchartreuse1Zcrimson)r   	supportedunsupportedc                       s    e Zd Z fddZ  ZS )zE_SplitterBase._draw_graph_based_on_node_support.<locals>.CustomDrawerc                    sJ   t  |}|kr"d |d< n$|jtkr:d |d< nd |d< |S )Nr   Z	fillcolorr   r   )super_get_node_stylerC   r   )r)   r>   template)	__class__	color_mapr   r-   r.   r     s    
zU_SplitterBase._draw_graph_based_on_node_support.<locals>.CustomDrawer._get_node_style)r0   r1   r2   r   __classcell__r-   r   r   )r   r.   CustomDrawer  s   r   Znode_supportTZignore_getattrznode_support.dot)r   Zget_main_dot_graph	write_raw)r)   rc   r   r   drawer	dot_graphr-   r   r.   !_draw_graph_based_on_node_supportz  s    z/_SplitterBase._draw_graph_based_on_node_supportF)
dump_graphc              	      s  t | j }g }tt}tt}dd  | jjjD ]}|jtkrDq4t	||} fdd|j
D }t|tdd tt|D t| }	t|d |	 }
t fdd|j D }| j||r|| || |
|f q4|| |
|f q4|r
| | j| d}| D ]8\}}|D ](\}
}|| d	|
 d
t | d7 }q"q|d7 }| D ]8\}}|D ](\}
}|| d	|
 d
t | d7 }qlq`t| |S )Nc                 S   s   | j d}t|dd S )Ntensor_metadtype)metagetgetattr)argr   r-   r-   r.   	get_dtype  s    z5_SplitterBase.node_support_preview.<locals>.get_dtypec                    s&   g | ]}t |tjjr |nd qS r7   rt   rS   rT   Node)rJ   r   r   r-   r.   
<listcomp>  s   z6_SplitterBase.node_support_preview.<locals>.<listcomp>c                 s   s   | ]\}}|d k	r|V  qd S r7   r-   )rJ   ir   r-   r-   r.   rd     s   z5_SplitterBase.node_support_preview.<locals>.<genexpr>c                 3   s,   | ]$\}}t |tjjr| |fV  qd S r7   r   )rJ   kr   r   r-   r.   rd     s   z$
Supported node types in the model:
z: (z, z)
z&
Unsupported node types in the model:
)rO   r5   rP   r   r|   rA   rB   rC   r   r   r+   lennext	enumeratereversedtuplekwargsitemsr6   rH   r=   r}   r   print)r)   r   rM   r   Zsupported_node_typesZunsupported_node_typesr>   targetZ
arg_dtypes
last_indexZarg_dtypes_tupleZkwarg_dtypes_tuplereportstZdtypesr-   r   r.   node_support_preview  sL    





&&z"_SplitterBase.node_support_previewc              	      s`  d |   }tdd |D }t|| } dt| d7   d| d| d7  | |}td	d |D }t|| } d
t| d7   d| d| d7  t|D ]B\}} |jrd| dn| j | d7   t|j d7  q| | | jdd}|	  |rPt
|ddd}| }	|	 D ]\}
}||
 d q4| j}d}|jjD ]}|jdkrbd|jkrb d|j d7  t||jdd }||| j}tj|  d}d d7  jjD ]R}|jdkrt|s d|j d7  n|t|d 7 }|jdkr|}qڈ d7  tjjd  fd!d"}t|j| | jt| } d#| d$ d%7   d&| d'7  ||k r|}|j}z| |}W n4 t k
r    d(7   | !|7  Y qbY nX z||  W n. t k
r&    d)7   | !|7  Y n
X  d*7  qb d+| d%7   d,| d-7  t"   S ).N c                 S   s   g | ]}|j r|qS r-   rY   rJ   gr-   r-   r.   r     s      z/_SplitterBase.split_preview.<locals>.<listcomp>z+Before removing small acc subgraphs, total z subgraphs are created: z acc subgraphs and z cpu subgraphs.
c                 S   s   g | ]}|j r|qS r-   r   r   r-   r-   r.   r     s      z*After removing small acc subgraphs, total _run_on_acc_z: z	 node(s)
T)
remove_tagpreviewr   z.dotZcall_moduleaccz
Processing acc submodule 
c                    s.   d   fdd}| |}| |  |   S )Nc                    s   | d S r7   r-   )r)   r`   Z
sub_inputsr-   r.   
get_inputs  s    zJ_SplitterBase.split_preview.<locals>.get_submod_inputs.<locals>.get_inputs)rh   r<   )Zmain_modsubmodZexample_inputsr   handler-   r   r.   get_submod_inputs  s    
z6_SplitterBase.split_preview.<locals>.get_submod_inputsr   zChecking inputs...
placeholderzInput = is not a tensor, this might cause problems during lowering!
r{   zChecking outputs...
r>   c                    s2   t | s d| j d7  nt| d 7 d S )NzOutput r   r   )r   rb   r   r   r   r   Ztotal_output_bytesr-   r.   	get_bytes  s    z._SplitterBase.split_preview.<locals>.get_byteszTotal input size in bytes is z , total output size in bytes is ,zF theoretical max qps (bounds by PCIe bandwidth) for this submodule is z.
z#Run into an error during lowering!
z$Run into an error during inference!
zLowering and running succeed!
zB
Theoretical max qps (bounds by PCIe bandwidth) for this model is z bottleneck is submodule .)#put_nodes_into_subgraphsr   remove_small_acc_subgraphsr   rY   rs   rB   tagsplitevalr   Zget_all_dot_graphsr   r   PCIe_BWrA   rC   r   r   rq   r   rv   r   rb   r   rS   rT   r   r   r+   maxr   RuntimeErrorr   r   )r)   r   	subgraphsZacc_subgraphs_numZcpu_subgraphs_numr   subgraphZ	split_modr   Z
dot_graphsrb   r   Zmax_qpsZbottleneck_moduler>   r   Zsubmod_inputsZtotal_input_bytesrK   Zoutput_noder   ZqpsZlowered_submodr-   r   r.   split_preview  s    
&
    

z_SplitterBase.split_previewN)tag_idrG   c                 C   sn   t t}| jjjD ]V}|jtkr"q|jD ]>}|jtkr8q(|dksXt|j	
dd |k r(|| | q(q|S )z
        Builds reversed topological node dependencies, if tag_id is specified,
        we ignore nodes that are in later subgraph i.e. nodes have greater tag_id.
        N_)r   r|   r5   rA   rB   rC   r   r:   r$   r   r   r}   )r)   r   resultr>   r?   r-   r-   r.   find_reverse_depsF  s    


 z_SplitterBase.find_reverse_depsry   c                 C   s   t  }| j D ]x\}}||kr"qt  }|D ]}|||  q,|| |D ]8}|||< |jD ]}||kr`|| | q`|| qNqd S r7   )r|   rw   r   r~   difference_updateall_input_nodesr}   )r)   ry   Zprocessed_noder>   r   Znew_deprK   r   r-   r-   r.   update_reverse_deps_for_fusions\  s    

z-_SplitterBase.update_reverse_deps_for_fusions)r   rG   c                 C   sX   t  }| jjjD ]B}|jtkr|j|kr|jD ]"}|jtkr.|j|kr.|| q.q|S )z
        Finds parent nodes of the `tag` subgraph.

        Traverse the inputs of nodes in the subgraph, if input doesn't belong to the subgraph
        and is not a placeholder, we consider it as the parent node of the subgraph.
        )	r|   r5   rA   rB   rC   r   r   r   r}   )r)   r   parent_nodesr>   r   r-   r-   r.   find_parent_nodes_of_subgraphy  s    
z+_SplitterBase.find_parent_nodes_of_subgraph)r   c           	      C   s   | j t|dd d}| | | |}t }|rd}|D ]"}|| |kr@|| jkr@|} qdq@|dkrnq||_|| |	| || j
kr| j
| D ]}||kr|	| q|jD ] }|jtkr||kr|	| qq4dS )z^
        Extend the acc subgraph with `tag` going the reversed topological direction.
        r   r   )r   N)r   r$   r   r   r   r|   r;   r   r<   r}   rw   r   rC   r   )	r)   r   ry   r   visited_nodesr>   rK   Zfusion_noder   r-   r-   r.   extend_acc_subgraph  s,    





z!_SplitterBase.extend_acc_subgraphc                 C   s\   t  }t  }| jjjD ]<}|jdkr&q|jD ]$}|| jkrF|| q,|| q,q||fS )zK
        Finds nodes that consume module inputs or get_attr nodes.
        >   r   get_attr)r|   r5   rA   rB   rC   r:   r;   r}   )r)   Zstarter_cpu_nodesZstarter_acc_nodesr>   r?   r-   r-   r.   starter_nodes  s    


z_SplitterBase.starter_nodesc           
         sh     \}}t d}|D ]} j| krd} q6qg }g }|sH|r>|rP|n|}t fdd|D d }|d kr|std|t||d | }g }q>|| | || | j	kr| j
kr| j	|   n| j	|   |jD ]8}	|	jtkrq|	 j
kr.||	 n
||	 qq>|rV|t||d |sdtd|S )NTFc                 3   s    | ]} j | kr|V  qd S r7   r   rI   r)   r   r-   r.   rd     s      z9_SplitterBase.put_nodes_into_subgraphs.<locals>.<genexpr>zSubgraph can't be empty)rY   rB   zCouldn't create subgraphs)r   r|   ry   r   rW   r=   rX   r<   r}   rw   r;   r~   r:   rC   r   )
r)   Zcurrent_cpu_nodesZcurrent_acc_nodesZacc_subgraphrK   Zcurrent_subgraph_nodesr   Zcurrent_nodesr>   r?   r-   r   r.   r     sX    








z&_SplitterBase.put_nodes_into_subgraphs)r   rG   c                 C   s   g }|D ]~}|j rZt|j| jjkr0|| q|rH|d j|j qd|_ || q|r||d j s||d j|j q|| q|S )z
        This pass finds ACC submodules with less than specified size and merges
        them with adjacent CPU submodules.
        r   F)rY   r   rB   rr   r&   r=   extend)r)   r   r   r   r-   r-   r.   r     s    z(_SplitterBase.remove_small_acc_subgraphs)r   c                 C   s|   g | _ |D ]l}| j}|jr*dt| j  n| j t| j  }| j | |jD ]$}t|drntd| d||_qPq
d S )Nr   r   zNode z was already tagged)	tagsrs   rY   r   r=   rB   hasattrrW   r   )r)   r   r   Zsubgraph_namer   r>   r-   r-   r.   r   $  s    *

z_SplitterBase.tag)r   rG   c                 C   s4   t | j| j}|r0| jjjD ]}t|dr|`q|S )Nr   )r   r5   r   rA   rB   r   r   )r)   r   r\   r>   r-   r-   r.   r   0  s    
z_SplitterBase.splitc                 C   s$   |   }| |}| | |  S r7   )r   r   r   r   )r)   r   r-   r-   r.   rQ   8  s    

z_SplitterBase.__call__c                 C   sB   |  }g }|  D ]\}}|| qt|| j|}t||| jS r7   )Znamed_childrenr=   rn   rq   r[   rs   )r)   r\   Zsubmodule_namesrb   rc   r]   r-   r-   r.   generate_split_results>  s    z$_SplitterBase.generate_split_results)rp   )F)F)N)F).r0   r1   r2   rR   r   rS   rT   rU   r   r   r   r   r^   r/   r	   r   r   rx   rz   r   nnModuler   r   r   r   rV   r   r   r
   r$   r   r   r   r   r   r   r   rX   r   r   r   r   rQ   r[   r   r-   r-   r-   r.   ro      sR   + ) 
  As .Ero   )5r!   collectionsr   Zdataclassesr   typingr   r   r   r   r   r	   r
   r   loggingrS   Z"torch.fx.passes.graph_manipulationr   Ztorch.fx.noder   Ztorch.fx._compatibilityr   r6   r   r   Zgraph_drawerr   Z
shape_propr   Zsplit_utilsr   Ztools_commonr   r   r   r   r   r   rj   	getLoggerr0   Z_LOGGERr   r3   ri   rW   rX   r[   r   r   r^   rn   ro   r-   r-   r-   r.   <module>   s@   ( 
$d
%