# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DETA model."""


import copy
import math
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from ...activations import ACT2FN
from ...file_utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_scipy_available,
    is_vision_available,
    replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_torchvision_available, logging, requires_backends
from ..auto import AutoBackbone
from .configuration_deta import DetaConfig


logger = logging.get_logger(__name__)

if is_vision_available():
    from transformers.image_transforms import center_to_corners_format

if is_torchvision_available():
    from torchvision.ops.boxes import batched_nms

if is_scipy_available():
    from scipy.optimize import linear_sum_assignment


_CONFIG_FOR_DOC = "DetaConfig"
_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"

DETA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "jozhang97/deta-swin-large-o365",
]


@dataclass
class DetaDecoderOutput(ModelOutput):
    """
    Base class for outputs of the DetaDecoder. This class adds two attributes to BaseModelOutputWithCrossAttentions,
    namely:
    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
    - a stacked tensor of intermediate reference points.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    intermediate_hidden_states: torch.FloatTensor = None
    intermediate_reference_points: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class DetaModelOutput(ModelOutput):
    """
    Base class for outputs of the Deformable DETR encoder-decoder model.

    Args:
        init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
            Initial reference points sent through the Transformer decoder.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
            plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
            foreground and background).
        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the first stage.
    """

    init_reference_points: torch.FloatTensor = None
    last_hidden_state: torch.FloatTensor = None
    intermediate_hidden_states: torch.FloatTensor = None
    intermediate_reference_points: torch.FloatTensor = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    enc_outputs_class: Optional[torch.FloatTensor] = None
    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None


@dataclass
class DetaObjectDetectionOutput(ModelOutput):
    """
    Output type of [`DetaForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
            plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
            in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
            Stacked intermediate hidden states (output of each layer of the decoder).
        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
            Stacked intermediate reference points (reference points of each layer of the decoder).
        init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
            Initial reference points sent through the Transformer decoder.
        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
            foreground and background).
        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
            Logits of predicted bounding boxes coordinates in the first stage.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    auxiliary_outputs: Optional[List[Dict]] = None
    init_reference_points: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    intermediate_hidden_states: Optional[torch.FloatTensor] = None
    intermediate_reference_points: Optional[torch.FloatTensor] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    enc_outputs_class: Optional = None
    enc_outputs_coord_logits: Optional = None


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
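

# For intuition, `inverse_sigmoid` is the (clamped) inverse of `torch.sigmoid`; the
# snippet below is purely illustrative and not part of the module's public API:
#
#     x = torch.tensor([0.25, 0.50, 0.75])
#     assert torch.allclose(torch.sigmoid(inverse_sigmoid(x)), x, atol=1e-4)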


class DetaFrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    """

    def __init__(self, n):
        super().__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        num_batches_tracked_key = prefix + "num_batches_tracked"
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        # move reshapes to the beginning to make the computation user-friendly
        weight = self.weight.reshape(1, -1, 1, 1)
        bias = self.bias.reshape(1, -1, 1, 1)
        running_var = self.running_var.reshape(1, -1, 1, 1)
        running_mean = self.running_mean.reshape(1, -1, 1, 1)
        epsilon = 1e-5
        scale = weight * (running_var + epsilon).rsqrt()
        bias = bias - running_mean * scale
        return x * scale + bias


def replace_batch_norm(model):
    """
    Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.

    Args:
        model (torch.nn.Module):
            input model
    """
    for name, module in model.named_children():
        if isinstance(module, nn.BatchNorm2d):
            new_module = DetaFrozenBatchNorm2d(module.num_features)

            new_module.weight.data.copy_(module.weight)
            new_module.bias.data.copy_(module.bias)
            new_module.running_mean.data.copy_(module.running_mean)
            new_module.running_var.data.copy_(module.running_var)

            model._modules[name] = new_module

        if len(list(module.children())) > 0:
            replace_batch_norm(module)


class DetaBackboneWithPositionalEncodings(nn.Module):
    """

    nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above.
    c              	      s   t    t|j}t  t| W 5 Q R X || _| jj	| _
|jjdkr| j D ]*\}}d|krXd|krXd|krX|d qXt|| _d S )NZresnetzstages.1zstages.2zstages.3F)rZ   r[   r   from_configZbackbone_configr+   no_gradrq   rr   Zchannelsintermediate_channel_sizesZ
model_typenamed_parametersrequires_grad_build_position_encodingposition_embedding)r^   configbackboners   Z	parameterr`   r.   r/   r[   H  s    


z,DetaBackboneWithPositionalEncodings.__init__)pixel_values
pixel_maskc           	      C   s   |  |j}g }g }|D ]^}tjj|d  |jdd dtj	d }| 
|||j}|||f || q||fS )z
        Outputs feature maps of latter stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise
        outputs feature maps of C_5.
        """
        features = self.model(pixel_values).feature_maps

        out = []
        pos = []
        for feature_map in features:
            # downsample pixel_mask to match the shape of the corresponding feature map
            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
            position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)
            out.append((feature_map, mask))
            pos.append(position_embeddings)

        return out, pos


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None):
    """
    Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`.
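
    Illustrative example (made-up shapes): a padding mask of shape `(2, 10)` becomes an additive mask of shape
    `(2, 1, 10, 10)` whose masked positions hold the most negative value representable in `dtype`:

    ```python
    mask = torch.ones(2, 10)  # 1 = attend, 0 = padding
    additive = _expand_mask(mask, torch.float32)
    assert additive.shape == (2, 1, 10, 10)
    ```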
    """
    batch_size, source_len = mask.size()
    target_len = target_len if target_len is not None else source_len

    expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


class DetaSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
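
    Concretely (standard sinusoidal formulation, noted here for reference): for a pixel at cumulative-sum position
    `pos` and channel index `i`, the embedding alternates `sin(pos / temperature^(2i/d))` and
    `cos(pos / temperature^(2i/d))` along the channel dimension, computed separately for x and y.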
    """

    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, pixel_values, pixel_mask):
        if pixel_mask is None:
            raise ValueError("No pixel mask provided")
        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class DetaLearnedPositionEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, embedding_dim=256):
        super().__init__()
        self.row_embeddings = nn.Embedding(50, embedding_dim)
        self.column_embeddings = nn.Embedding(50, embedding_dim)

    def forward(self, pixel_values, pixel_mask=None):
        height, width = pixel_values.shape[-2:]
        width_values = torch.arange(width, device=pixel_values.device)
        height_values = torch.arange(height, device=pixel_values.device)
        x_emb = self.column_embeddings(width_values)
        y_emb = self.row_embeddings(height_values)
        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
        pos = pos.permute(2, 0, 1)
        pos = pos.unsqueeze(0)
        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(config):
    n_steps = config.d_model // 2
    if config.position_embedding_type == "sine":
        position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True)
    elif config.position_embedding_type == "learned":
        position_embedding = DetaLearnedPositionEmbedding(n_steps)
    else:
        raise ValueError(f"Not supported {config.position_embedding_type}")

    return position_embedding
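

# Shape walkthrough for the function below (the concrete numbers are made up, for
# orientation only): with batch_size=2, 8 heads, 4 feature levels and 4 sampled
# points per level, `value` is (2, sum(H_l * W_l), 8, head_dim), `sampling_locations`
# is (2, num_queries, 8, 4, 4, 2) in normalized [0, 1] coordinates, and
# `attention_weights` is (2, num_queries, 8, 4, 4); the output is
# (2, num_queries, 8 * head_dim).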
def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
    batch_size, _, num_heads, hidden_dim = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for level_id, (height, width) in enumerate(value_spatial_shapes):
        # batch_size, height*width, num_heads, hidden_dim
        # -> batch_size*num_heads, hidden_dim, height, width
        value_l_ = (
            value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
        )
        # batch_size, num_queries, num_heads, num_points, 2
        # -> batch_size*num_heads, num_queries, num_points, 2
        sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
        # batch_size*num_heads, hidden_dim, num_queries, num_points
        sampling_value_l_ = nn.functional.grid_sample(
            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampling_value_list.append(sampling_value_l_)
    # (batch_size, num_queries, num_heads, num_levels, num_points)
    # -> (batch_size*num_heads, 1, num_queries, num_levels*num_points)
    attention_weights = attention_weights.transpose(1, 2).reshape(
        batch_size * num_heads, 1, num_queries, num_levels * num_points
    )
    output = (
        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
        .sum(-1)
        .view(batch_size, num_heads * hidden_dim, num_queries)
    )
    return output.transpose(1, 2).contiguous()


class DetaMultiscaleDeformableAttention(nn.Module):
    """
    Multiscale deformable attention as proposed in Deformable DETR.
    """

    def __init__(self, embed_dim: int, num_heads: int, n_levels: int, n_points: int):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim (d_model) must be divisible by num_heads, but got {embed_dim} and {num_heads}"
            )
        dim_per_head = embed_dim // num_heads
        # check if dim_per_head is a power of 2
        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
            warnings.warn(
                "You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the dimension of"
                " each attention head a power of 2 which is more efficient in the authors' CUDA implementation."
            )

        self.im2col_step = 64

        self.d_model = embed_dim
        self.n_levels = n_levels
        self.n_heads = num_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(embed_dim, num_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(embed_dim, num_heads * n_levels * n_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.n_heads, 1, 1, 2)
            .repeat(1, self.n_levels, self.n_points, 1)
        )
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        nn.init.constant_(self.attention_weights.weight.data, 0.0)
        nn.init.constant_(self.attention_weights.bias.data, 0.0)
        nn.init.xavier_uniform_(self.value_proj.weight.data)
        nn.init.constant_(self.value_proj.bias.data, 0.0)
        nn.init.xavier_uniform_(self.output_proj.weight.data)
        nn.init.constant_(self.output_proj.bias.data, 0.0)

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        return tensor if position_embeddings is None else tensor + position_embeddings

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
        # add position embeddings to the hidden states before projecting to queries and keys
        if position_embeddings is not None:
            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)

        batch_size, num_queries, _ = hidden_states.shape
        batch_size, sequence_length, _ = encoder_hidden_states.shape
        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
            raise ValueError(
                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
            )

        value = self.value_proj(encoder_hidden_states)
        if attention_mask is not None:
            # we invert the attention_mask
            value = value.masked_fill(~attention_mask[..., None], float(0))
        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(hidden_states).view(
            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
        )
        attention_weights = self.attention_weights(hidden_states).view(
            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
        )
        attention_weights = F.softmax(attention_weights, -1).view(
            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
        )
        # batch_size, num_queries, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = (
                reference_points[:, :, None, :, None, :]
                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            )
        elif reference_points.shape[-1] == 4:
            sampling_locations = (
                reference_points[:, :, None, :, None, :2]
                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
            )
        else:
            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")

        output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)

        return output, attention_weights


class DetaMultiheadAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
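
    Illustrative usage (the shapes below are made up, for orientation only):

    ```python
    attention = DetaMultiheadAttention(embed_dim=256, num_heads=8)
    hidden_states = torch.randn(1, 100, 256)
    position_embeddings = torch.randn(1, 100, 256)
    output, weights = attention(hidden_states, position_embeddings=position_embeddings, output_attentions=True)
    ```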
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        return tensor if position_embeddings is None else tensor + position_embeddings

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, target_len, embed_dim = hidden_states.size()
        # add position embeddings to the hidden states before projecting to queries and keys;
        # values are computed from the original (position-free) hidden states
        hidden_states_original = hidden_states
        if position_embeddings is not None:
            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)

        # get queries, keys and values
        query_states = self.q_proj(hidden_states) * self.scaling
        key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
        value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)

        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        source_len = key_states.size(1)

        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            # expand the 2D padding mask to an additive 4D mask
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
            if attention_mask.size() != (batch_size, 1, target_len, source_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
                    f" {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # reshape twice so that the returned attention weights keep their gradient
            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class DetaEncoderLayer(nn.Module):
    def __init__(self, config: DetaConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = DetaMultiscaleDeformableAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            n_levels=config.num_feature_levels,
            n_points=config.encoder_n_points,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: torch.Tensor = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Input to the layer.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Attention mask.
            position_embeddings (`torch.FloatTensor`, *optional*):
                Position embeddings, to be added to `hidden_states`.
            reference_points (`torch.FloatTensor`, *optional*):
                Reference points.
            spatial_shapes (`torch.LongTensor`, *optional*):
                Spatial shapes of the backbone feature maps.
            level_start_index (`torch.LongTensor`, *optional*):
                Level start index.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if self.training:
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class DetaDecoderLayer(nn.Module):
    def __init__(self, config: DetaConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # self-attention
        self.self_attn = DetaMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # cross-attention
        self.encoder_attn = DetaMultiscaleDeformableAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            n_levels=config.num_feature_levels,
            n_points=config.decoder_n_points,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # feedforward neural networks
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            position_embeddings (`torch.FloatTensor`, *optional*):
                Position embeddings that are added to the queries and keys in the self-attention layer.
            reference_points (`torch.FloatTensor`, *optional*):
                Reference points.
            spatial_shapes (`torch.LongTensor`, *optional*):
                Spatial shapes.
            level_start_index (`torch.LongTensor`, *optional*):
                Level start index.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        second_residual = hidden_states

        # Cross-Attention
        cross_attn_weights = None
        hidden_states, cross_attn_weights = self.encoder_attn(
            hidden_states=hidden_states,
            attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = second_residual + hidden_states

        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class DetaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor):
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class DetaPreTrainedModel(PreTrainedModel):
    config_class = DetaConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        std = self.config.init_std

        if isinstance(module, DetaLearnedPositionEmbedding):
            nn.init.uniform_(module.row_embeddings.weight)
            nn.init.uniform_(module.column_embeddings.weight)
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if hasattr(module, "reference_points") and not self.config.two_stage:
            nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
            nn.init.constant_(module.reference_points.bias.data, 0.0)
        if hasattr(module, "level_embed"):
            nn.init.normal_(module.level_embed)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, DetaDecoder):
            module.gradient_checkpointing = value


DETA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DetaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DETA_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.

            Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

            [What are attention masks?](../glossary#attention-mask)

        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


class DetaEncoder(DetaPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
    [`DetaEncoderLayer`].

    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.

    Args:
        config: DetaConfig
    """

    def __init__(self, config: DetaConfig):
        super().__init__(config)

        self.dropout = config.dropout
        self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)])

        # Initialize weights and apply final processing
        self.post_init()

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """
        Get reference points for each feature map. Used in decoder.

        Args:
            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
                Spatial shapes of each feature map.
            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                Valid ratios of each feature map.
            device (`torch.device`):
                Device on which to create the tensors.
        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
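
        Example (illustrative, made-up sizes):

        ```python
        spatial_shapes = torch.tensor([[2, 2], [1, 1]])
        valid_ratios = torch.ones(1, 2, 2)
        points = DetaEncoder.get_reference_points(spatial_shapes, valid_ratios, device="cpu")
        assert points.shape == (1, 5, 2, 2)  # 2*2 + 1*1 = 5 cell-center locations
        ```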
        """
        reference_points_list = []
        for level, (height, width) in enumerate(spatial_shapes):
            ref_y, ref_x = meshgrid(
                torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
                torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
                indexing="ij",
            )
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
            ref = torch.stack((ref_x, ref_y), -1)
            reference_points_list.append(ref)
        reference_points = torch.cat(reference_points_list, 1)
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
        return reference_points

    def forward(
        self,
        inputs_embeds=None,
        attention_mask=None,
        position_embeddings=None,
        spatial_shapes=None,
        level_start_index=None,
        valid_ratios=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
                - 1 for pixel features that are real (i.e. **not masked**),
                - 0 for pixel features that are padding (i.e. **masked**).
                [What are attention masks?](../glossary#attention-mask)
            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Position embeddings that are added to the queries and keys in each self-attention layer.
            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
                Spatial shapes of each feature map.
            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
                Starting index of each feature map.
            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                Ratio of valid area in each feature level.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states = inputs_embeds
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                position_embeddings=position_embeddings,
                reference_points=reference_points,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class DetaDecoder(DetaPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetaDecoderLayer`].

    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.

    Some tweaks for Deformable DETR:

    - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
    - it also returns a stack of intermediate outputs and reference points from all decoding layers.

    Args:
        config: DetaConfig
    r	  c                    sR   t     j| _t fddt jD | _d| _d | _	d | _
|   d S )Nc                    s   g | ]}t  qS r.   )r  r4  r	  r.   r/   rE     s     z(DetaDecoder.__init__.<locals>.<listcomp>F)rZ   r[   r   r	   rF   rG   decoder_layersr5  r0  
bbox_embedclass_embedr6  r  r`   r	  r/   r[     s     zDetaDecoder.__init__Nc                    s   dk	r n| j j |
dk	r |
n| j j}
|dk	r4|n| j j}|dk	rH|}|
rPdnd} r\dnd} rp|dk	rpdnd}d}d}t| jD ]\}}|jd dkr|dddddf t||gddddf  }n<|jd dkrt	d|dddddf |dddf  }|
r||f7 }| j
rV| jrV fdd}tjj|||||d}n|||||||| d	}|d
 }| jdk	r| j| |}|jd dkr|t| }| }nT|jd dkrt	d|jd  |}|dddf t| |dddf< | }| }||f7 }||f7 } r||d f7 }|dk	r||d f7 }qtj|dd}tj|dd}|
r|||f7 }|stdd ||||||fD S t||||||dS )aj	  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
                The query embeddings that are passed into the decoder.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
                in `[0, 1]`:
                - 1 for pixels that are real (i.e. **not masked**),
                - 0 for pixels that are padding (i.e. **masked**).
            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
                Position embeddings that are added to the queries and keys in each self-attention layer.
            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
                Spatial shapes of the feature maps.
            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
                Indexes for the start of each feature level. In range `[0, sequence_length]`.
            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
                Ratio of valid area in each feature level.

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is not None:
            hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        intermediate = ()
        intermediate_reference_points = ()

        for idx, decoder_layer in enumerate(self.layers):
            if reference_points.shape[-1] == 4:
                reference_points_input = (
                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
                )
            elif reference_points.shape[-1] == 2:
                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
            else:
                raise ValueError("Reference points' last dimension must be of size 2")

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    position_embeddings=position_embeddings,
                    encoder_hidden_states=encoder_hidden_states,
                    reference_points=reference_points_input,
                    spatial_shapes=spatial_shapes,
                    level_start_index=level_start_index,
                    encoder_attention_mask=encoder_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            # hack implementation for iterative bounding box refinement
            if self.bbox_embed is not None:
                tmp = self.bbox_embed[idx](hidden_states)
                if reference_points.shape[-1] == 4:
                    new_reference_points = tmp + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                elif reference_points.shape[-1] == 2:
                    new_reference_points = tmp
                    new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                else:
                    raise ValueError(
                        f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}"
                    )
                reference_points = new_reference_points.detach()

            intermediate += (hidden_states,)
            intermediate_reference_points += (reference_points,)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # Keep batch_size as first dimension
        intermediate = torch.stack(intermediate, dim=1)
        intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    intermediate,
                    intermediate_reference_points,
                    all_hidden_states,
                    all_self_attns,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return DetaDecoderOutput(
            last_hidden_state=hidden_states,
            intermediate_hidden_states=intermediate,
            intermediate_reference_points=intermediate_reference_points,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    """
    The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
    any specific head on top.
    """,
    DETA_START_DOCSTRING,
)
class DetaModel(DetaPreTrainedModel):
    def __init__(self, config: DetaConfig):
        super().__init__(config)

        if config.two_stage:
            requires_backends(self, ["torchvision"])

        # Create backbone with positional encoding
        self.backbone = DetaBackboneWithPositionalEncodings(config)
        intermediate_channel_sizes = self.backbone.intermediate_channel_sizes

        # Create input projection layers
        if config.num_feature_levels > 1:
            num_backbone_outs = len(intermediate_channel_sizes)
            input_proj_list = []
            for _ in range(num_backbone_outs):
                in_channels = intermediate_channel_sizes[_]
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, config.d_model, kernel_size=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                )
            for _ in range(config.num_feature_levels - num_backbone_outs):
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                )
                in_channels = config.d_model
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            self.input_proj = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
                        nn.GroupNorm(32, config.d_model),
                    )
                ]
            )

        if not config.two_stage:
            self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)

        self.encoder = DetaEncoder(config)
        self.decoder = DetaDecoder(config)

        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))

        if config.two_stage:
            self.enc_output = nn.Linear(config.d_model, config.d_model)
            self.enc_output_norm = nn.LayerNorm(config.d_model)
            self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
            self.pix_trans = nn.Linear(config.d_model, config.d_model)
            self.pix_trans_norm = nn.LayerNorm(config.d_model)
        else:
            self.reference_points = nn.Linear(config.d_model, 2)

        self.assign_first_stage = config.assign_first_stage
        self.two_stage_num_proposals = config.two_stage_num_proposals

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(False)

    def unfreeze_backbone(self):
        for name, param in self.backbone.conv_encoder.model.named_parameters():
            param.requires_grad_(True)

    def get_valid_ratio(self, mask):
        """Get the valid ratio of all feature maps."""

        _, height, width = mask.shape
        valid_height = torch.sum(mask[:, :, 0], 1)
        valid_width = torch.sum(mask[:, 0, :], 1)
        valid_ratio_heigth = valid_height.float() / height
        valid_ratio_width = valid_width.float() / width
        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
        return valid_ratio

    def get_proposal_pos_embed(self, proposals):
        """Get the position embedding of the proposals."""

        num_pos_feats = self.config.d_model // 2
        temperature = 10000
        scale = 2 * math.pi

        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
        dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
        # batch_size, num_queries, 4
        proposals = proposals.sigmoid() * scale
        # batch_size, num_queries, 4, num_pos_feats
        pos = proposals[:, :, :, None] / dim_t
        # batch_size, num_queries, 4, num_pos_feats/2, 2 -> flattened
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos

    def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
        """Generate the encoder output proposals from encoded enc_output.

        Args:
            enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
            padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
            spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.

        Returns:
            `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
                - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
                  directly predict a bounding box. (without the need of a decoder)
                - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
                  sigmoid.
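
        Example (illustrative numbers): for a single `8x8` feature level, `sequence_length` is 64 and each valid
        location contributes one proposal centered on its cell, with width/height `0.05 * 2**level` before the
        inverse sigmoid.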
        """
        batch_size = enc_output.shape[0]
        proposals = []
        _cur = 0
        level_ids = []
        for level, (height, width) in enumerate(spatial_shapes):
            mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
            valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
            valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

            grid_y, grid_x = meshgrid(
                torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
                torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
                indexing="ij",
            )
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)

            scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
            grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
            width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)
            proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)
            proposals.append(proposal)
            _cur += height * width
            level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level)

        output_proposals = torch.cat(proposals, 1)
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid
        output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))

        # assign each pixel as an object query
        object_query = enc_output
        object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
        object_query = object_query.masked_fill(~output_proposals_valid, float(0))
        object_query = self.enc_output_norm(self.enc_output(object_query))
        level_ids = torch.cat(level_ids)
        return object_query, output_proposals, level_ids

    @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_mask: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.FloatTensor] = None,
        encoder_outputs: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], DetaModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetaModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large-o365")
        >>> model = DetaModel.from_pretrained("jozhang97/deta-swin-large-o365", two_stage=False)

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 900, 256]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, num_channels, height, width = pixel_values.shape
        device = pixel_values.device

        if pixel_mask is None:
            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)

        # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
        # First, send pixel_values + pixel_mask through the backbone to obtain the features
        features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)

        # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
        sources = []
        masks = []
        for level, (source, mask) in enumerate(features):
            sources.append(self.input_proj[level](source))
            masks.append(mask)
            if mask is None:
                raise ValueError("No attention mask was provided")

        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
        if self.config.num_feature_levels > len(sources):
            _len_sources = len(sources)
            for level in range(_len_sources, self.config.num_feature_levels):
                if level == _len_sources:
                    source = self.input_proj[level](features[-1][0])
                else:
                    source = self.input_proj[level](sources[-1])
                mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
                sources.append(source)
                masks.append(mask)
                position_embeddings_list.append(pos_l)

        # Create queries
        query_embeds = None
        if not self.config.two_stage:
            query_embeds = self.query_position_embeddings.weight

        # Prepare encoder inputs (by flattening)
        spatial_shapes = [source.shape[2:] for source in sources]
        source_flatten = [source.flatten(2).transpose(1, 2) for source in sources]
        mask_flatten = [mask.flatten(1) for mask in masks]

        lvl_pos_embed_flatten = []
        for level, pos_embed in enumerate(position_embeddings_list):
            pos_embed = pos_embed.flatten(2).transpose(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
        source_flatten = torch.cat(source_flatten, 1)
        mask_flatten = torch.cat(mask_flatten, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device)
        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
        valid_ratios = valid_ratios.float()

        # Send source_flatten + mask_flatten + lvl_pos_embed_flatten through the encoder,
        # also providing spatial_shapes, level_start_index and valid_ratios
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                inputs_embeds=source_flatten,
                attention_mask=mask_flatten,
                position_embeddings=lvl_pos_embed_flatten,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                valid_ratios=valid_ratios,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # Prepare decoder inputs
        batch_size, _, num_channels = encoder_outputs[0].shape
        enc_outputs_class = None
        enc_outputs_coord_logits = None
        if self.config.two_stage:
            object_query_embedding, output_proposals, level_ids = self.gen_encoder_output_proposals(
                encoder_outputs[0], ~mask_flatten, spatial_shapes
            )

            # hack implementation for two-stage DETA:
            # apply a detection head to each pixel; linear projection for bounding box binary
            # classification (i.e. foreground and background)
            enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
            # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
            delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
            enc_outputs_coord_logits = delta_bbox + output_proposals

            # only keep top scoring `config.two_stage_num_proposals` proposals
            topk = self.two_stage_num_proposals
            proposal_logit = enc_outputs_class[..., 0]

            if self.assign_first_stage:
                proposal_boxes = center_to_corners_format(enc_outputs_coord_logits.sigmoid().float()).clamp(0, 1)
                topk_proposals = []
                for b in range(batch_size):
                    prop_boxes_b = proposal_boxes[b]
                    prop_logits_b = proposal_logit[b]

                    # pre-nms per-level topk
                    pre_nms_topk = 1000
                    pre_nms_inds = []
                    for lvl in range(len(spatial_shapes)):
                        lvl_mask = level_ids == lvl
                        pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1])
                    pre_nms_inds = torch.cat(pre_nms_inds)

                    # nms on topk indices
                    post_nms_inds = batched_nms(
                        prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9
                    )
                    keep_inds = pre_nms_inds[post_nms_inds]

                    if len(keep_inds) < self.two_stage_num_proposals:
                        print(
                            f"[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running"
                            " naive topk"
                        )
                        keep_inds = torch.topk(proposal_logit[b], topk)[1]

                    # keep top Q/L indices for L levels
                    q_per_l = topk // len(spatial_shapes)
                    is_level_ordered = (
                        level_ids[keep_inds][None]
                        == torch.arange(len(spatial_shapes), device=level_ids.device)[:, None]
                    )
                    keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l)  # LS
                    keep_inds_mask = keep_inds_mask.any(0)  # S

                    # pad to Q indices (might let ones filtered from pre-nms sneak by;
                    # unlikely because we pick high-confidence ones anyway)
                    if keep_inds_mask.sum() < topk:
                        num_to_add = topk - keep_inds_mask.sum()
                        pad_inds = (~keep_inds_mask).nonzero()[:num_to_add]
                        keep_inds_mask[pad_inds] = True

                    keep_inds_topk = keep_inds[keep_inds_mask]
                    topk_proposals.append(keep_inds_topk)
                topk_proposals = torch.stack(topk_proposals)
            else:
                topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]

            topk_coords_logits = torch.gather(
                enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
            )

            topk_coords_logits = topk_coords_logits.detach()
            reference_points = topk_coords_logits.sigmoid()
            init_reference_points = reference_points
            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
            query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
        else:
            query_embed, target = torch.split(query_embeds, num_channels, dim=1)
            query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
            target = target.unsqueeze(0).expand(batch_size, -1, -1)
            reference_points = self.reference_points(query_embed).sigmoid()
            init_reference_points = reference_points

        decoder_outputs = self.decoder(
            inputs_embeds=target,
            position_embeddings=query_embed,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=mask_flatten,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
            tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs

            return tuple_outputs

        return DetaModelOutput(
            init_reference_points=init_reference_points,
            last_hidden_state=decoder_outputs.last_hidden_state,
            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
            enc_outputs_class=enc_outputs_class,
            enc_outputs_coord_logits=enc_outputs_coord_logits,
        )


@add_start_docstrings(
    """
    DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
    such as COCO detection.
    """,
    DETA_START_DOCSTRING,
)
class DetaForObjectDetection(DetaPreTrainedModel):
    # When using clones of `bbox_embed`, these keys need to be ignored when loading/tying weights
    _tied_weights_keys = [r"bbox_embed\.\d+"]

    def __init__(self, config: DetaConfig):
        super().__init__(config)

        # DETA encoder-decoder model
        self.model = DetaModel(config)

        # Detection heads on top
        self.class_embed = nn.Linear(config.d_model, config.num_labels)
        self.bbox_embed = DetaMLPPredictionHead(
            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
        )

        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value
        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)

        # if two-stage, the last class_embed and bbox_embed are for region proposal generation
        num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
        if config.with_box_refine:
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
            # hack implementation for iterative bounding box refinement
            self.model.decoder.bbox_embed = self.bbox_embed
        else:
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
            self.model.decoder.bbox_embed = None
        if config.two_stage:
            # hack implementation for two-stage
            self.model.decoder.class_embed = self.class_embed
            for box_embed in self.bbox_embed:
                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)

        # Initialize weights and apply final processing
        self.post_init()
$zDetaForObjectDetection.__init__c                 C   s$   dd t |d d |d d D S )Nc                 S   s   g | ]\}}||d qS ))r<   r=   r.   )rA   ar  r.   r.   r/   rE   +  s     z8DetaForObjectDetection._set_aux_loss.<locals>.<listcomp>re   )zip)r^   outputs_classoutputs_coordr.   r.   r/   _set_aux_loss%  s    z$DetaForObjectDetection._set_aux_lossr{  N)r~   r   r}  r~  rE  r  labelsr   rB  rF  r   c           %         s  |
dk	r|
n| j j}
| j||||||||	|
d	}|
r:|jn|d }|
rL|jn|d }|
r^|jn|d }g }g }t|jd D ]}|dkr|}n|dd|d f }t|}| j	| |dd|f }| j
| |dd|f }|jd dkr|| }nB|jd dkr$|d	ddf  |7  < |}ntd
|jd  | }|| || q|tj|dd}tj|dd}|dddf }|dddf }d\}}|dk	rt| j j| j j| j jd}dddg}t|| j j| j j|| j jd}||j i }||d< ||d< | j jrR|
r|jn|d }| 	|}| 
| }| ||}||d< | j jrv|j } |j| d|d< |||d| j jd| j j d< | j jri }!t| j j!d D ]" |!" fdd# D  q"|! t$fdd% D }|
sN|dk	r$||f| | }"n||f| }"|dk	rF|f|" n|"}#|#S t&|||||j'|j(|j)|j*|j+|j,|j-|j|j|j|j|jd}$|$S )a  
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetaForObjectDetection
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large")
        >>> model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to COCO API
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
        ...     0
        ... ]
        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected cat with confidence 0.683 at location [345.85, 23.68, 639.86, 372.83]
        Detected cat with confidence 0.683 at location [8.8, 52.49, 316.93, 473.45]
        Detected remote with confidence 0.568 at location [40.02, 73.75, 175.96, 117.33]
        Detected remote with confidence 0.546 at location [333.68, 77.13, 370.12, 187.51]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # First, send images through the DETA base model to obtain encoder + decoder outputs
        outputs = self.model(
            pixel_values,
            pixel_mask=pixel_mask,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
        init_reference = outputs.init_reference_points if return_dict else outputs[0]
        inter_references = outputs.intermediate_reference_points if return_dict else outputs[3]

        # class logits + predicted bounding boxes
        outputs_classes = []
        outputs_coords = []

        for level in range(hidden_states.shape[1]):
            if level == 0:
                reference = init_reference
            else:
                reference = inter_references[:, level - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[level](hidden_states[:, level])
            delta_bbox = self.bbox_embed[level](hidden_states[:, level])
            if reference.shape[-1] == 4:
                outputs_coord_logits = delta_bbox + reference
            elif reference.shape[-1] == 2:
                delta_bbox[..., :2] += reference
                outputs_coord_logits = delta_bbox
            else:
                raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
            outputs_coord = outputs_coord_logits.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        logits = outputs_class[-1]
        pred_boxes = outputs_coord[-1]

        loss, loss_dict, auxiliary_outputs = None, None, None
        if labels is not None:
            # First: create the matcher
            matcher = DetaHungarianMatcher(
                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
            )
            # Second: create the criterion
            losses = ["labels", "boxes", "cardinality"]
            criterion = DetaLoss(
                matcher=matcher,
                num_classes=self.config.num_labels,
                focal_alpha=self.config.focal_alpha,
                losses=losses,
                num_queries=self.config.num_queries,
            )
            criterion.to(self.device)
            # Third: compute the losses, based on outputs and labels
            outputs_loss = {}
            outputs_loss["logits"] = logits
            outputs_loss["pred_boxes"] = pred_boxes
            if self.config.auxiliary_loss:
                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
            if self.config.two_stage:
                enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid()
                outputs_loss["enc_outputs"] = {"logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord}

            loss_dict = criterion(outputs_loss, labels)
            # Fourth: compute total loss, as a weighted sum of the various losses
            weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
            weight_dict["loss_giou"] = self.config.giou_loss_coefficient
            if self.config.auxiliary_loss:
                aux_weight_dict = {}
                for i in range(self.config.decoder_layers - 1):
                    aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
                weight_dict.update(aux_weight_dict)
            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)

        if not return_dict:
            if auxiliary_outputs is not None:
                output = (logits, pred_boxes) + tuple(auxiliary_outputs) + outputs
            else:
                output = (logits, pred_boxes) + outputs
            tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output

            return tuple_outputs

        dict_outputs = DetaObjectDetectionOutput(
            loss=loss,
            loss_dict=loss_dict,
            logits=logits,
            pred_boxes=pred_boxes,
            auxiliary_outputs=auxiliary_outputs,
            last_hidden_state=outputs.last_hidden_state,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            intermediate_hidden_states=outputs.intermediate_hidden_states,
            intermediate_reference_points=outputs.intermediate_reference_points,
            init_reference_points=outputs.init_reference_points,
            enc_outputs_class=outputs.enc_outputs_class,
            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
        )

        return dict_outputs


def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks

    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs (0 for the negative class and 1 for the positive
                 class).
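
    Example (a small illustrative check, not from the original tests):

    ```python
    >>> import torch
    >>> logits = torch.tensor([[100.0, -100.0]])  # confident prediction, sigmoids to ~[1, 0]
    >>> targets = torch.tensor([[1.0, 0.0]])
    >>> loss = dice_loss(logits, targets, num_boxes=1)
    >>> bool(loss < 1e-6)  # near-zero loss for a near-perfect match
    True
    ```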
    """
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes


def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`):
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        alpha (`float`, *optional*, defaults to `0.25`):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to `2`):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
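
    Example (illustrative numbers only):

    ```python
    >>> import torch
    >>> inputs = torch.tensor([[0.0]])  # sigmoid(0) = 0.5
    >>> targets = torch.tensor([[1.0]])
    >>> # ce = -log(0.5), modulating factor (1 - 0.5)**2 = 0.25, alpha_t = 0.25
    >>> loss = sigmoid_focal_loss(inputs, targets, num_boxes=1)
    >>> bool(torch.isclose(loss, torch.tensor(0.25 * 0.25 * 0.6931), atol=1e-4))
    True
    ```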
    noneZ	reductionr   r   )rR  r	   r   Z binary_cross_entropy_with_logitsr(  r   )
rL  r  r  r  r  ZprobZce_lossZp_tr:   Zalpha_tr.   r.   r/   sigmoid_focal_loss  s    r  c                       sb   e Zd ZdZd fdd	Zdd Ze dd Zd	d
 Z	dd Z
dd Zdd Zdd Z  ZS )r  a  
    This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute a
    Hungarian assignment between the ground-truth boxes and the outputs of the model, and 2) we supervise each pair
    of matched ground-truth / prediction (class and box).

    Args:
        matcher (`DetaHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        num_classes (`int`):
            Number of object categories, omitting the special no-object category.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`List[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    """

    def __init__(
        self,
        matcher,
        num_classes,
        focal_alpha,
        losses,
        num_queries,
        assign_first_stage=False,
        assign_second_stage=False,
    ):
        super().__init__()
        self.matcher = matcher
        self.num_classes = num_classes
        self.focal_alpha = focal_alpha
        self.losses = losses
        self.assign_first_stage = assign_first_stage
        self.assign_second_stage = assign_second_stage

        if self.assign_first_stage:
            self.stg1_assigner = DetaStage1Assigner()
        if self.assign_second_stage:
            self.stg2_assigner = DetaStage2Assigner(num_queries)

    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (binary focal loss). Targets dicts must contain the key "class_labels" containing a
        tensor of dim [nb_target_boxes].
        """
        if "logits" not in outputs:
            raise KeyError("No logits were found in the outputs")
        source_logits = outputs["logits"]

        idx = self._get_source_permutation_idx(indices)
        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
        target_classes = torch.full(
            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
        )
        target_classes[idx] = target_classes_o

        target_classes_onehot = torch.zeros(
            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
            dtype=source_logits.dtype,
            layout=source_logits.layout,
            device=source_logits.device,
        )
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)

        target_classes_onehot = target_classes_onehot[:, :, :-1]
        loss_ce = (
            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
            * source_logits.shape[1]
        )
        losses = {"loss_ce": loss_ce}

        return losses

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.

        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
        """
        logits = outputs["logits"]
        device = logits.device
        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
        losses = {"cardinality_error": card_err}
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """
        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.

        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
        are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        if "pred_boxes" not in outputs:
            raise KeyError("No predicted boxes found in outputs")
        idx = self._get_source_permutation_idx(indices)
        source_boxes = outputs["pred_boxes"][idx]
        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")

        losses = {}
        losses["loss_bbox"] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(
            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
        )
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses

    def _get_source_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
        source_idx = torch.cat([source for (source, _) in indices])
        return batch_idx, source_idx

    def _get_target_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
        target_idx = torch.cat([target for (_, target) in indices])
        return batch_idx, target_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes):
        loss_map = {
            "labels": self.loss_labels,
            "cardinality": self.loss_cardinality,
            "boxes": self.loss_boxes,
        }
        if loss not in loss_map:
            raise ValueError(f"Loss {loss} not supported")
        return loss_map[loss](outputs, targets, indices, num_boxes)

    def forward(self, outputs, targets):
        """
        This performs the loss computation.

        Args:
             outputs (`dict`, *optional*):
                Dictionary of tensors, see the output specification of the model for the format.
             targets (`List[dict]`, *optional*):
                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depend on the
                losses applied, see each loss' doc.
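
        Example of the expected `targets` structure (hypothetical values):

        ```python
        >>> import torch
        >>> targets = [
        ...     {
        ...         "class_labels": torch.tensor([3, 17]),  # two boxes in image 0
        ...         "boxes": torch.tensor([[0.5, 0.5, 0.2, 0.3], [0.1, 0.9, 0.05, 0.1]]),  # (cx, cy, w, h), normalized
        ...     }
        ... ]
        ```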
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}

        # Retrieve the matching between the outputs of the last layer and the targets
        if self.assign_second_stage:
            indices = self.stg2_assigner(outputs_without_aux, targets)
        else:
            indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["class_labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        num_boxes = torch.clamp(num_boxes, min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer
        if "auxiliary_outputs" in outputs:
            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
                if not self.assign_second_stage:
                    indices = self.matcher(auxiliary_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # In case of encoder (two-stage) outputs, compute a binary foreground/background loss
        if "enc_outputs" in outputs:
            enc_outputs = outputs["enc_outputs"]
            bin_targets = copy.deepcopy(targets)
            for bt in bin_targets:
                bt["class_labels"] = torch.zeros_like(bt["class_labels"])
            if self.assign_first_stage:
                indices = self.stg1_assigner(enc_outputs, bin_targets)
            else:
                indices = self.matcher(enc_outputs, bin_targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
                l_dict = {k + "_enc": v for k, v in l_dict.items()}
                losses.update(l_dict)

        return losses


class DetaMLPPredictionHead(nn.Module):
    """
r  c                       s(   e Zd ZdZ fddZdd Z  ZS )r  a  
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    c                    sJ   t    || _|g|d  }tdd t|g| ||g D | _d S )Nr   c                 s   s   | ]\}}t ||V  qd S r   )r	   r   )rA   r_   r  r.   r.   r/   r@    s     z1DetaMLPPredictionHead.__init__.<locals>.<genexpr>)rZ   r[   r  r	   rF   r  r5  )r^   r   r   r  r  hr`   r.   r/   r[     s    
zDetaMLPPredictionHead.__init__c                 C   s>   t | jD ].\}}|| jd k r0tj||n||}q
|S )Nr   )r   r5  r  r	   r   Zrelu)r^   rQ   rB   layerr.   r.   r/   ri     s    (zDetaMLPPredictionHead.forwardr   r.   r.   r`   r/   r    s   r  c                       s<   e Zd ZdZdeeed fddZe dd Z  Z	S )	r  a  
    This class computes an assignment between the targets and the predictions of the network.

    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).

    Args:
        class_cost:
            The relative weight of the classification error in the matching cost.
        bbox_cost:
            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
        giou_cost:
            The relative weight of the giou loss of the bounding box in the matching cost.
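
    Example (a minimal sketch with random tensors; only the shapes matter here, and `scipy` is required):

    ```python
    >>> import torch
    >>> matcher = DetaHungarianMatcher(class_cost=1, bbox_cost=5, giou_cost=2)
    >>> outputs = {"logits": torch.randn(1, 100, 91), "pred_boxes": torch.rand(1, 100, 4)}
    >>> targets = [{"class_labels": torch.tensor([5, 42]), "boxes": torch.rand(2, 4)}]
    >>> [(i.shape, j.shape) for i, j in matcher(outputs, targets)]
    [(torch.Size([2]), torch.Size([2]))]
    ```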
    """

    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
        super().__init__()
        requires_backends(self, ["scipy"])

        self.class_cost = class_cost
        self.bbox_cost = bbox_cost
        self.giou_cost = giou_cost
        if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
            raise ValueError("All costs of the Matcher can't be 0")

    @torch.no_grad()
    def forward(self, outputs, targets):
        """
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
            targets (`List[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth
                 objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        batch_size, num_queries = outputs["logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        target_ids = torch.cat([v["class_labels"] for v in targets])
        target_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost
        alpha = 0.25
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]

        # Compute the L1 cost between boxes
        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)

        # Compute the giou cost between boxes
        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))

        # Final cost matrix
        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]

        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]


def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
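
    Example (illustrative):

    ```python
    >>> import torch
    >>> boxes = torch.tensor([[0.0, 0.0, 2.0, 3.0], [1.0, 1.0, 2.0, 2.0]])
    >>> box_area(boxes)  # (2 * 3) and (1 * 1)
    tensor([6., 1.])
    ```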
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
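
    Example (illustrative): two 2x2 boxes overlapping on a 1x2 strip have IoU 1/3, and because
    their union already fills the enclosing hull, the GIoU equals the IoU:

    ```python
    >>> import torch
    >>> boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
    >>> boxes2 = torch.tensor([[1.0, 0.0, 3.0, 2.0]])
    >>> bool(torch.isclose(generalized_box_iou(boxes1, boxes2), torch.tensor(1.0 / 3.0)))
    True
    ```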
    """
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area


def nonzero_tuple(x):
    """
    An `as_tuple=True` version of `torch.nonzero` to support TorchScript, because of
    https://github.com/pytorch/pytorch/issues/38718
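
    Example (illustrative):

    ```python
    >>> import torch
    >>> rows, cols = nonzero_tuple(torch.tensor([[1, 0], [0, 2]]))
    >>> rows.tolist(), cols.tolist()
    ([0, 1], [0, 1])
    ```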
    """
    if torch.jit.is_scripting():
        if x.dim() == 0:
            return x.unsqueeze(0).nonzero().unbind(1)
        return x.nonzero().unbind(1)
    else:
        return x.nonzero(as_tuple=True)


class DetaMatcher(object):
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
    have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.

    The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
    prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
    intersection-over-union overlap values.

    The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
    matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
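
    Example (illustrative): with `thresholds=[0.3, 0.5]` and `labels=[0, -1, 1]`, an IoU of 0.2 is
    labeled 0 (negative), 0.4 is labeled -1 (ignore), and 0.7 is labeled 1 (positive):

    ```python
    >>> import torch
    >>> matcher = DetaMatcher(thresholds=[0.3, 0.5], labels=[0, -1, 1])
    >>> quality = torch.tensor([[0.2, 0.4, 0.7]])  # 1 ground-truth x 3 predictions
    >>> matches, match_labels = matcher(quality)
    >>> match_labels.tolist()
    [0, -1, 1]
    ```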
    """

    def __init__(self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False):
        """
        Args:
            thresholds (`list[float]`):
                A list of thresholds used to stratify predictions into levels.
            labels (`list[int]`):
                A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
                signifying {ignore, negative class, positive class}, respectively.
            allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
                If `True`, produce additional matches for predictions with maximum match quality lower than
                high_threshold. See `set_low_quality_matches_` for more details.

            For example, with thresholds = [0.3, 0.5] and labels = [0, -1, 1]:
                All predictions with iou < 0.3 will be marked with 0 and thus will be considered as false positives
                while training. All predictions with 0.3 <= iou < 0.5 will be marked with -1 and thus will be
                ignored. All predictions with 0.5 <= iou will be marked with 1 and thus will be considered as true
                positives.
        """
        # Add -inf and +inf to first and last position in thresholds
        thresholds = thresholds[:]
        if thresholds[0] < 0:
            raise ValueError("Thresholds should be positive")
        thresholds.insert(0, -float("inf"))
        thresholds.append(float("inf"))
        # Currently torchscript does not support all + generator
        if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])):
            raise ValueError("Thresholds should be sorted.")
        if not all(l in [-1, 0, 1] for l in labels):
            raise ValueError("All labels should be either -1, 0 or 1")
        if len(labels) != len(thresholds) - 1:
            raise ValueError("Number of labels should be equal to number of thresholds - 1")
        self.thresholds = thresholds
        self.labels = labels
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """
        Args:
            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
                pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
                (due to the use of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).

        Returns:
            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
                ground-truth index in [0, M)
            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
                whether a prediction is a true or false positive or ignored
        """
        assert match_quality_matrix.dim() == 2
        if match_quality_matrix.numel() == 0:
            default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
            # When no gt boxes exist, we define IOU = 0 and therefore set labels
            # to `self.labels[0]`, which usually defaults to background class 0
            default_match_labels = match_quality_matrix.new_full(
                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
            )
            return default_matches, default_match_labels

        assert torch.all(match_quality_matrix >= 0)

        # match_quality_matrix is M (gt) x N (predicted)
        # max over gt elements (dim 0) to find the best gt candidate for each prediction
        matched_vals, matches = match_quality_matrix.max(dim=0)

        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)

        for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
            low_high = (matched_vals >= low) & (matched_vals < high)
            match_labels[low_high] = l

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(match_labels, match_quality_matrix)

        return matches, match_labels

    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
        """
        Produce additional matches for predictions that have only low-quality matches. Specifically, for each
        ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each
        prediction in that set, if it is unmatched, then match it to the ground-truth G.

        This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`.
        """
        # For each gt, find the prediction with which it has the highest quality
        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
        # Find the highest quality match available, even if it is low, including ties.
        # Note that the match qualities must be positive due to the use of `torch.nonzero`.
        _, pred_inds_with_highest_quality = nonzero_tuple(
            match_quality_matrix == highest_quality_foreach_gt[:, None]
        )
        match_labels[pred_inds_with_highest_quality] = 1


def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
    """
    Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives &
    negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`,
    and then try to fill the remaining slots with negatives.

    Args:
        labels (Tensor): (N, ) label vector with values:
            * -1: ignore
            * bg_label: background ("negative") class
            * otherwise: one or more foreground ("positive") classes
        num_samples (int): The total number of labels with value >= 0 to return.
            Values that are not sampled will be filled with -1 (ignore).
        positive_fraction (float): The number of subsampled labels with values > 0
            is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is
            `min(num_negatives, num_samples - num_positives_sampled)`. In other words, if there are not enough
            positives, the sample is filled with negatives. If there are also not enough negatives, then as many
            elements are sampled as is possible.
        bg_label (int): label index of background ("negative") class.

    Returns:
        pos_idx, neg_idx (Tensor):
            1D vector of indices. The total length of both is `num_samples` or fewer.
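
    Example (illustrative): sample at most 4 labels, half of them positive, with background label 0:

    ```python
    >>> import torch
    >>> labels = torch.tensor([-1, 0, 0, 0, 1, 2, 1, 0])
    >>> pos_idx, neg_idx = subsample_labels(labels, num_samples=4, positive_fraction=0.5, bg_label=0)
    >>> len(pos_idx) <= 2 and bool((labels[pos_idx] > 0).all()) and bool((labels[neg_idx] == 0).all())
    True
    ```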
    """
    positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
    negative = nonzero_tuple(labels == bg_label)[0]

    num_pos = int(num_samples * positive_fraction)
    # protect against not enough positive examples
    num_pos = min(positive.numel(), num_pos)
    num_neg = num_samples - num_pos
    # protect against not enough negative examples
    num_neg = min(negative.numel(), num_neg)

    # randomly select positive and negative examples
    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]

    pos_idx = positive[perm1]
    neg_idx = negative[perm2]
    return pos_idx, neg_idx


def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
    if len(gt_inds) == 0:
        return pr_inds, gt_inds

    # find topk matches for each gt
    gt_inds2, counts = gt_inds.unique(return_counts=True)
    scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
    gt_inds2 = gt_inds2[:, None].repeat(1, k)

    # filter to as many matches as each gt actually has
    pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
    gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])

    return pr_inds3, gt_inds3


class DetaStage2Assigner(nn.Module):
    def __init__(self, num_queries, max_k=4):
        super().__init__()
        self.positive_fraction = 0.25
        self.bg_label = 400  # number > 91 to filter out later
        self.batch_size_per_image = num_queries
        self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
        self.k = max_k

    def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
        """
        Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification
        labels.

        Args:
            matched_idxs (Tensor): a vector of length N, each is the best-matched
                gt index in [0, M) for each proposal.
            matched_labels (Tensor): a vector of length N, the matcher's label
                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
            gt_classes (Tensor): a vector of length M.

        Returns:
            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
            Tensor: a vector of the same length, the classification label for each sampled proposal. Each sample is
                labeled as either a category in [0, num_classes) or the background (num_classes).
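
        Example (illustrative, tiny tensors):

        ```python
        >>> import torch
        >>> assigner = DetaStage2Assigner(num_queries=4)
        >>> matched_idxs = torch.tensor([0, 0, 1, 1])
        >>> matched_labels = torch.tensor([1, 0, 1, 0])  # two proposals match a gt, two are background
        >>> gt_classes = torch.tensor([7, 9])
        >>> sampled_idxs, sampled_classes = assigner._sample_proposals(matched_idxs, matched_labels, gt_classes)
        >>> sampled_classes.tolist().count(400)  # the background label fills the remaining budget
        2
        ```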
        """
        has_gt = gt_classes.numel() > 0
        # Get the corresponding GT for each proposal
        if has_gt:
            gt_classes = gt_classes[matched_idxs]
            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
            gt_classes[matched_labels == 0] = self.bg_label
            # Label ignored proposals (-1 label)
            gt_classes[matched_labels == -1] = -1
        else:
            gt_classes = torch.zeros_like(matched_idxs) + self.bg_label

        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
            gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
        )

        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
        return sampled_idxs, gt_classes[sampled_idxs]

    def forward(self, outputs, targets, return_cost_matrix=False):
        # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.

        bs = len(targets)
        indices = []
        ious = []
        for b in range(bs):
            iou, _ = box_iou(
                center_to_corners_format(targets[b]["boxes"]),
                center_to_corners_format(outputs["init_reference"][b].detach()),
            )
            matched_idxs, matched_labels = self.proposal_matcher(
                iou
            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 otherwise]
            (
                sampled_idxs,
                sampled_gt_classes,
            ) = self._sample_proposals(  # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
                matched_idxs, matched_labels, targets[b]["class_labels"]
            )
            pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
            pos_gt_inds = matched_idxs[pos_pr_inds]
            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
            indices.append((pos_pr_inds, pos_gt_inds))
            ious.append(iou)
        if return_cost_matrix:
            return indices, ious
        return indices

    def postprocess_indices(self, pr_inds, gt_inds, iou):
        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)


class DetaStage1Assigner(nn.Module):
    def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
        super().__init__()
        self.positive_fraction = 0.5
        self.batch_size_per_image = 256
        self.k = max_k
        self.t_low = t_low
        self.t_high = t_high
        self.anchor_matcher = DetaMatcher(
            thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
        )

    def _subsample_labels(self, label):
        """
        Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value
        (-1) for all elements that are not included in the sample.

        Args:
            labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
        """
        pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
        # Fill with the ignore label (-1), then set positive and negative labels
        label.fill_(-1)
        label.scatter_(0, pos_idx, 1)
        label.scatter_(0, neg_idx, 0)
        return label

    def forward(self, outputs, targets):
        bs = len(targets)
        indices = []
        for b in range(bs):
            anchors = outputs["anchors"][b]
            if len(targets[b]["boxes"]) == 0:
                indices.append(
                    (
                        torch.tensor([], dtype=torch.long, device=anchors.device),
                        torch.tensor([], dtype=torch.long, device=anchors.device),
                    )
                )
                continue
            iou, _ = box_iou(
                center_to_corners_format(targets[b]["boxes"]),
                center_to_corners_format(anchors),
            )
            matched_idxs, matched_labels = self.anchor_matcher(
                iou
            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 otherwise]
            matched_labels = self._subsample_labels(matched_labels)

            all_pr_inds = torch.arange(len(anchors), device=matched_labels.device)
            pos_pr_inds = all_pr_inds[matched_labels == 1]
            pos_gt_inds = matched_idxs[pos_pr_inds]
            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
            pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)
            indices.append((pos_pr_inds, pos_gt_inds))
        return indices

    def postprocess_indices(self, pr_inds, gt_inds, iou):
        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)