o
    W+ iA                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ d&ddZd	ed
ededefddZG dd de	jZG dd de	jZG dd de	jZG dd dej	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#G dd  d e	jZ$G d!d" d"e	jZ%ej&ej'ej(d#G d$d% d%eZ)dS )'z This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    RDINOHead implementation is adapted from DINO framework.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r
   )lenshapemaxlongitemtorchZaranger
   r   expand	unsqueezeZ	as_tensor)lengthmax_lenr   r
   mask r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/audio/sv/rdino.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr	      )mathceil)r   r   r   r   Zn_stepsZL_outpaddingr   r   r   get_padding_elem&   s   r"   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr	   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r!   groupsbias)
super__init__r   r   r   r!   padding_modennr#   conv)
selfout_channelsr   in_channelsr   r   r!   r&   r'   r*   	__class__r   r   r)   5   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr$   Zcausalr	   r   Zvalidz1Padding must be 'same', 'valid' or 'causal'. Got )	r!   _manage_paddingr   r   r   Fpad
ValueErrorr,   )r-   xZnum_padZwxr   r   r   forwardS   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r"   r3   r4   r*   )r-   r6   r   r   r   r   r!   r   r   r   r2   h   s   
zConv1d._manage_padding)r	   r	   r$   r	   Tr%   )__name__
__module____qualname__r)   r7   intr2   __classcell__r   r   r0   r   r#   3   s     r#   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r(   r)   r+   r?   norm)r-   
input_sizerB   rC   r0   r   r   r)   x   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rD   r-   r6   r   r   r   r7      s   
zBatchNorm1d.forward)r@   rA   r:   r;   r<   r)   r7   r>   r   r   r0   r   r?   v   s
    r?   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr	   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r/   r.   r   r   r&   rE   )r(   rI   r)   r#   r,   
activationr?   rD   )r-   r/   r.   r   r   rK   r&   r0   r   r   r)      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rF   )rD   rK   r,   rG   r   r   r   r7      s   zTDNNBlock.forward)r:   r;   r<   r+   ReLUr)   r7   r>   r   r   r0   r   rI      s
    rI   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r	   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rI   ).0ir   Zhidden_channelZ
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r	   )r(   rM   r)   r+   
ModuleListrangeblocksscale)r-   r/   r.   rW   r   r   r0   rR   r   r)      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr	   dimr   )	enumerater   chunkrW   rV   appendcat)r-   r6   yrQ   Zx_iZy_ir   r   r   r7      s   zRes2NetBlock.forward)rN   rO   r	   rH   r   r   r0   r   rM      s    rM   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr	   r/   r.   r   T)Zinplace)r(   r_   r)   r#   conv1r   r+   rL   reluconv2ZSigmoidsigmoid)r-   r/   se_channelsr.   r0   r   r   r)      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr8   r   r
   r	   r   TrY   Zkeepdim)
r   r   r
   r   summeanrb   ra   rd   rc   )r-   r6   lengthsLr   totalsr   r   r   r7      s   

zSEBlock.forwardrF   rH   r   r   r0   r   r_      s    
r_   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rO   r	   r`   )r(   r)   rB   global_contextrI   tdnnr+   ZTanhtanhr#   r,   )r-   channelsattention_channelsrp   r0   r   r   r)      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )Nr8   r   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr   )rh   r   sqrtr   powclamp)r6   mrY   rB   ri   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r
   rf   r	   Trg   rX   z-inf)r   rB   r   Zonesr
   r   r   rp   rh   floatrepeatr]   r,   rr   rq   Zmasked_fillr3   Zsoftmax)r-   r6   rj   rk   rz   r   rl   ri   ry   ZattnZpooled_statsr   r   r   r7      s(   


z"AttentiveStatisticsPooling.forward)ro   TrF   rH   r   r   r0   r   rn      s    rn   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrN   ro   r	   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr	   )r   r   rK   r&   r`   )r(   r)   r.   rI   tdnn1rM   res2net_blocktdnn2r_   se_blockshortcutr#   )	r-   r/   r.   res2net_scalere   r   r   rK   r&   r0   r   r   r)   %  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rF   )r   r~   r   r   r   )r-   r6   rj   Zresidualr   r   r   r7   N  s   



zSERes2NetBlock.forwardrF   )	r:   r;   r<   r   r+   rL   r)   r7   r>   r   r   r0   r   r}   #  s    )r}   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )r   r   r   r   i   )   rO   rO   rO   r	   )r	   r   rO      r	   ro   rN   T)r	   r	   r	   r	   r	   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r	   )r   re   r   r   rK   r&   r8   )r&   )rt   rp   r   rJ   r`   )r(   r)   r   rs   r+   rT   rV   r\   rI   rU   r}   mfarn   aspr?   asp_bnr#   fc)r-   rE   r
   Zlin_neuronsrK   rs   Zkernel_sizesZ	dilationsrt   r   re   rp   r&   rQ   r0   r   r   r)   a  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r	   r   )rj   NrX   )Z	transposerV   	TypeErrorr\   r   r]   r   r   r   r   Zsqueeze)r-   r6   rj   Zxllayerr   r   r   r7     s    



zECAPA_TDNN.forwardrF   )
r:   r;   r<   __doc__r   r+   rL   r)   r7   r>   r   r   r0   r   r   [  s    Hr   c                       s:   e Zd Z						d fdd	Zd	d
 Zdd Z  ZS )	RDINOHeadFTrO             c	                    s  t    t|d}|dkrt||| _nJt||g}	|r'|	t| |	t  t	|d D ]}
|	t|| |rI|	t| |	t  q4|	t|| tj
|	 | _t||| _| | j tjtj||dd| _| jjjd |rd| jj_d S d S )Nr	   r   F)r'   )r(   r)   r   r+   Linearmlpr\   r?   ZGELUrU   Z
Sequential	add_layerapply_init_weightsutilsZweight_norm
last_layerZweight_gdataZfill_Zrequires_grad)r-   Zin_dimZout_dimZuse_bnZnorm_last_layerZnlayersZ
hidden_dimZbottleneck_dimZadd_dimZlayers_r0   r   r   r)     s0   
	
zRDINOHead.__init__c                 C   sV   t |tjr%tjjj|jdd t |tjr'|jd ur)tj|jd d S d S d S d S )Ng{Gz?)ry   r   )	
isinstancer+   r   r   initZtrunc_normal_weightr'   Z	constant_)r-   rx   r   r   r   r     s   zRDINOHead._init_weightsc                 C   s8   |  |}| |}tjj|ddd}| |}||fS )Nr8   r   )rY   p)r   r   r+   
functional	normalizer   )r-   r6   Zvicr_outr   r   r   r7     s
   


zRDINOHead.forward)FTrO   r   r   r   )r:   r;   r<   r)   r   r7   r>   r   r   r0   r   r     s    "r   c                       s$   e Zd Z fddZdd Z  ZS )Combinec                    s   t t|   || _|| _d S rF   )r(   r   r)   backbonehead)r-   r   r   r0   r   r   r)     s   
zCombine.__init__c                 C   s   |  |}| |}|S rF   )r   r   )r-   r6   outputr   r   r   r7     s   

zCombine.forwardrH   r   r   r0   r   r     s    r   )module_namec                       sD   e Zd Zdeeef f fddZdd Zdd Zdd	d
Z	  Z
S )SpeakerVerification_RDINOmodel_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| j|d| _t| jt	ddd	| _|d
 }| 
| | j  d S )NZchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   )rs   r   i   TZpretrained_model)r(   r)   r   Zother_configr5   feature_dimr   embedding_modelr   r   ,_SpeakerVerification_RDINO__load_check_pointeval)r-   	model_dirr   argskwargsZchannels_configpretrained_model_namer0   r   r   r)     s$   

z"SpeakerVerification_RDINO.__init__c                 C   s>   t |jdkr|jd dksJ d| |}| j|}|S )Nr   r   r	   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   +_SpeakerVerification_RDINO__extract_featurer   r   )r-   audiofeatureZ	embeddingr   r   r   r7   $  s   
z!SpeakerVerification_RDINO.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)Znum_mel_binsr   Trg   )KaldiZfbankr   ri   r   )r-   r   r   r   r   r   Z__extract_feature-  s   
z+SpeakerVerification_RDINO.__extract_featureNc                 C   sR   |st d}t jtj| j||d}dd |d  D }| jj	|dd d S )Nr   )Zmap_locationc                 S   s   i | ]\}}| d d|qS )zmodule. )replace)rP   kvr   r   r   
<dictcomp>9  s    z@SpeakerVerification_RDINO.__load_check_point.<locals>.<dictcomp>ZteacherT)strict)
r   r
   loadospathjoinr   itemsr   Zload_state_dict)r-   r   r
   Z
state_dictZstate_dict_tear   r   r   Z__load_check_point3  s   

z,SpeakerVerification_RDINO.__load_check_pointrF   )r:   r;   r<   r   strr   r)   r7   r   r   r>   r   r   r0   r   r   	  s
    	r   )NNN)*r   r   r   typingr   r   r   r   Ztorch.nnr+   Ztorch.nn.functionalr   r3   Ztorchaudio.compliance.kaldiZ
complianceZkaldir   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Zmodelscope.utils.constantr   r   r=   r"   Moduler#   r?   rI   rM   r_   rn   r}   r   r   r   Zregister_moduleZspeaker_verificationZrdino_tdnn_svr   r   r   r   r   <module>   s6   
C'=8o2