o
    rqi/  ã                   @   sH  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
„ d
ej#ƒZ$ddd„Z%ddd„Z&G dd„ dej'ƒZ(G dd„ dej'ƒZ)G dd„ dej'ƒZ*ej+e j,ej-dG dd„ deƒƒZ.dS )aŸ   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
    fusion (LFF) fuses the features within one single residual block to extract the local signal.
    The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
é    N)ÚAnyÚDictÚUnion)ÚModels)ÚMODELSÚ
TorchModel)ÚAFF)ÚTasks)Úcreate_devicec                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr   Ú__init__)ÚselfÚinplace©Ú	__class__© új/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2Net.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r   Ú__name__)r   Zinplace_strr   r   r   Ú__repr__   s   
ÿÿzReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r   r   r      s    r   é   c                 C   ó   t j| |d|dddS )z1x1 convolution without paddingr   r   F©Úkernel_sizeÚstrideÚpaddingÚbias©ÚnnÚConv2d©Ú	in_planesZ
out_planesr    r   r   r   Úconv1x1$   ó   úr(   c                 C   r   )z3x3 convolution with paddingé   r   Fr   r#   r&   r   r   r   Úconv3x3/   r)   r+   c                       ó*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	ÚBasicBlockERes2Neté   r   é    c           
   	      s6  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }t| jƒD ]}	| t||ƒ¡ | t 	|¡¡ q/t |¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dksx|| j| krt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nç      P@T©r   r   F©r   r    r"   )r   r-   r   ÚintÚmathÚfloorr(   Úconv1r$   ÚBatchNorm2dÚbn1ÚnumsÚrangeÚappendr+   Ú
ModuleListÚconvsÚbnsr   ÚreluÚ	expansionÚconv3Úbn3Ú
SequentialÚshortcutr%   r    ÚwidthÚscale)
r   r'   Úplanesr    Ú	baseWidthrF   rE   r=   r>   Úir   r   r   r   =   s<   
ûú
zBasicBlockERes2Net.__init__c                 C   sÔ   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S ©Nr   r   )r6   r8   r?   ÚtorchÚsplitrE   r:   r9   r=   r>   ÚcatrA   rB   rD   ©r   ÚxZresidualÚoutZspxrI   Úspr   r   r   Úforward\   s(   







zBasicBlockERes2Net.forward©r   r/   r.   ©r   r   r   r@   r   rR   r   r   r   r   r   r-   :   s    r-   c                       r,   )	ÚBasicBlockERes2Net_AFFr.   r   r/   c              	      sj  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }g }	t| jƒD ]}
| t||ƒ¡ |	 t 	|¡¡ q1t| jd ƒD ]
}| t|d¡ qKt |¡| _t |	¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dks’|| j| krªt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nr0   r   ©ÚchannelsTr1   Fr2   )r   rU   r   r3   r4   r5   r(   r6   r$   r7   r8   r9   r:   r;   r+   r   r<   r=   r>   Úfuse_modelsr   r?   r@   rA   rB   rC   rD   r%   r    rE   rF   )r   r'   rG   r    rH   rF   rE   r=   rX   r>   rI   Újr   r   r   r   |   sD   
ûú
zBasicBlockERes2Net_AFF.__init__c                 C   sà   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]7}|dkr)|| }n| j|d  ||| ƒ}| j	| |ƒ}|  | j
| |ƒ¡}|dkrM|}qt ||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rJ   )r6   r8   r?   rK   rL   rE   r:   r9   rX   r=   r>   rM   rA   rB   rD   rN   r   r   r   rR       s(   







zBasicBlockERes2Net_AFF.forwardrS   rT   r   r   r   r   rU   y   s    $rU   c                       sB   e Zd Zeeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚERes2Net)r*   é   é   r*   r/   éP   éÀ   ZTSTPFc	           	         sì  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d |d	 ddddd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t|d	 d| _t|d d| _t|d d| _|dks¶|dkr¸dnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _ | jrêt	j!|dd| _"t	 ||¡| _#d S t	 $¡ | _"t	 $¡ | _#d S )Né   r   r*   Fr   r   )r    r.   r[   )r   r!   r    r"   é   rV   ÚTAPZTSDP)Zin_dim)Zaffine)%r   rZ   r   r'   Úfeat_dimÚ	embed_dimr3   Z	stats_dimÚtwo_emb_layerr$   r%   r6   r7   r8   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Úlayer1_downsampleÚlayer2_downsampleÚlayer3_downsampler   Úfuse_mode12Úfuse_mode123Úfuse_mode1234Zn_statsÚgetattrÚpooling_layersr@   ÚpoolZLinearÚseg_1ZBatchNorm1dÚseg_bn_1Úseg_2ZIdentity)	r   ÚblockZ
block_fuseÚ
num_blocksÚ
m_channelsrb   rc   Zpooling_funcrd   r   r   r   r   À   sz   	ÿÿÿÿÿúúú	
ÿÿ
zERes2Net.__init__c                 C   sL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S )Nr   )r;   r'   r@   r$   rC   )r   rv   rG   rw   r    ÚstridesZlayersr   r   r   re     s   
zERes2Net._make_layerc                 C   sØ   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	||¡}|  
|¡}|  |¡}|  ||¡}	|  |¡}
|  |	¡}|  |
|¡}|  |¡}|  |¡}| jrjt |¡}|  |¡}|  |¡}|S |S )Nr   r.   r   )ZpermuteZ
unsqueeze_ÚFr?   r8   r6   rf   rg   rj   rm   rh   rk   rn   ri   rl   ro   rr   rs   rd   rt   ru   )r   rO   rP   Zout1Zout2Zout1_downsampleZ
fuse_out12Zout3Zfuse_out12_downsampleZfuse_out123Zout4Zfuse_out123_downsampleZfuse_out1234ÚstatsZembed_aZembed_br   r   r   rR     s*   












zERes2Net.forward)	r   r   r   r-   rU   r   re   rR   r   r   r   r   r   rZ   ¾   s    øDrZ   )Úmodule_namec                       sH   e Zd ZdZdeeef f‡ fdd„Zdd„ Zdd„ Z	dd
d„Z
‡  ZS )ÚSpeakerVerificationERes2Neta  Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthen the local information
    interaction. GFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    Úmodel_configc                    s–   t ƒ j||g|¢R i |¤Ž || _| jd | _| jd | _|| _d| _t| jd ƒ| _t	| j| jd| _
|d }|  |¡ | j
 | j¡ | j
 ¡  d S )Nrc   rW   r]   Údevice)rc   rx   Zpretrained_model)r   r   r~   rc   rx   Zother_configÚfeature_dimr
   r   rZ   Úembedding_modelÚ._SpeakerVerificationERes2Net__load_check_pointÚtoÚeval)r   Ú	model_dirr~   ÚargsÚkwargsÚpretrained_model_namer   r   r   r   6  s   ÿ
z$SpeakerVerificationERes2Net.__init__c                 C   sl   t |tjƒrt |¡}t|jƒdkr| d¡}t|jƒdks"J dƒ‚|  |¡}|  	| 
| j¡¡}| ¡  ¡ S )Nr   r   r.   zFmodelscope error: the shape of input audio to model needs to be [N, T])Ú
isinstanceÚnpZndarrayrK   Z
from_numpyÚlenÚshapeÚ	unsqueezeÚ-_SpeakerVerificationERes2Net__extract_featurer   rƒ   r   ÚdetachÚcpu)r   ÚaudioÚfeatureZ	embeddingr   r   r   rR   I  s   

ÿþþ
z#SpeakerVerificationERes2Net.forwardc                 C   s0   t j|| jd}||jddd }| d¡}|S )N)Znum_mel_binsr   T)ÚdimZkeepdim)ÚKaldiZfbankr€   Úmeanr   )r   r‘   r’   r   r   r   Z__extract_featureW  s   
z-SpeakerVerificationERes2Net.__extract_featureNc                 C   s8   |st  d¡}| jjt jtj | j|¡|ddd d S )Nr   )Zmap_locationT)Ústrict)	rK   r   r   Zload_state_dictÚloadÚosÚpathÚjoinr…   )r   rˆ   r   r   r   r   Z__load_check_point]  s   
þ
üz.SpeakerVerificationERes2Net.__load_check_point)N)r   r   r   Ú__doc__r   Ústrr   r   rR   rŽ   r‚   r   r   r   r   r   r}   +  s    r}   )r   )/r›   r4   r˜   Útypingr   r   r   ÚnumpyrŠ   rK   Ztorch.nnr$   Ztorch.nn.functionalZ
functionalrz   Ztorchaudio.compliance.kaldiZ
complianceZkaldir”   Z)modelscope.models.audio.sv.pooling_layersÚmodelsr‘   Úsvrq   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z!modelscope.models.audio.sv.fusionr   Zmodelscope.utils.constantr	   Zmodelscope.utils.devicer
   ZHardtanhr   r(   r+   ÚModuler-   rU   rZ   Zregister_moduleZspeaker_verificationZeres2net_svr}   r   r   r   r   Ú<module>   s2   

?Emÿ