o
    rqi                     @   s
  d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlm  mZ d dlm  mZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dl m!Z! d dl"m#Z# G d	d
 d
ej$Z%G dd dej$Z&ej'e!j(ej)dG dd deZ*dS )    N)OrderedDict)AnyDictUnion)Models)MODELS
TorchModel)BasicResBlockCAMDenseTDNNBlock
DenseLayer	StatsPool	TDNNLayerTransitLayerget_nonlinear)Tasks)create_devicec                       s:   e Zd Zeddgddf fdd	Zdd Zdd	 Z  ZS )
FCM       P   c                    s   t t|   || _tjd|ddddd| _t|| _| j	|||d dd| _
| j	|||d dd| _tj||ddddd| _t|| _||d	  | _d S )
N      F)kernel_sizestridepaddingbiasr   r   )r   )r   r      )superr   __init__	in_planesnnZConv2dconv1ZBatchNorm2dbn1_make_layerlayer1layer2conv2bn2out_channels)selfblock
num_blocksZ
m_channelsfeat_dim	__class__ g/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/audio/sv/DTDNN.pyr      s,   zFCM.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )appendr   Z	expansionr    
Sequential)r)   r*   Zplanesr+   r   stridesZlayersr/   r/   r0   r#   4   s   
zFCM._make_layerc                 C   sv   | d}t| | |}| |}| |}t| | |}|j	}|
|d |d |d  |d }|S )Nr   r   r   r   )	unsqueezeFZrelur"   r!   r$   r%   r'   r&   shapeZreshape)r)   xoutr6   r/   r/   r0   forward<   s   


"zFCM.forward)__name__
__module____qualname__r	   r   r#   r9   __classcell__r/   r/   r-   r0   r      s    r   c                       s6   e Zd Z								d fd	d
	Zdd Z  ZS )CAMPPlusr      r         batchnorm-reluTsegmentc	                    s  t t|   t|d| _| jj}	|| _tt	dt
|	|dddd|dfg| _|}	ttdd	d
D ]=\}
\}}}t||	||| ||||d}| jd|
d  | |	||  }	| jd|
d  t|	|	d d|d |	d }	q2| jdt||	 | jdkr| jdt  | jdt|	d |dd n	| jdksJ d|  D ]}t|tjtjfrtj|jj |jd urtj|j qd S )N)r,   Ztdnn   r   r   )r   dilationr   
config_str)         )r   r   r   )r   r   r   )
num_layersZin_channelsr(   Zbn_channelsr   rF   rG   memory_efficientzblock%dz	transit%dF)r   rG   Zout_nonlinearrC   statsZdenseZ
batchnorm_)rG   framez6`output_level` should be set to 'segment' or 'frame'. )r   r>   r   r   headr(   output_levelr    r2   r   r   xvector	enumeratezipr
   Z
add_moduler   r   r   r   modules
isinstanceZConv1dZLinearinitZkaiming_normal_weightdatar   Zzeros_)r)   r,   Zembedding_sizeZgrowth_rateZbn_sizeZinit_channelsrG   rL   rP   ZchannelsirK   r   rF   r*   mr-   r/   r0   r   J   s|   	
	





zCAMPPlus.__init__c                 C   s<   | ddd}| |}| |}| jdkr|dd}|S )Nr   r   r   rN   )ZpermuterO   rQ   rP   Z	transpose)r)   r7   r/   r/   r0   r9      s   


zCAMPPlus.forward)r   r?   r   r@   rA   rB   TrC   )r:   r;   r<   r   r9   r=   r/   r/   r-   r0   r>   H   s    Br>   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )SpeakerVerificationCAMPPlusa
  A fast and efficient speaker embedding model, using a 2-dimensional convolution residual network as the head
    and a densely connected time delay neural network as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _|| _| jd | _| jd | _t| jd | _t| j| j| _	|d }| 
| | j	| j | j	  d S )NZ	fbank_dimemb_sizedeviceZpretrained_model)r   r   r]   Zother_configfeature_dimr^   r   r_   r>   embedding_model._SpeakerVerificationCAMPPlus__load_check_pointtoeval)r)   	model_dirr]   argskwargspretrained_model_namer-   r/   r0   r      s   
z$SpeakerVerificationCAMPPlus.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r   zFmodelscope error: the shape of input audio to model needs to be [N, T])rU   npZndarraytorchZ
from_numpylenr6   r4   -_SpeakerVerificationCAMPPlus__extract_featurera   rc   r_   detachcpu)r)   audiofeatureZ	embeddingr/   r/   r0   r9      s   


z#SpeakerVerificationCAMPPlus.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )Znum_mel_binsT)dimZkeepdim)KaldiZfbankr4   r`   meanr1   rj   cat)r)   ro   featuresaurp   r/   r/   r0   Z__extract_feature   s   
z-SpeakerVerificationCAMPPlus.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )Nrn   )Zmap_locationT)strict)	ra   Zload_state_dictrj   loadospathjoinre   r_   )r)   rh   r/   r/   r0   Z__load_check_point   s   
z.SpeakerVerificationCAMPPlus.__load_check_point)r:   r;   r<   __doc__r   strr   r   r9   rl   rb   r=   r/   r/   r-   r0   r\      s    
r\   )+ry   collectionsr   typingr   r   r   numpyri   rj   Ztorch.nnr    Ztorch.nn.functionalZ
functionalr5   Ztorchaudio.compliance.kaldiZ
complianceZkaldirr   Zmodelscope.metainfor   Zmodelscope.modelsr   r   Z'modelscope.models.audio.sv.DTDNN_layersr	   r
   r   r   r   r   r   Zmodelscope.utils.constantr   Zmodelscope.utils.devicer   Moduler   r>   Zregister_moduleZspeaker_verificationZcampplus_svr\   r/   r/   r/   r0   <module>   s&   $0M