o
    rqiÊU  ã                   @   sF  d dl Z d dlZd dlmZmZmZ d dlZd dlZ	d dl
m  mZ d dlZd dlm  mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- dZ.e)ƒ Z/e"j0ej1dG dd„ de ƒƒZ2G dd„ de	j3ƒZ4dS )é    N)ÚDictÚOptionalÚUnion)Úautocast)ÚDataset)Útqdm)ÚTrainers)ÚModelÚ
TorchModel)Ú	MsDataset)ÚBaseTrainer)ÚTRAINERS)ÚDEFAULT_MODEL_REVISIONÚ	ModelFile)Úcreate_device)Ú
get_logger)Úget_dist_infoÚget_local_rankÚ	init_distúsi-snr)Úmodule_namec                   @   s   e Zd ZdZdddefdededee deeee	f  deeee	f  dee fd	d
„Z
dejjfdd„Zdd„ Zdedeeef fdd„ZdS )ÚSeparationTraineraa  A trainer is used for speech separation.

    Args:
        model: id or local path of the model
        work_dir: local path to store all training outputs
        cfg_file: config file of the model
        train_dataset: dataset for training
        eval_dataset: dataset for evaluation
        model_revision: the git version of model on modelhub
    NÚmodelÚwork_dirÚcfg_fileÚtrain_datasetÚeval_datasetÚmodel_revisionc              	   K   sj  t |tƒr|  ||¡| _|d u rtj | jtj¡}n|d us"J dƒ‚tj 	|¡| _t
 | |¡ |  ¡ | _|| _| dd ¡d urEt|d ƒ tƒ \}}	|	dk| _| dd¡}
| jr`tƒ }d|› }
t|
ƒ| _d|vr{t| jjdƒstJ dƒ‚| jjj| _n|d | _|| _|| _tj | jd	¡}| j| jjj| jjjj| jjjj | jjjj!| jjj"j#| jjj"j$| jjj"j%d
œ}ddl&m'} t(|ƒ}|||d| _)W d   ƒ n1 sÑw   Y  t*j+| j||d dddddddœ}| jj,dkrû| jj,› d| jj-› |d< t*j.j/ 0| j¡| _1| j1| j)d< | j)d  2d| j1i¡ | j 3¡ }| j)d  2|¡ t4|| j)d | j)|| j)d d| _5d S )Nz?Config file should not be None if model is not from pretrained!Úlauncheré   ÚdeviceZgpuzcuda:Ú
max_epochsz1max_epochs is missing from the configuration filezhparams.yaml)Úoutput_folderÚseedÚlrÚweight_decayÚclip_grad_normÚfactorÚpatienceÚdont_halve_until_epochr   )Úload_hyperpyyaml)Ú	overrides)Zexperiment_directoryZhyperparams_to_saver+   FÚcpuZnccl)Údebugr    Zdata_parallel_backendZdistributed_launchZdistributed_backendZfind_unused_parametersÚcudaú:Úepoch_counterÚcheckpointerÚcounterÚ	optimizer)ÚmodulesZ	opt_classÚhparamsÚrun_optsr1   )6Ú
isinstanceÚstrZget_or_download_model_dirÚ	model_dirÚosÚpathÚjoinr   ZCONFIGURATIONÚdirnamer   Ú__init__Úbuild_modelr   r   Úgetr   r   Ú_distr   r   r    ÚhasattrÚcfgÚtrainr!   Z_max_epochsr   r   r#   r3   r$   r%   r&   Úlr_schedulerr'   r(   r)   Zhyperpyyamlr*   Úopenr5   ÚsbZcreate_experiment_directoryÚtypeÚindexÚutilsZ
epoch_loopZEpochCounterr0   Zadd_recoverablesÚas_dictÚ
SeparationÚ	separator)Úselfr   r   r   r   r   r   ÚkwargsÚ_Z
world_sizeZdevice_nameZ
local_rankZhparams_filer+   r*   Zfinr6   r4   © rQ   ús/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/trainers/audio/separation_trainer.pyr>   .   s–   
	ÿ
ÿ€




ÿþ






ð
ÿýú
ÿ
ûzSeparationTrainer.__init__Úreturnc                 C   sD   t j| j| jdd}t|tƒrt|dƒr|jS t|tj	j
ƒr |S dS )z1 Instantiate a pytorch model and return.
        T)Zcfg_dictZtrainingr   N)r	   Zfrom_pretrainedr9   rC   r7   r
   rB   r   ÚtorchÚnnÚModule)rN   r   rQ   rQ   rR   r?   ‘   s   
ÿÿzSeparationTrainer.build_modelc                 O   s,   | j j| j| j| j| jd | jd d d S )NÚdataloader_opts)Ztrain_loader_kwargsZvalid_loader_kwargs)rM   Úfitr0   r   r   r5   )rN   ÚargsrO   rQ   rQ   rR   rD   ›   s   
ûzSeparationTrainer.trainÚcheckpoint_pathc                 O   sB   |r|| j j_n| jj| jd | jj| j| j d t	d}t	|iS )N)r    rW   )Ztest_loader_kwargsZmin_key)
r5   r1   Zcheckpoints_dirr   Zload_check_pointr    rM   Úevaluater   ÚEVAL_KEY)rN   rZ   rY   rO   ÚvaluerQ   rQ   rR   r[   ¤   s   ýzSeparationTrainer.evaluate)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r8   r   r   r   r   r>   rT   rU   rV   r?   rD   r   Úfloatr[   rQ   rQ   rQ   rR   r   !   s0    úÿþýüû
úc
	
ÿr   c                   @   sb   e Zd ZdZddd„Zdd„ Zdd„ Zd	d
„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ ZdS )rL   z:A subclass of speechbrain.Brain implements training steps.Nc                    s–  |\}}|  ˆ j¡|  ˆ j¡}}tj‡fdd„tˆ jjƒD ƒdd  ˆ j¡‰|tjj	krmt 
¡ 4 ˆ jjs:ˆ jjrGˆ  ˆ|¡\}‰ˆ d¡}ˆ jjrRˆ j ||¡}ˆ jjr^ˆ  |ˆ¡\}‰W d  ƒ n1 shw   Y  ˆ jd |ƒ}ˆ jd |ƒ}t |gˆ jj ¡}|| ‰tj‡ ‡fdd„tˆ jjƒD ƒdd}| d	¡}	| d	¡}
|	|
krºt |d
d
d
|	|
 f¡}|ˆfS |dd…d|	…dd…f }|ˆfS )z?Forward computations from the mixture to the separated signals.c                    s   g | ]}ˆ | d    d¡‘qS )r   éÿÿÿÿ)Ú	unsqueeze©Ú.0Úi)ÚtargetsrQ   rR   Ú
<listcomp>½   s    ÿÿz.Separation.compute_forward.<locals>.<listcomp>rc   ©ÚdimNÚencoderZmasknetc                    s$   g | ]}ˆ j d  ˆ| ƒ d¡‘qS )Údecoderrc   )r4   rd   re   )rN   Úsep_hrQ   rR   ri   Ú   s    ÿÿr   r   )Útor    rT   ÚcatÚranger5   Únum_spksrG   ÚStageÚTRAINÚno_gradÚuse_speedperturbÚuse_rand_shiftÚadd_speed_perturbÚsumZuse_wavedropZwavedropZlimit_training_signal_lenÚcut_signalsr4   ÚstackÚsizeÚFÚpad)rN   Úmixrh   ÚstageÚnoiseZmix_lensZmix_wZest_maskZ
est_sourceZT_originZT_estrQ   )rN   rn   rh   rR   Úcompute_forward´   sL   

þûú	

€ö
þû

þzSeparation.compute_forwardc                 C   s   | j  ||¡S )zComputes the sinr loss)r5   Úloss)rN   Úpredictionsrh   rQ   rQ   rR   Úcompute_objectivesê   s   zSeparation.compute_objectivesc                 C   sf  |j }|j|jg}| jjdkr| |j¡ | jr´tƒ ; |  	||t
jj¡\}}|  ||¡}| jjrK| jj}|||k }| ¡ dkrF| ¡ }n	tdƒ n| ¡ }W d  ƒ n1 sYw   Y  || jjk r™| ¡ dkr™| j |¡ ¡  | jjdkrŒ| j | j¡ tjj | j ¡ | jj¡ | j  | j¡ | j !¡  n|  j"d7  _"t# $d %| j"¡¡ t &d¡ '| j(¡|_)nt|  	||t
jj¡\}}|  ||¡}| jjrÞ| jj}|||k }| ¡ dkrÝ| ¡ }n| ¡ }|| jjk r| ¡ dkr| ¡  | jjdkrtjj | j ¡ | jj¡ | j  ¡  n|  j"d7  _"t# $d %| j"¡¡ t &d¡ '| j(¡|_)| j *¡  | +¡  ,¡ S )zTrains one batché   r   zloss has zero elements!!Nr   zNinfinite loss or empty loss! it happened {} times so far - skipping this batch)-Úmix_sigÚs1_sigÚs2_sigr5   rr   ÚappendÚs3_sigZauto_mix_precr   r‚   rG   rs   rt   r…   Zthreshold_bylossÚ	thresholdZnelementÚmeanÚprintZloss_upper_limZscalerÚscaleZbackwardr&   Zunscale_r3   rT   rU   rJ   Zclip_grad_norm_r4   Ú
parametersÚstepÚupdateZnonfinite_countÚloggerÚinfoÚformatZtensorro   r    ÚdataZ	zero_gradÚdetachr,   )rN   ÚbatchÚmixturerh   r„   rƒ   ÚthZloss_to_keeprQ   rQ   rR   Ú	fit_batchï   sz   
ÿ

€óþþ
ÿ€ÿþ
zSeparation.fit_batchc                 C   sè   |j }|j}|j|jg}| jjdkr| |j¡ t 	¡  |  
|||¡\}}|  ||¡}W d  ƒ n1 s6w   Y  |tjjkrn| jjrnt| jdƒrd| jjdkrc|  |d |||¡ | j jd7  _n
|  |d |||¡ | ¡  ¡ S )z/Computations needed for validation/test batchesr†   NÚn_audio_to_saver   rc   )Úidr‡   rˆ   r‰   r5   rr   rŠ   r‹   rT   ru   r‚   r…   rG   rs   ÚTESTÚ
save_audiorB   rœ   r   r—   )rN   r˜   r€   Úsnt_idr™   rh   r„   rƒ   rQ   rQ   rR   Úevaluate_batch5  s&   
ÿý€zSeparation.evaluate_batchc                 C   s°   d|i}|t jjkr|| _|t jjkrVt| jjtj	ƒr/| j | j
g||¡\}}t | j
|¡ n
| jj
jjd d }| jjj||dœ| j|d | jjd|d idgd dS dS )z"Gets called at the end of a epoch.r   r   r$   )Úepochr$   )Z
stats_metaÚtrain_statsZvalid_stats)ÚmetaZmin_keysN)rG   rs   rt   r£   ZVALIDr7   r5   rE   Ú
schedulersZReduceLROnPlateaur3   Zupdate_learning_rateZoptimZparam_groupsZtrain_loggerZ	log_statsr1   Zsave_and_keep_only)rN   r€   Z
stage_lossr¢   Zstage_statsZ
current_lrZnext_lrrQ   rQ   rR   Úon_stage_endM  s0   ÿ
ÿþú

þízSeparation.on_stage_endc           
      C   sh  d}d}| j jr«g }d}t|jd ƒD ]-}| j  |dd…dd…|f |¡}| |¡ |dkr4|jd }q|jd |k r@|jd }q| j jrud}t|jd ƒD ]&}t | j j	| j j
d¡}||  | j¡||< tj|| |d fdd||< qN|r«| j jrtj|jd ||jd |jtjd	}t|ƒD ]\}}|| dd…d|…f |dd…dd…|f< q‘| d¡}	|	|fS )
z=Adds speed perturbation and random_shift to the input signalsrc   FTNr   ©r   r   )ZshiftsÚdims)r    Zdtype)r5   rv   rq   ÚshapeZspeedperturbrŠ   rw   rT   ÚrandintZ	min_shiftZ	max_shiftro   r    ZrollZzerosrb   Ú	enumeratery   )
rN   rh   Z	targ_lensZmin_lenZ	recombineZnew_targetsrg   Ú
new_targetZ
rand_shiftr   rQ   rQ   rR   rx   m  sL   ÿ

€
ÿÿû,
zSeparation.add_speed_perturbc                 C   sp   t  ddtd|jd | jj ƒ d¡ ¡ }|dd…||| jj …dd…f }|dd…||| jj …f }||fS )z‡This function selects a random segment of a given length within the mixture.
        The corresponding targets are selected accordinglyr   r   r§   N)rT   rª   Úmaxr©   r5   Ztraining_signal_lenÚitem)rN   r™   rh   Z	randstartrQ   rQ   rR   rz   œ  s    ýüÿÿ
ÿzSeparation.cut_signalsc                 C   s6   t |dƒr	| ¡  | ¡ D ]}||kr|  |¡ qdS )z3Reinitializes the parameters of the neural networksÚreset_parametersN)rB   r¯   r4   Úreset_layer_recursively)rN   ÚlayerZchild_layerrQ   rQ   rR   r°   ª  s   

€þz"Separation.reset_layer_recursivelyc                 C   s*  ddl m} tj | jjd¡}g }g }g }g }g d¢}tjj	j
|fi | jj¤Ž}	t|dƒ(}
tj|
|d}| ¡  t|	dd}t|ƒD ]×\}}|j\}}|j}|j|jg}| jjd	krd| |j¡ t ¡  |  |j|tjj¡\}}W d
  ƒ n1 sw   Y  |  ||¡}tj|g| jj dd}|  |j!¡}|  ||¡}| "¡ | "¡  }||d  #¡  $¡  %¡ |d  #¡  &¡  $¡  %¡ ƒ\}}}}||d  #¡  $¡  %¡ |d  #¡  &¡  $¡  %¡ ƒ\}}}}| "¡ | "¡  }|d | "¡ || '¡  | '¡  dœ}| (|¡ | | "¡ ¡ | | "¡ ¡ | | '¡  ¡ | | '¡  ¡ qFdt) *|¡ "¡ t) *|¡ "¡ t) *|¡ "¡ t) *|¡ "¡ dœ}| (|¡ W d
  ƒ n	1 sJw   Y  W d
  ƒ n	1 sZw   Y  t+ ,d -t) *|¡ "¡ ¡¡ t+ ,d -t) *|¡ "¡ ¡¡ t+ ,d -t) *|¡ "¡ ¡¡ t+ ,d -t) *|¡ "¡ ¡¡ d
S )zVThis script computes the SDR and SI-SNR metrics and saves
        them into a csv filer   )Úbss_eval_sourcesztest_results.csv)r    ÚsdrÚsdr_ir   zsi-snr_iÚw)Ú
fieldnamesT)Zdynamic_ncolsr†   Nrc   rj   ZavgzMean SISNR is {}zMean SISNRi is {}zMean SDR is {}zMean SDRi is {}).Zmir_eval.separationr²   r:   r;   r<   r5   r"   rG   ZdataioZ
dataloaderZmake_dataloaderrW   rF   ÚcsvÚ
DictWriterÚwriteheaderr   r«   r‡   r   rˆ   r‰   rr   rŠ   r‹   rT   ru   r‚   rs   rž   r…   r{   ro   r    r   Útr,   Únumpyr—   r®   ÚwriterowÚnpÚarrayr“   r”   r•   )rN   Z	test_datar²   Ú	save_fileZall_sdrsZ
all_sdrs_iZ
all_sisnrsZall_sisnrs_iZcsv_columnsZtest_loaderZresults_csvÚwriterrº   rg   r˜   r™   Zmix_lenr    rh   r„   ZsisnrZmixture_signalZsisnr_baselineZsisnr_ir³   rP   Zsdr_baseliner´   ÚrowrQ   rQ   rR   Úsave_results²  s   ÿÿÿ


ÿÿÿÿþþû
ûÃ€ûDzSeparation.save_resultsc           	   	   C   sN  t j | jjd¡}t j |¡st  |¡ t| jjƒD ]^}|ddd…|f }|| 	¡  
¡  d }t j |d ||d ¡¡}t || d¡ ¡ | jj¡ |ddd…|f }|| 	¡  
¡  d }t j |d ||d ¡¡}t || d¡ ¡ | jj¡ q|d ddd…f }|| 	¡  
¡  d }t j |d |¡¡}t || d¡ ¡ | jj¡ dS )	zFsaves the test audio (mixture, targets, and estimated sources) on diskZaudio_resultsr   Ng      à?zitem{}_source{}hat.wavr   zitem{}_source{}.wavzitem{}_mix.wav)r:   r;   r<   r5   Zsave_folderÚexistsÚmkdirrq   rr   Úabsr­   r•   Ú
torchaudioÚsaverd   r,   Zsample_rate)	rN   r    r™   rh   r„   Z	save_pathÚnsÚsignalr¿   rQ   rQ   rR   rŸ     s8   
ÿþÿþÿzSeparation.save_audio)N)r^   r_   r`   ra   r‚   r…   r›   r¡   r¦   rx   rz   r°   rÂ   rŸ   rQ   rQ   rQ   rR   rL   ±   s    
6F /^rL   )5r·   r:   Útypingr   r   r   r»   r½   ZspeechbrainrG   Zspeechbrain.nnet.schedulersZnnetr¥   rT   Ztorch.nn.functionalrU   Z
functionalr}   rÆ   Ztorch.cuda.ampr   Ztorch.utils.datar   r   Zmodelscope.metainfor   Zmodelscope.modelsr	   r
   Zmodelscope.msdatasetsr   Zmodelscope.trainers.baser   Zmodelscope.trainers.builderr   Zmodelscope.utils.constantr   r   Zmodelscope.utils.devicer   Zmodelscope.utils.loggerr   Zmodelscope.utils.torch_utilsr   r   r   r\   r“   Zregister_moduleZspeech_separationr   ZBrainrL   rQ   rQ   rQ   rR   Ú<module>   s6    