o
    rqig                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z" e" Z#dd Z$				dddZ%dddZ&G dd dZ'dS )    N)OrderedDict)Lock)get_am_datasetsget_voc_datasets)model_builder)criterion_builder)GAN_TrainerSambert_Trainerdistributed_init)KanTtsLinguisticUnit)
DataLoader)TtsCustomParams)TtsModelConfigurationExceptionTtsModelNotExistsException)
get_loggerc                 C   s   t dd |  D S )Nc                 s   s    | ]
}|j r| V  qd S )N)Zrequires_gradZnumel).0p r   h/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/audio/tts/voice.py	<genexpr>   s    z#count_parameters.<locals>.<genexpr>)sum
parameters)modelr   r   r   count_parameters   s   r      333333?mean_stdc           	      C   s(  |dkrP|}| d d df }| d d df }d|||k < d|||k< ||dd d d f  |ddd d f  }||||k < || d d df< || d d df< | S |}| d d df }| d d df }d|||k < d|||k< ||d |d   |d  }||||k < || d d df< || d d df< | S )Nr                 ?   r   r   )	melZf0_thresholdZuv_threshold	norm_type
f0_featureZf0_mvnZf0ZuvZf0_global_max_minr   r   r   	denorm_f0    s2   ,r%   c                 C   sN   |   }t| d d df |k d }d|d d df< d|d d df |< |S )Nr   r   r    r   )clonetorchwhere)r"   	thresholdZres_melindexr   r   r   binarizeF   s
   r+   c                   @   sn   e Zd Zdi ddfddZdd Zdd	 Zd
d Zdd Zdd Zde	 fddZ
de	 fddZdd ZdS )VoiceNTFc                 C   s  || _ || _|| _|| _tj std| _d| _ndtj	j
_t \| _| _| _| _t|dkr|tj | _|tj | _tj| jsMtj|| j| _tj| js]tj|| j| _|tj }|tj }tj|sttj||}tj|stj||}| || _| || _|tjd| _ tj| j stj|| j | _ |tj!d| _"tj| j"stj|| j"| _"|tj#d| _$tj| j$stj|| j$| _$|tj%d| _&tj| j&stj|| j&| _&nNtj|d| _$tj|d	d
| _tj|dd
| _tj|d	d| _ | tj|d	d| _| tj|dd| _tj|d	d| _&tj|ddd| _"t'(d| j d| j  t'(d| j$  t'(d| j  t'(d| j  t'(d| j  d| j"  t'(d| j&  tj)| jst*dtj)| jst*dt| jdkrt+dt| jdkrt+dt,| jd}t-j.|t-j/d| _0W d    n	1 sw   Y  t,| jd}t-j.|t-j/d| _1W d    n	1 sw   Y  d| j0vrt*d| j0d dd| _2d| _3t4 | _5t6| j0| _7| j78 | _9| jr0t:d }	| j9; D ]\}
}|
|	v r.|d! | j9|
< q| j0d" d# d$ <| j9 | j0d" d# d$ d%d| _=| j=rk| jsktj)| j sdt*d&| j  d't>.| j | _?| j0d" d# d$ d(d| _@| j@r| js| j0d" d# d$ d)d*| _A| jAd*krtj)| j&st+d+| j& d't>.| j&| _Bd S | j0d" d# d$ d,d-}| j0d" d# d$ d.d/}||g| _Bd S d S d S )0NcpuFTr   zse.npyzse.onnxzaudio_config.yamlzmvn.npyamconfig.yamlZvocckptsez
am_config=z voc_config=zaudio_config=z	am_ckpts=z
voc_ckpts=zse_path=z se_model_path=z	mvn_path=z,modelscope error: am configuration not foundz-modelscope error: voc configuration not foundz)modelscope error: am model file not foundz*modelscope error: voc model file not foundrLoaderZlinguistic_unitzno linguistic_unit in am configlanguageZPinYin)ZsyZtoneZsyllable_flagZword_segmentZemotionspeakerr!   ModelKanTtsSAMBERTparamsZSEzse enabled but se_file: not existsZNSFnsf_norm_typer   zf0_mvn_file: nsf_f0_global_minimumg      >@nsf_f0_global_maximumg     І@)C
voice_name
voice_pathignore_maskis_trainr'   cudaZis_availabledevicedistributedbackendsZcudnnZ	benchmarkr
   
local_rank
world_sizelenr   Z	AM_CONFIGZam_config_pathZ
VOC_CONFIGZvoc_config_pathospathisabsjoinZAM_CKPTZVOC_CKPT	scan_ckptam_ckpts	voc_ckptsgetZSE_FILEZse_pathZSE_MODELZse_model_pathZAUIDO_CONFIGaudio_configZMVN_FILEZmvn_pathloggerinfoexistsr   r   openyamlloadr4   	am_config
voc_configZ	lang_typemodel_loadedr   lockr   	ling_unitget_unit_sizeling_unit_sizesetitemsupdate	se_enablenpr1   
nsf_enabler;   r$   )selfr>   r?   Zcustom_ckptr@   rA   Zam_ckptZvoc_ckptfZ
target_setkvr<   r=   r   r   r   __init__Q   sL  







zVoice.__init__c                 C   s   |}d}t j|sd}t j|}t |}t|dkri S i }|D ]=}t|d dkr.q#|dd  dkr`|dd dkr`|d	d }t|d
d }t j||}	|r\|	|kr\q#|	||< q#t	t
| }
|
S )NFTr      .pth
   
checkpoint._r   )rI   rJ   isdirdirnamelistdirrH   splitintrL   r   sortedr`   )re   Z	ckpt_pathZselect_targetZinput_not_dirfilelistZckptsfilenameZfilename_prefixidxrJ   Zodr   r   r   rM      s,   
 zVoice.scan_ckptc                 C   sd   t | j| j\| _}}| jd | _tj| jtt	| j | jd}| jj
|d dd | j  d S )Nr8   Zmap_locationr   F)strict)r   rX   rC   Zam_modelr.   r'   rW   rN   nextreversedload_state_dicteval)re   rp   Z
state_dictr   r   r   load_am   s   zVoice.load_amc                 C   s   ddl m} |di | jd d d | _tj| jtt| j | j	d}| j
|d d  | jd d d d	 d
krFddlm} | | _| j  | j | j	 d S )Nr   )	Generatorr7   r   r9   rz   r   	generatorZout_channelsr!   )PQMFr   )Zkantts.models.hifigan.hifiganr   rY   	voc_modelr'   rW   rO   r|   r}   rC   r~   Zkantts.models.pqmfr   Zremove_weight_normr   to)re   r   Zstatesr   r   r   r   load_vocoder   s   
zVoice.load_vocoderc              	   C   s  | j J t 3 | j|}d}| j r1t||  | j	}tj
|gddd}nNt||  | j	}|d }t||  | j	}|d }t||  | j	}|d }t||  | j	}	tj
||||	gddd}|d }t||  | j	d}
|d }| jrt| jjt|| dd | j	dd d d dd d f }nt||  | j	dd d d df }td| j	 |
d d }| |d d d dd d f |
d d d df ||}|d }|d }t|d  }|dd |d d f  }| jr-t|| j| jd}|W  d    W  d    S 1 sAw   Y  W d    d S 1 sRw   Y  d S )	Nr   r   )dimr!   )Zaxispostnet_outputsLR_length_rounded)r#   r$   )r[   r'   no_gradr\   Zencode_symbol_sequenceZ
using_byteZ
from_numpylongr   rC   stack	unsqueezerb   r1   repeatrH   floatZzerossizer.   ru   itemr-   rd   r%   r;   r$   )re   
symbol_seqZinputs_feat_lstZinputs_feat_indexZinputs_byte_indexZinputs_lingZ	inputs_syZinputs_toneZinputs_syllableZ	inputs_wsZ
inputs_emoZ
inputs_spkZ
inputs_lenresr   r   Zvalid_lengthZmel_postr   r   r   
am_forward  s   



.$zVoice.am_forwardc                 C   s   t  ; || j}| jjrt|}|ddd}| |}t	| jdr-| j
|}|d  }|W  d    S 1 sBw   Y  d S )Nr!   r   Zpqmfr   )r'   r   r   rC   r   rd   r+   Z	transposer   hasattrZ	synthesisviewr-   numpy)re   Zmelspecxyr   r   r   vocoder_forwardQ  s   

$zVoice.vocoder_forwardc           !         s  t d t| jdkrtd|dd}|dk r!|dd}n|dd}|d	d}	t| jd
}
tj	|
tj
d}W d    n1 sFw   Y  t|d
}
|tj	|
tj
d || W d    n1 skw   Y  d }|rtt| j}| j| }tj|std| dn|| jvrtd| | j| }|	dkr|	| }||d< t d|  tdt |d< ddlm} ||d< ttj|dd}
tj||
tjd d W d    n1 sw   Y  | D ]\}}t | d|  q| jrtj |d< d|d< | jrd}d}nd}d}|d d d d d  fd!d"|D }t||||d# d$| d%\}}t d&t| d' t d(t| d' d d d)}| jr}dd*lm } ||| j!dd+|d,< |ry||| j!dd+nd |d-< t"|| jrdnd|j#|d. |d/ |d, |d0 d1}|rt"|| jrdnd|j#|d. |d/ |d- |d0 d1nd }|j$% }|d d d | t&|| j'| j(| j\}}}t)|| j'}t*|||||| j'||||||d2 |d3 |d4 |d5 d6}|d ur
|+|dd t d7| d' z|,  W d S  t-t.fyL }  z,t j/| dd8 |0tjtj|d9d:|j1 d; t d<|j1 d= W Y d } ~ d S d } ~ ww )>NzTRAIN SAMBERT....r   "resume pretrain but model is emptyresume_from_stepsr   resume_from_latestTFtrain_stepsr2   r3   latest model:r:   no such model from steps:train_max_stepsTRAINING steps: %Y-%m-%d %H:%M:%Screate_time__version__modelscope_versionr/   wDumperZdefault_flow_style = ZrankrD   r   g{Gz?r7   r8   r9   ZFPc                    s"   g | ]}t j| sd ndqS )zraw_metafile.txtzfprm_metafile.txt)rI   rJ   rL   )r   dZ	fp_enabler   r   
<listcomp>  s    
z'Voice.train_sambert.<locals>.<listcomp>Zallow_cacher    )Zsplit_ratioThe number of training files = ro   !The number of validation files = trainvalidDistributedSamplerZdatasetZnum_replicasshuffler   r   
batch_sizenum_workers
pin_memoryr   
collate_fnr   r   samplerr   save_interval_stepseval_interval_stepslog_intervalZ	grad_norm)configr   	optimizer	scheduler	criterionrC   r   train_loadervalid_loader	max_stepssave_dirsave_intervalvalid_intervalr   Z	grad_clipSuccessfully resumed from exc_infor0   checkpoint-rl    Successfully saved checkpoint @ steps.)2rR   rS   rH   rN    TtsTrainingInvalidModelExceptionrP   rU   rQ   rV   rW   r4   ra   r|   r}   rI   rJ   rT   timestrftime	localtime
modelscoper   rL   dumpr   r`   rD   r'   Zget_rankrb   r   torch.utils.data.distributedr   rG   r   r   r\   r]   r   rC   rF   r   r	   load_checkpointr   	ExceptionKeyboardInterrupterrorsave_checkpointsteps)!re   work_dir	stage_dirdata_dirconfig_pathignore_pretrainhparams
from_stepsfrom_latestr   rf   r   resume_fromr   r   keyvalueZvalid_enableZvalid_split_ratioZ	meta_filetrain_datasetvalid_datasetr   r   train_dataloadervalid_dataloaderr^   r   r   r   r   trainerer   r   r   train_sambert]  s"  








	



zVoice.train_sambertc                 C   s  t d t| jdkrtd|dd}|dk r!|dd}n|dd}|d	d}	t| jd
}
tj	|
tj
d}W d    n1 sFw   Y  t|d
}
|tj	|
tj
d || W d    n1 skw   Y  d }|rtt| j}| j| }tj|std| dn|| jvrtd| | j| }|	dkr|	}||d< t d|  t d|  tdt |d< ddlm} ||d< ttj|dd}
tj||
tjd d W d    n1 sw   Y  | D ]\}}t | d|  qt||\}}t dt| d t dt| d d d d}| jrEddlm} ||| jdd|d < ||| jdd|d!< t|| jrMdnd|j |d" |d# |d  |d$ d%}t|| jrgdnd|j |d" |d# |d! |d$ d%}t!|| j"| j#| j\}}}t$|| j"}t%|||||| j"||||||d& |d' |d( d)}|d ur|&| t d*| d z|'  W d S  t(t)fy } z,t j*|dd+ |+tjtj|d,d-|j, d. t d/|j, d0 W Y d }~d S d }~ww )1NzTRAIN HIFIGAN....r   r   r   r   r   TFr   r2   r3   r   r:   r   r   r   zresume from: r   r   r   r   r/   r   r   r   r   ro   r   r   r   r   r   r   r   r   r   r   r   r   Zlog_interval_steps)r   r   r   r   r   rC   r   r   r   r   r   r   r   r   r   r   r0   r   rl   r   r   )-rR   rS   rH   rO   r   rP   rU   rQ   rV   rW   r4   ra   r|   r}   rI   rJ   rT   r   r   r   r   r   rL   r   r   r`   r   rD   r   r   rG   r   r   r   rC   rF   r   r   r   r   r   r   r   r   r   )re   r   r   r   r   r   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   train_hifigan   s   












zVoice.train_hifiganc                 C   sR   | j  | js|   |   d| _W d    n1 sw   Y  | | |S )NT)r[   rZ   r   r   r   r   )re   r   r   r   r   forward  s   zVoice.forward)__name__
__module____qualname__ri   rM   r   r   r   r   dictr   r   r   r   r   r   r   r,   O   s*    
 	E
 )
 r,   )r   r   r   N)r   )(rI   pickleZpklr   collectionsr   	threadingr   jsonr   rc   r'   rV   Zkantts.datasets.datasetr   r   Zkantts.modelsr   Zkantts.train.lossr   Zkantts.train.trainerr   r	   r
   Z kantts.utils.ling_unit.ling_unitr   Ztorch.utils.datar   Z"modelscope.utils.audio.audio_utilsr   Z%modelscope.utils.audio.tts_exceptionsr   r   Zmodelscope.utils.loggerr   rR   r   r%   r+   r,   r   r   r   r   <module>   s6   

&	