o
    rqi                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlZ	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ dd	lmZ dd
lmZ G dd deZdS )    N)Path)AnyDict)CompositeAudioFeatureTransform)S2TDataConfig)pre_chinese)ModeKeys   )OfaBasePreprocessor)
Text2Phonec                       s   e Zd Zejf fdd	Zdeeef deeef fddZ	deeef deeef fddZ
deeef deeef fd	d
Zdd Zdd Z  ZS )OfaASRPreprocessorc                    s   t t| j|||g|R i | tttj|d| _t	
| jdd| _t	
| jdd| _ttj|d| _| tj|d\| _| _dS )	zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        zfbank_config.yamltrainTtestFztext2phone_dict.txtzphone_dict.txtN)superr   __init__r   r   ospathjoinZdata_cfgr   Zfrom_config_dictZget_feature_transformsZtrain_audio_feature_transformsZtest_audio_feature_transformsr   text2phone_tokenizerbuild_phone_dictphone_to_idid_to_phone)selfcfgZ	model_dirmodeargskwargs	__class__ g/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/preprocessors/ofa/asr.pyr      s$   zOfaASRPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r   r   ZTRAIN_build_train_sample_build_infer_sample)r   r!   r   r   r    __call__2   s   

zOfaASRPreprocessor.__call__c                 C   sH  t g d}| || jd  }tj|ddd\}}| jtj|gtj	d||ddd}tdg}|||| jd  d	}|d
 }	| j
dkrVt|	| j}	| j|	dd|d< n!|	| j }	|	  }
d|
d | j }	| j|	dd|d< | |	d }tdg}|d |d< ||d< ||d< t| j|d d d g|d< |S )N)g?      ?g?wav>  TsrmonoZdtypeZtarget_sample_rateZis_traintext)fbank
fbank_masklabelr1   zhF)Zadd_bostarget r	      
phone_itemZphone_target
phone_maskZprev_output_tokens)randomchoiceget_audio_bytes
column_maplibrosaloadprepare_fbanktorchtensorfloat32languager   Zmax_tgt_lengthZtokenize_text	translateZtranstabstripsplitr   to_phonecatZbos_item)r   r!   speedaudio_bytesr'   r*   r/   r0   sampler3   Ztarget_token_listr6   r7   r   r   r    r#   8   s@   
z&OfaASRPreprocessor._build_train_samplec           	      C   s   d}|  || jd  }tj|ddd\}}| jtj|gtjd||ddd}tdg}||d	}d
| jv rG| jd
 |v rG|| jd
  |d< tg d|d< tdg|d< |S )Nr&   r'   r(   Tr)   r,   Fr-   )r/   r0   r.   r1   )   rL   rL   r6   r7   )r;   r<   r=   r>   r?   r@   rA   rB   )	r   r!   rI   rJ   r'   r*   r/   r0   rK   r   r   r    r$   ^   s"   
z&OfaASRPreprocessor._build_infer_samplec                    s.    j |}t fdd|dD }|S )Nc                    s   g | ]} j | qS r   )r   ).0xr   r   r    
<listcomp>w   s    z/OfaASRPreprocessor.to_phone.<locals>.<listcomp>r4   )r   Ztransr@   rA   rF   )r   r.   Zphonesidsr   rO   r    rG   u   s   zOfaASRPreprocessor.to_phonec                 C   sx   t  }t  }t|d%}t|D ]\}}| dd }|||< |||< qW d    ||fS 1 s3w   Y  ||fS )Nrr4   r   )dictopen	enumeraterE   rF   )r   Zphone_dict_pathr   r   Zphone_dict_fileilinephoner   r   r    r   z   s   

z#OfaASRPreprocessor.build_phone_dict)__name__
__module____qualname__r   Z	INFERENCEr   r   strr   r%   r#   r$   rG   r   __classcell__r   r   r   r    r      s    """&r   )r   r9   pathlibr   typingr   r   r=   Z	soundfileZsfr@   Z%fairseq.data.audio.feature_transformsr   Z)fairseq.data.audio.speech_to_text_datasetr   Zmodelscope.utils.chinese_utilsr   Zmodelscope.utils.constantr   baser
   Zutils.text2phoner   r   r   r   r   r    <module>   s   