o
    rqi!                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZd dl	m   m
Z d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ G dd deZd	d
 ZG dd dZeejG dd deZdS )    N)AnyDictTupleUnion)File)Preprocessor)PREPROCESSORS)FieldsModeKeysc                       sR   e Zd ZdZejfdedef fddZdeee	f deee	f fdd	Z
  ZS )
AudioBrainPreprocessorzA preprocessor takes audio file path and reads it into tensor

    Args:
        takes: the audio file field name
        provides: the tensor field name
        mode: process mode, default 'inference'
    takesprovidesc                    sB   t t| j|g|R i | || _|| _dd l}|jjj| _d S )Nr   )superr   __init__r   r   ZspeechbrainZdataio
read_audio)selfr   r   modeargskwargssb	__class__ e/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/preprocessors/audio.pyr      s
   zAudioBrainPreprocessor.__init__datareturnc                 C   s   |  || j }||| j< |S N)r   r   r   )r   r   resultr   r   r   __call__&   s   
zAudioBrainPreprocessor.__call__)__name__
__module____qualname____doc__r
   Z	INFERENCEstrr   r   r   r   __classcell__r   r   r   r   r      s    *r   c                 C   s   t | ddd}| }|d}|d|}|d|}tj||d | tjdd	}|d
}|d|}|d|}tj||d | tjdd	}|  ||fS )Nrzutf-8)encodingZAddShift[]    )ZdtypesepZRescale)openreadfindnp
fromstringfloat32close)filenamefpZall_strZpos1pos2Zpos3meanscaler   r   r   load_kaldi_feature_transform,   s   

r8   c                   @   s0   e Zd ZdZ			dddZdd Zd	d
 ZdS )Featurez%Extract feat from one utterance.
    specNFc                 C   s   || _ || _|d |d  d | _|d |d  d | _tj| jdd| _d| _|durNtj	
|rNtd|  t|\}}t|| _t|| _d	| _|rg| j | _| jri| j | _| j | _dS dS dS )
aF  

        Args:
            fbank_config (dict):
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of data file for mean variance normalization
            cuda:
        Zframe_lengthZsample_frequencyi  Zframe_shiftF)ZperiodicNzloading mvn file: T)fbank_config	feat_typen_fft
hop_lengthtorchZhamming_windowwindowmvnospathexistsprintr8   
from_numpyshiftr7   cuda)r   r;   r<   Zmvn_filerH   rG   r7   r   r   r   r   ?   s6   zFeature.__init__c              	   C   s   | j dkr|S | j dkr-ddlm  m} t|jdkr!|d}|j|fi | j}|S | j dkrUt	j
|d | j| j| j| jdd	d
}t	j|j|jgdddd}|S | j dkr|t	j
|| j| j| j| jdd	d
}t	|d }t	d| dd}|S )zm

        Args:
            utt: in [-32768, 32767] range

        Returns:
             [..., T, F]
        rawfbankr   Nr)   r:   i   FT)centerZreturn_complexdimZlogpow   )r<   Ztorchaudio.compliance.kaldiZ
compliancekaldilenshapeZ	unsqueezerJ   r;   r?   Zstftr=   r>   r@   catrealimagZpermuteabslog)r   ZuttrQ   featr:   Zabspowr   r   r   computee   sB   
	



zFeature.computec                 C   s   | j r|| j }|| j }|S r   )rA   rG   r7   )r   rY   r   r   r   	normalize   s   

zFeature.normalize)r:   NF)r   r    r!   r"   r   rZ   r[   r   r   r   r   r9   ;   s    
&(r9   c                   @   sN   e Zd ZdZdd Zdeeeee	f f deee	f fddZ
edd	 Zd
S )LinearAECAndFbanki>  c                 C   sT   dd l }d| j | _|d | _t|d |d |d | _| | _|d dk| _d S )	Nr   i   linear_aec_delayr;   r<   rA   Zmask_onnearend_mic)	MinDAECSAMPLE_RATEtrunc_lengthr]   r9   featureloadmitaecmask_on_mic)r   Z	io_configr_   r   r   r   r      s   

zLinearAECAndFbank.__init__r   r   c                 C   s  t |tr| |d \}}| |d \}}t|}n%| |d \}}| |d \}}d|v r=| |d \}}nt|}| j||\}}}}	tt| j	| g}
t
|
|g}tt|t|t|t|	t|}d}t|| j}||| ||| ||| |	|| ||| f\}}}}	}t }tt|}| j|}tj||gdd}tt|}| j|}tj||gdd}tt|	}	| j|	}tj||gdd}| j|}|durtt|}| jr|}n|}|||d}|S )	u7   Linear filtering the near end mic and far end audio, then extract the feature.

        Args:
            data: Dict with two keys and correspond audios: "nearend_mic" and "farend_speech".

        Returns:
            Dict with two keys and Tensor values: "base" linear filtered audio，and "feature"
        r   r)   r^   farend_speechnearend_speechrM   N)basetargetrb   )
isinstancetupleload_wavr/   Z
zeros_likerd   Zdo_linear_aecZzerosintr]   ZconcatenateminrR   ra   r?   ZFloatTensorrF   r1   rb   rZ   rT   r[   re   )r   r   r^   fsrf   rg   Zout_micZout_refZ
out_linearZout_echoZextra_zerosZflenZfstartrY   Zfbank_nearend_micZfbank_out_linearZfbank_out_echorh   Zout_datar   r   r   r      sT   
	

zLinearAECAndFbank.__call__c                 C   s   dd l }t| trt| } nt| trt| }t|} n
tdt	|  dt
| \}}t|jdkr;td|tjkrH|||tj}|tjtjfS )Nr   zUnsupported input type: .r)   z(modelscope error:The audio must be mono.)librosarj   bytesioBytesIOr#   r   r-   	TypeErrortypewavrR   rS   
ValueErrorr\   r`   ZresampleZastyper/   r1   )Zinputsrq   Z
file_bytesZsample_rater   r   r   r   rl      s   



zLinearAECAndFbank.load_wavN)r   r    r!   r`   r   r   r   r   r#   r   r   staticmethodrl   r   r   r   r   r\      s    *	Cr\   )rs   rB   typingr   r   r   r   numpyr/   Zscipy.io.wavfileZwavfilerw   r?   Zmodelscope.fileior   Zmodelscope.preprocessorsr   Z modelscope.preprocessors.builderr   Zmodelscope.utils.constantr	   r
   r   r8   r9   Zregister_moduleZaudior\   r   r   r   r   <module>   s   
Y