o
    W+ i2K                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZm Z m!Z!m"Z" ee Z#eZ$e	e%ee#e$f f Z&eeeee  f Z'eeeee  f Z(e)e*Z+d	ej,d
e%de-deej,ee% f fddZ.dej/de&dej/fddZ0ej1ddd		d5de%de%dee% de2def
ddZ3	d6de%de%dee& defd d!Z4					d7d"ee% de%dee% d#eee%  dee% d$eee&  de2de'fd%d&Z5		 			d8d	ej,d
e%d'ed(eee  d)e-d*ee- d+ee- d,e2de(fd-d.Z6	 			d9d	ej,d
e%d)e-d*ee- d+ee- d,e2fd/d0Z7G d1d2 d2eZ8G d3d4 d4e8Z9dS ):    N)DictIterableListOptionalTupleUnion)UnicoreDataset
data_utils)utils   )	NumpyDict	TorchDict)process_featuresprocess_labels)add_assembly_featuresconvert_monomer_features
merge_msaspair_and_mergepost_processconfigmodenum_resreturnc                 C   s   t | }|| }|  |jd u r||_W d    n1 s w   Y  |jj|jj }|jjr7||jj7 }|jj	rA||jj
7 }|| jrL||jj7 }||fS N)copydeepcopyunlockedZ	crop_sizecommonZunsupervised_featuresZrecycling_featuresZuse_templatesZtemplate_featuresZis_multimerZmultimer_featuresZ
supervisedZsupervised_features)r   r   r   cfgZmode_cfgfeature_names r    u/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/science/unifold/dataset.pymake_data_config!   s   



r"   all_atom_positions	operationc                 C   sD   |dkr| S |\}}t |dd}t |d}| |j | S )NI   )nparrayreshapeT)r#   r$   ZrotZtransr    r    r!   process_label6   s   r+      T)maxsizer   Fsequence_idmonomer_feature_diruniprot_msa_dir
is_monomerc                 C   s   t tj||  d}t|}i |}|d urOt tj||  d}|r?t|d |d |d |d \|d< |d< |S t |}dD ]}|| ||< qF|S )Nz.feature.pkl.gzz.uniprot.pkl.gzZmsaZdeletion_matrix)Zmsa_all_seqZmsa_species_identifiers_all_seqZdeletion_matrix_all_seq)r
   load_pickleospathjoinr   r   Zconvert_all_seq_feature)r.   r/   r0   r1   Zmonomer_featureZchain_featureZall_seq_featurekeyr    r    r!   load_single_feature@   s0   

r7   label_id	label_dirsymmetry_operationc                 C   sJ   t tj||  d}|d urt|d ||d< dd | D }|S )Nz.label.pkl.gzr#   c                 S   s   i | ]\}}|d v r||qS )Zaatyper#   Zall_atom_mask
resolutionr    .0kvr    r    r!   
<dictcomp>n   s    z%load_single_label.<locals>.<dictcomp>)r
   r2   r3   r4   r5   r+   items)r8   r9   r:   labelr    r    r!   load_single_labeld   s   rD   sequence_ids	label_idssymmetry_operationsc           
         s    fdd| D }|d ur@t |t | ksJ d usJ |d u r*dd |D }fddt||D }dd t||D  t|}|d urPdd |D }nd }tjdd |D tjd}	 rf|d	 }nt|}t|}|	|d
< ||fS )Nc                    s   g | ]	}t | qS r    )r7   r>   s)r1   r/   r0   r    r!   
<listcomp>   s    zload.<locals>.<listcomp>c                 S   s   g | ]}d qS )r%   r    )r>   _r    r    r!   rJ      s    c                    s   g | ]
\}}t | |qS r    )rD   )r>   llo)r9   r    r!   rJ      s    
c                 S   s   g | ]	\}}| |qS r    )update)r>   frL   r    r    r!   rJ      s    c                    s   g | ]  fd ddD qS )c                    s   i | ]}| | qS r    r    r>   r?   rO   r    r!   rA      s    z#load.<locals>.<listcomp>.<dictcomp>r;   r    )r>   r    rQ   r!   rJ      s
    

c                 S      g | ]}|d  qS )
seq_lengthr    )r>   cr    r    r!   rJ          )Zdtyper   asym_len)lenzipr   r'   r(   Zint64r   r   )
rE   r/   r0   rF   r9   rG   r1   Zall_chain_featuresZall_chain_labelsrV   r    )r1   r9   r/   r0   r!   loadv   s6   


rY   featureslabelsseed	batch_idxdata_idxis_distillationc              	   C   s  |dkr8|d us
J t j||dd tjd| jjd }tj | | jk }	W d    n1 s2w   Y  n| jj}d}	t	||d< t	|	|d< t	||d< |r[d	|v r[|
d	 t	|d
 }
t| ||
d\}}|d ury|d d d|d< t j||dd> tjdd|d< tj||d}dd | D }t  t||j|| }W d    n1 sw   Y  W d    n1 sw   Y  |d urdd |D }t  t|}W d    ||fS 1 sw   Y  ||fS )NtrainZ	recyclingr6   r   r   Znum_recycling_itersuse_clamped_faper_   Z
msa_chainsrS   )r   r   r<   Zprotein_featurei{  Zcrop_and_fix_size_seed)Zdesired_keysc                 S      i | ]
\}}|t |qS r    torchZtensorr=   r    r    r!   rA          zprocess.<locals>.<dictcomp>c                 S   s   g | ]}d d |  D qS )c                 S   rd   r    re   r=   r    r    r!   rA      rg   z&process.<locals>.<listcomp>.<dictcomp>rB   )r>   rL   r    r    r!   rJ      s    zprocess.<locals>.<listcomp>)r	   
numpy_seedr'   randomrandintr   Zmax_recycling_itersrandZuse_clamped_fape_probintpopr"   r)   r
   filterrB   rf   Zno_gradr   r   )r   r   rZ   r[   r\   r]   r^   r_   Z	num_itersrb   r   r   r   r    r    r!   process   sN   




rp   c           
   	   K   sR   d|vr|n| d}tdi |d|i\}}	t| |||	||||\}}	||	fS )Nr1   r    )rn   rY   rp   )
r   r   r\   r]   r^   r_   Zload_kwargsr1   rZ   r[   r    r    r!   load_and_process   s   rq   c                   @   sj   e Zd Z				dddZdd Zdd	d
Zdd Zdd Zedd Z	ede
eee f fddZdS )UnifoldDatasetr`   NF c	                 C   s  || _ dd }	|	tj | j || d }
|	tj | j || d | _| | j| _i | _| jD ]}| j| }|
| | j|< q1|
| _t	d
t| jt| j tj | j d| _tj | j d| _tj | j |d }|d	krtj |r|s|	|| _t	d

t| j tj | j d| _tj | j d| _nd | _|jt  |jd  | _|d ur|| j nt| j| _|| _| | j\| _| _| _| | j\| _| _| _| jd ur| | j\| _| _ | _!|j"| _#|| _$|j%| _%d S )Nc                 S   s   t t| dddS )Nrutf-8encoding)jsonrY   open)filenamer    r    r!   	load_json   s   z*UnifoldDataset.__init__.<locals>.load_jsonz_sample_weight.jsonz_multi_label.jsonz$load {} chains (unique {} sequences)pdb_features
pdb_labelszsd_train_sample_weight.jsonr`   z"load {} self-distillation samples.Zsd_featuresZ	sd_labelsr   )&r4   r3   r5   multi_label_inverse_mapinverse_multi_labelsample_weightZseq_sample_weightloggerinfoformatrW   feature_path
label_pathisfilesd_sample_weightsd_feature_pathsd_label_path
batch_sizedistributed_utilsZget_data_parallel_world_sizeZupdate_freqdata_lenr   cal_sample_weightnum_seqseq_keysseq_sample_prob	num_chain
chain_keyssample_probsd_num_chainsd_chain_keyssd_sample_probdatar   r\   sd_prob)selfargsr\   r   	data_pathr   max_step
disable_sdjson_prefixr{   r   chainentityZsd_sample_weight_pathr    r    r!   __init__   sx   








zUnifoldDataset.__init__c                    s>   t   }t   fdd|D }t|}|||fS )Nc                    s   g | ]} |  qS r    r    rP   r   Z
sum_weightr    r!   rJ   1  s    z4UnifoldDataset.cal_sample_weight.<locals>.<listcomp>)listkeyssumvaluesrW   )r   r   Z	prot_keysr   Znum_protr    r   r!   r   .  s
   
z UnifoldDataset.cal_sample_weightc                 C   s
  d}| j dkrvtj| j|dd] | jd ur!tjdd | jk nd}|r7tjj	| j
| jd}| j| }|}n/|sNtjj	| j| jd}| j| }| j| }ntjj	| j| jd}| j| }tj	| j| }W d    n1 spw   Y  n
| j| }| j| }|||fS )NFr`   Zdata_samplera   r   r   )p)r   r	   ri   r\   r   r'   rj   rl   r   choicer   r   r   r   r   r   r   r   r   r   r~   )r   idxsample_by_seqr_   Zprot_idxZ
label_nameZseq_nameZseq_idxr    r    r!   sample_chain5  s@   







zUnifoldDataset.sample_chainc           	      C   sn   | j |dd\}}}|s| j| jfn| j| jf\}}t| j| j| j|| j	 |||g|d |g|d dd\}}|S )NT)r   
r]   r^   r_   rE   r/   r0   rF   r9   rG   r1   )
r   r   r   r   r   rq   r   r   r\   r   )	r   r   r.   r8   r_   Zfeature_dirr9   rZ   rK   r    r    r!   __getitem__R  s2   

zUnifoldDataset.__getitem__c                 C   s   | j S r   )r   r   r    r    r!   __len__i  s   zUnifoldDataset.__len__c                 C   s   t j| ddS )Nr   dim)r	   collate_dict)samplesr    r    r!   collaterl  s   zUnifoldDataset.collatermappingc              
   C   s`   i }|   D ]'\}}|D ] }||v r(|| }||ks(J d| d| d| d|||< qq|S )Nzmultiple entities (z, z) exist for reference .rh   )r   Zinverse_mappingentrefsrefZent_2r    r    r!   r   q  s   

zUnifoldDataset._inverse_mapr`   NFrs   )F)__name__
__module____qualname__r   r   r   r   r   staticmethodr   r   strr   r   r    r    r    r!   rr      s    
?

 rr   c                       s   e Zd Z				ddejdedejded	ed
ee dedef fddZ	dd Z
edd Zedd Zedd Zedd Z  ZS )UnifoldMultimerDatasetr`   NFrs   r   r\   r   r   r   r   r   r   c	           
   
      s   t  |||||||| || _tttj| j|d dd| _	| 
| j| _tj| jd| _tj| jd| _tj| jd| _|j| _| jdkrk| | j| j	| j| j\| _| _| | j\| _| _| _d S d S )Nzpdb_assembly.jsonru   rv   r|   Zpdb_uniprotsr}   r`   )superr   r   rx   rY   ry   r3   r4   r5   pdb_assembly
get_chainsr   
pdb_chainsmonomer_feature_pathuniprot_msa_pathr   
max_chainsr   filter_pdb_by_max_chainsr   r   r   r   r   )
r   r   r\   r   r   r   r   r   r   kwargs	__class__r    r!   r     s8   


zUnifoldMultimerDataset.__init__c                    s    |\}}}|r|g}|g}jd j}}}	d }
nE|  jv rGjdkrG fddj  d D }dd j  d D }
nj  }d }
fdd|D }jjj	}}}	t
jjj|j |||||||	|
dd	S )
Nr`   c                    s   g | ]} d  | qS )rK   r    )r>   id)pdb_idr    r!   rJ     s    
z6UnifoldMultimerDataset.__getitem__.<locals>.<listcomp>chainsc                 S   s   g | ]}|qS r    r    )r>   tr    r    r!   rJ     s    Zopersc                    s   g | ]} j | qS r    )r   )r>   Zchain_idr   r    r!   rJ     s    
Fr   )r   r   r   get_pdb_namer   r   r   r   r   r   rq   r   r\   r   )r   r   Zseq_idr8   r_   rF   rE   r   r   r   rG   r    )r   r   r!   r     sR   





z"UnifoldMultimerDataset.__getitem__c                 C   sh   t | dkrd S dd | D }dd | D }z	tj|dd}W n ty+   td|w |s0d }||fS )Nr   c                 S   rR   )r   r    rH   r    r    r!   rJ     rU   z3UnifoldMultimerDataset.collater.<locals>.<listcomp>c                 S   s    g | ]}|d  dur|d  qS )r   Nr    rH   r    r    r!   rJ     s     r   r   zcannot collate features)rW   r	   r   BaseException
ValueError)r   ZfeatsZlabsr    r    r!   r     s   
zUnifoldMultimerDataset.collaterc                 C   s   |  dd S )NrK   r   )split)r   r    r    r!   r     s   z#UnifoldMultimerDataset.get_pdb_namec                 C   s:   i }| D ]}t |}||vrg ||< || | q|S r   )r   r   append)Zcanon_chain_mapr   r   pdbr    r    r!   r     s   
z!UnifoldMultimerDataset.get_chainsc                    s   i  | D ])}||v rt || d }||kr| |  |< qt | | }|dkr-| |  |< q fddD }tdt | t    dt |  dt t |  dt  d| 
  |fS )	Nr   r   c                    s$   i | ]}t | v r|| qS r    )r   r   rP   Znew_pdb_chainsr   r    r!   rA     s
    zCUnifoldMultimerDataset.filter_pdb_by_max_chains.<locals>.<dictcomp>zfiltered out z / z PDBs (z chains) by max_chains )rW   r   r   )r   r   r   r   r   sizeZnew_sample_weightr    r   r!   r     s0   z/UnifoldMultimerDataset.filter_pdb_by_max_chainsr   )r   r   r   mlc
ConfigDictrm   r   r   boolr   r   r   r   r   r   r   __classcell__r    r    r   r!   r     s>    	!1


	r   )NFr   )NNNNF)Nr   NNF)r   NNF):r   loggingr3   typingr   r   r   r   r   r   rx   Zml_collectionsr   numpyr'   rf   Zunicore.datar   r	   Zunicore.distributedr
   r   r   Zdata.data_opsr   r   Zdata.processr   r   Zdata.process_multimerr   r   r   r   r   ZRotationZTranslationr   Z	OperationZNumpyExampleZTorchExample	getLoggerr   r   r   rm   r"   Zndarrayr+   	lru_cacher   r7   rD   rY   rp   rq   rr   r   r    r    r    r!   <module>   s    



&



8
	
4
 