o
    rqiA@                     @   sf  d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZ ddlmZ G dd	 d	eZ	
d,ddZdefddZdededededef
ddZdedefddZdedefddZ	
	d-dedededeeef d ee d!ededefd"d#Z	$	
		
	d.dedededeeef d ee d%ed&ed'ed!ededefd(d)Zd*d+ ZdS )/    N)AnyDictList)ModeKeys   )OfaBasePreprocessor)get_database_matches)dump_db_json_schemac                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )OfaTextToSqlPreprocessorz0
    OFA preprocessor for text to sql tasks
    c                    sl   t t| j|||g|R i | | jjdd| _| jdd| _d| _i | _	t
jt
j|d| _dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        promptz . generating sql code.max_struct_length   	databaseN)superr
   __init__cfgmodelgetinstruction_textr   	separatordb_schema_cacheospathjoinabspathdatabase_path)selfr   Z	model_dirmodeargskwargs	__class__ l/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/modelscope/preprocessors/ofa/text2sql.pyr      s   

z!OfaTextToSqlPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S N)r   r   ZTRAIN_build_train_sample_build_infer_sample)r   r%   r#   r#   r$   __call__-   s   

z!OfaTextToSqlPreprocessor.__call__c                 C   sl  d| j v r	d|v sJ d|| j d  }|| j}t|dks$J d|\}}}|| jvrAt| jd | d | d || j|< d|  d| j	 }t
|||| j| j| | jjd	}|d
 }|d }|d }	|d }
d||| j }| || j }|d| j	| j d  }| jd|	dddd| j }t|| jg}t| j|g}d||||
d}|S )a  
        build sample for training tasks.

        step 1. Get the input question and database id from text input
        step 2. Get the database structure input
        step 3. Add a pseudo ids for every input.
        step 4. Calculate the target and previous output items.
        text;there must be `text` column in task key map and source data   z=invalid input, should contain query, question and database id/.sqlite NT	struct_intext_inseq_out	db_struct{} ; structured knowledge: {}    {}F)Zadd_bosZadd_eos        )idsourcetargetZprev_output_tokensr4   )
column_mapsplitr   lenr   r	   r   r   stripmax_src_lengthseq2seq_inputr   r   formatr   tokenize_textr   Zmax_tgt_lengthtorchcatZeos_itemZbos_item)r   r%   r+   Ztextsqueryquestiondb_id
seq_inputsr1   r3   r4   src_itemZtgt_itemZtarget_itemZprev_output_itemsampler#   r#   r$   r(   3   sf   	



z,OfaTextToSqlPreprocessor._build_train_samplec           	      C   s6  d| j v r	d|v sJ d|| j d  }|| j d d}| }|| jvr9t| jd | d | d || j|< d|  d| j }t	d||| j| j| | j
j}|d	 }|d
 }d||| j }| || j }|d| j| j d  }d||d}d| j v r| j d |v rd|| j d  |d< |S )z
        build sample for inference tasks.

        step 1. Get the input question and database id from text input
        step 2. Get the database structure input
        step 3. Add a pseudo ids for every input.
        r+   r,   r   Zculture_companyr.   r/   r0   Nr1   r4   r5   r6   r8   )r9   r:   r4   Zsolutionr7   label)r<   r   r?   r   r	   r   r   r=   r@   rA   r   r   rB   r   rC   r   )	r   r%   r+   rH   rI   r1   r4   rJ   rK   r#   r#   r$   r)   k   sD   

z,OfaTextToSqlPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   Z	INFERENCEr   r   strr   r*   r(   r)   __classcell__r#   r#   r!   r$   r
      s    ""*8r
   Fc           
      C   sN   t | ||||}t||d  }|s|||dS t||\}}	|||	|dS )Nserialized_schema)r1   r2   r4   )r1   r2   r3   r4   )form_input_for_constructionspider_add_serialized_schemar?   spider_pre_process_one_function)
rF   rG   rH   db_pathschemar   Zis_trainexrS   r3   r#   r#   r$   rA      s&   rA   itemc                 C   s2   d}t | d | d d|jd}|| d   |fS )N rF   rH   T)rF   rH   normalize_querytarget_with_db_idrG   )spider_get_targetr]   r?   )rZ   r   prefixr3   r#   r#   r$   rV      s   rV   rF   rH   r\   r]   r&   c                 C   s.   |rt ndd }|r| d||  S || S )Nc                 S   s   | S r'   r#   )xr#   r#   r$   <lambda>   s    z#spider_get_target.<locals>.<lambda>z | )	normalize)rF   rH   r\   r]   
_normalizer#   r#   r$   r^      s   r^   c                 C   s(   dd }dd }dd }|||| S )Nc                 S   s   |  ddS )N , , )replacesr#   r#   r$   	comma_fix   s   znormalize.<locals>.comma_fixc                 S   s   d |  S )Nr0   )r   r=   rg   r#   r#   r$   white_space_fix   s   z"normalize.<locals>.white_space_fixc                 S   s   t ddd | S )Nz\b(?<!['\"])(\w+)(?!['\"])\bc                 S   s   |  d S )Nr   )grouplower)matchr#   r#   r$   ra      s    z*normalize.<locals>.lower.<locals>.<lambda>)resubrg   r#   r#   r$   rl      s   znormalize.<locals>.lowerr#   )rF   ri   rj   rl   r#   r#   r$   rb      s   rb   rY   c                 C   s   t |dr%t| d | d | d | d | d | d | d |jd	d
	}d|iS t| d | d | d | d | d ddd	|jd	d
}d|iS )NZschema_serialization_with_nlrG   rW   rH   db_column_namesdb_table_namesdb_primary_keysdb_foreign_keysT)	rG   rW   rH   rp   rq   rr   rs   $schema_serialization_with_db_contentr\   peteshawF)
rG   rW   rH   rp   rq   schema_serialization_typeschema_serialization_randomizedschema_serialization_with_db_idrt   r\   rS   )getattr!serialize_schema_natural_languagert   serialize_schema)rY   r   rS   r#   r#   r$   rU      s8   
rU   TrG   rW   rp   rq   rt   c	           %         s  | dd  fdd|D  d}	dd }
dd	 }d
d }dd }|d }tt|d |d }|	g}g }g }d}t|D ]\}} rH| n|}|| g }g }g }tt|d |d D ]J\}\}}|dkrkq` rq| n|}|| ||kr|| ||v r|| |rt| |||d | d | d d}|r||| |f q`|||}|| |
d |}|| t|dkr||}|| q>|D ](\}}||d |  } || }!||d |  }"|| }#|| |!|"|#}$||$ qd |S )Nz contains tables such as re   c                    s   g | ]
} r
|  n|qS r#   )rl   ).0namer\   r#   r$   
<listcomp>      z5serialize_schema_natural_language.<locals>.<listcomp>.c                 S   s
   |  dS )Nz is the primary key.r#   )Zprimary_keyr#   r#   r$   &table_description_primary_key_template  s   
zQserialize_schema_natural_language.<locals>.table_description_primary_key_templatec                 S   s   d|  dd | dS )NzTable z has columns such as re   r   r   )r}   Zcolumn_namesr#   r#   r$   table_description  s   z<serialize_schema_natural_language.<locals>.table_descriptionc                 S   s   d dd | D  S )Nr[   c                 S   s   g | ]
\}}d  ||qS )z"The {} contains values such as {}.)rB   )r|   columnvaluer#   r#   r$   r   
  r   zPserialize_schema_natural_language.<locals>.value_description.<locals>.<listcomp>r   )Zcv_pairsr#   r#   r$   value_description	  s   z<serialize_schema_natural_language.<locals>.value_descriptionc              	   S   s   d| d|  d| d| d	S )NzThe z of z is the foreign key of r   r#   )Ztable_1Zcolumn_1Ztable_2Zcolumn_2r#   r#   r$   foreign_key_description  s   zBserialize_schema_natural_language.<locals>.foreign_key_description	column_idother_column_idtable_idcolumn_namer   r.   r/   rG   
table_namer   rW   r0   )r   listzip	enumeraterl   appendr   r>   )%rG   rW   rH   rp   rq   rr   rs   rt   r\   Zoverall_descriptionr   r   r   r   ZdescriptionsZdb_table_name_strsZdb_column_name_strs	value_sepr   r   Ztable_name_strcolumnsZcolumn_value_pairsprimary_keysr   r`   yZ
column_strmatchesZtable_description_columns_strZ!table_description_primary_key_strZvalue_description_strZx_table_nameZx_column_nameZy_table_nameZy_column_nameZforeign_key_description_strr#   r~   r$   rz      s   







rz   ru   rv   rw   rx   c
              	      s   |dkrd}
d}d
d dddn|dkr&d	}
d
}d
d dddnt dtdtdtf	fdd 
fddt|D }|rRt| |ra|
jd|| }|S ||}|S )NverbosezDatabase: {db_id}. z. z"Table: {table}. Columns: {columns}re   z{column} ({values})z{column}ru   z
 | {db_id}r[   z | {table} : {columns}rd   z{column} ( {values} )r   r   r&   c                    sh   r|  n|}r.t| |d  d  d d}|r( j||dS j|dS j|dS )Nr.   r/   r   )r   values)r   )rl   r   rB   r   )r   r   Zcolumn_name_strr   )column_str_with_valuescolumn_str_without_valuesrH   rW   r\   rG   rt   r   r#   r$   get_column_strk  s"   

z(serialize_schema.<locals>.get_column_strc                    s\   g | ]*\ j r ntfd dt fddtd d dqS )c                    s    | d dS )Nr   )r   r   r#   r   )r   r   r#   r$   ra     s    z-serialize_schema.<locals>.<listcomp>.<lambda>c                    s   | d  kS )Nr   r#   r   )r   r#   r$   ra     s    r   r   )tabler   )rB   rl   r   mapfilterr   )r|   )
column_seprp   r   r\   	table_str)r   r   r$   r   ~  s$    
z$serialize_schema.<locals>.<listcomp>)rH   )NotImplementedErrorrQ   r   randomshufflerB   r   )rG   rW   rH   rp   rq   rv   rw   rx   rt   r\   Z	db_id_strZ	table_sepZtablesrS   r#   )r   r   r   rp   rH   rW   r   r\   rG   rt   r   r   r$   r{   K  s@   (

r{   c                 C   sr   | ||||d dd |d D dd |d D d|d dd |d	 D d
d |d D dd |d D dd	S )NZtable_names_originalc                 S      g | ]\}}|qS r#   r#   r|   r   r   r#   r#   r$   r         z/form_input_for_construction.<locals>.<listcomp>Zcolumn_names_originalc                 S      g | ]\}}|qS r#   r#   r   r#   r#   r$   r     r   )r   r   Zcolumn_typesc                 S   s   g | ]}d |iqS )r   r#   )r|   r   r#   r#   r$   r     s    r   c                 S   r   r#   r#   r|   r   r   r#   r#   r$   r     r   Zforeign_keysc                 S   r   r#   r#   r   r#   r#   r$   r     r   )r   r   )	rF   rG   rH   rW   rq   rp   Zdb_column_typesrr   rs   r#   )rF   rG   rH   rW   rX   r#   r#   r$   rT     s0   rT   )F)FT)ru   FTFT)r   r   rn   typingr   r   r   rD   Zmodelscope.utils.constantr   baser   Zutils.bridge_content_encoderr   Zutils.get_tablesr	   r
   rA   dictrV   rQ   boolr^   rb   rU   rz   r{   rT   r#   r#   r#   r$   <module>   s    


'
	

\
	

N