o
    {qiy                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
m
Z
mZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z m!Z!m"Z" ddlm#Z$ ddlm%Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl1m4Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZL ddlMmNZO ddlPm#ZQ ddlPmRZRmSZS erddlTZUeVeWZXeeg eeFe f f eeeYgef e eFf ZZeeg eeFe f f ef Z[G dd de\Z]G d d! d!eYZ^G d"d# d#eYZ_	$ddd+d,Z`dd/d0ZaG d1d2 d2eDZbdd4d5Zcdd;d<Zddd?d@ZeddBdCZfddKdLZgddRdSZhddUdVZiddXdYZjddcddZkddfdgZlddidjZmdddddkddtduZnddddvddydzZodd{dd~dZpddddvdddZqddddvdddZrdd{dddZs			ddddZtG dd deDddZuejvG dd dZwdddZxdddZydZzdddddddddddZ{dddddddddddZ|dZ}e}e|_ e}~dde{_ dS )z>Utilities for running language models or Chains over datasets.    )annotationsN)datetimetimezone)TYPE_CHECKINGAnyCallableOptionalUnioncast)warn_deprecated)	Callbacks)BaseLanguageModel)BaseMessagemessages_from_dict)
ChatResult	LLMResult)RunnableRunnableConfigRunnableLambdaconfig)utils)EvaluatorCallbackHandlerwait_for_all_evaluators)LangChainTracer)Client)get_git_infoget_langchain_env_var_metadata)EvaluationResultRunEvaluator)run_evaluator)as_runnableis_traceable_function)DatasetDataTypeExampleRunTracerSession)LangSmithError)	HTTPError)	TypedDict)Chain)load_evaluator)EvaluatorTypePairwiseStringEvaluatorStringEvaluator)
evaluation)name_generationprogressc                   @  s   e Zd ZdZdS )InputFormatErrorz(Raised when the input format is invalid.N)__name__
__module____qualname____doc__ r8   r8   n/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/langchain/smith/evaluation/runner_utils.pyr3   K   s    r3   c                   @  s$   e Zd ZdZd	ddZd	ddZdS )

TestResultz1A dictionary of the results of a single test run.returnpd.DataFramec                 C  s.   |   }dd |jD }|jddj|ddS )zReturn quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        c                 S  s"   g | ]}| d s|dv r|qS ))inputs.outputs.	reference>   outputinput)
startswith).0colr8   r8   r9   
<listcomp>b   s    z5TestResult.get_aggregate_feedback.<locals>.<listcomp>all)include   )Zaxis)to_dataframecolumnsZdescribeZdrop)selfZdfZto_dropr8   r8   r9   get_aggregate_feedbackU   s
   z!TestResult.get_aggregate_feedbackc              
   C  sL  zddl }W n ty } zd}t||d}~ww g }g }| d  D ]z\}}|d }|d}	t|	tr@dd |	 D }
n|	du rGi }
nd|	i}
i d	d |d
  D |
}d|v rzt|d trt|dd |d  D  n|d |d< |i dd |D |d|d |dd || || q$|j||dS )z#Convert the results to a dataframe.r   NzfPandas is required to convert the results to a dataframe. to install pandas, run `pip install pandas`.resultsfeedbackr@   c                 S     i | ]
\}}d | |qS )r>   r8   rC   kvr8   r8   r9   
<dictcomp>{       z+TestResult.to_dataframe.<locals>.<dictcomp>c                 S  rO   )r=   r8   rP   r8   r8   r9   rS      rT   rA   r?   c                 S  rO   )z
reference.r8   rP   r8   r8   r9   rS      rT   c                 S  s   i | ]
}d |j  |jqS )z	feedback.)keyZscore)rC   fr8   r8   r9   rS      rT   Errorexecution_timerun_id)errorrX   rY   )index)	pandasImportErroritemsget
isinstancedictupdateappendZ	DataFrame)rK   pdemsgindicesrecords
example_idresultrN   Zoutput_r@   rr8   r8   r9   rI   j   sR   



zTestResult.to_dataframeN)r;   r<   )r4   r5   r6   r7   rL   rI   r8   r8   r8   r9   r:   R   s    
r:   c                      s,   e Zd ZdZd fdd	ZdddZ  ZS )	EvalErrorz"Your architecture raised an error.rW   BaseExceptionkwargsr   r;   Nonec                   s   t  jdd|i| d S )NrW   r8   )super__init__)rK   rW   rn   	__class__r8   r9   rq      s   zEvalError.__init__namestrc              
   C  s:   z| | W S  t y } zd| d}t||d }~ww )Nz%'EvalError' object has no attribute '')KeyErrorAttributeError)rK   rt   re   rf   r8   r8   r9   __getattr__   s   

zEvalError.__getattr__)rW   rm   rn   r   r;   ro   )rt   ru   r;   r   )r4   r5   r6   r7   rq   ry   __classcell__r8   r8   rr   r9   rl      s    rl   <my_dataset>llm_or_chain_factoryMODEL_OR_CHAIN_FACTORYdataset_nameru   r;   MCFc                   sX  t | tr+|   jj}| jdur% jjj}d| d| d| d}t| fddS t | tr2| S t | tr?| fddS t| rt	| rTt
tt| fd	dS z|  }W n# ty|   tt| }t|}td
| t|fdd Y S w tt| t |tr|S t	tt|rt
tt|fddS t |tsfddS S | S )zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.Na$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                         S Nr8   r8   )chainr8   r9   <lambda>       z(_wrap_in_chain_factory.<locals>.<lambda>c                     r   r   r8   r8   )lcfr8   r9   r      r   c                     r   r   r8   r8   	runnable_r8   r9   r      r   z'Wrapping function %s as RunnableLambda.c                     r   r   r8   r8   )wrappedr8   r9   r      r   c                     r   r   r8   r8   r   r8   r9   r      r   c                     s   t  S r   )r   r8   )constructorr8   r9   r          )r`   r+   rs   r4   Zmemory
ValueErrorr   r   callabler"   r!   r
   r   	TypeErrorinspect	signatureloggerinfor   )r|   r~   Zchain_classZmemory_classrf   _modelZ	user_funcsigr8   )r   r   r   r   r   r9   _wrap_in_chain_factory   sV   











r   inputsdict[str, Any]c                 C  sD  | sd}t |g }d| v r)t| d ts#dt| d j }t || d g}ncd| v rRt| d tr?tdd | d D sMdt| d j }t || d }n:t| dkrtt	| 
 }t|tri|g}n#t|trztd	d |D rz|}nd
|  }t |d|  }t |t|dkr|d S dt| d}t |)zGet prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    Inputs should not be empty.promptz"Expected string for 'prompt', got promptsc                 s      | ]}t |tV  qd S r   r`   ru   rC   ir8   r8   r9   	<genexpr>       

z_get_prompt.<locals>.<genexpr>z,Expected list of strings for 'prompts', got rH   c                 s  r   r   r   r   r8   r8   r9   r         z)LLM Run expects string prompt input. Got z5LLM Run expects 'prompt' or 'prompts' in inputs. Got r   z)LLM Run expects single prompt input. Got z	 prompts.)r3   r`   ru   typer4   listrF   lennextitervalues)r   rf   r   Zprompt_r8   r8   r9   _get_prompt   sB   



r   c                   @  s   e Zd ZU dZded< dS )ChatModelInputzVInput for a chat model.

    Parameters:
        messages: List of chat messages.
    zlist[BaseMessage]messagesNr4   r5   r6   r7   __annotations__r8   r8   r8   r9   r     s   
 r   ra   c                 C  s   | sd}t ||  }d| v r|d|d< nt| dkr(tt|  |d< d|v rW|d }t|trAt	dd |D rA|g}t|dkrQt
|d |d< |S d}t |d	|  }t |)
zGet Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    r   r   rA   rH   c                 s  r   r   )r`   ra   r   r8   r8   r9   r   5  r   z _get_messages.<locals>.<genexpr>r   zGBatch messages not supported. Please provide a single list of messages.zMChat Run expects single List[dict] or List[List[dict]] 'messages' input. Got )r3   copypopr   r   r   r   r`   r   rF   r   )r   rf   Z
input_copyZraw_messagesr8   r8   r9   _get_messages   s2   r   first_exampler%   input_mapperOptional[Callable[[dict], Any]]ro   c                 C  s   |r/|| j pi }t|ts+t|trtdd |D s-d| dt| d}t|d S d S z
t| j p5i  W d S  tyd   zt| j pFi  W Y d S  tyc } zd| j  d}t||d }~ww w )Nc                 s  r   r   r`   r   rC   rf   r8   r8   r9   r   R  r   z>_validate_example_inputs_for_language_model.<locals>.<genexpr>zWhen using an input_mapper to prepare dataset example inputs for an LLM or chat model, the output must a single string or a list of chat messages.
Got: 	 of type .zvExample inputs do not match language model input format. Expected a dictionary with messages or a single prompt. Got: z Please update your dataset OR provide an input_mapper to convert the example.inputs to a compatible format for the llm or chat model you wish to evaluate.)	r   r`   ru   r   rF   r   r3   r   r   )r   r   Zprompt_inputrf   Zerr2r8   r8   r9   +_validate_example_inputs_for_language_modelJ  s:   

r   r   r+   c                 C  s   |r8|| j pi }t|j|}t|ts%d| dt| d}t||r6d|j d|  }t|dS | j }t|j|}t	|dkrRt	|jdkrRdS |rcd|j d|  }t|dS )	z<Validate that the example inputs match the chain input keys.zvWhen using an input_mapper to prepare dataset example inputs for a chain, the mapped value must be a dictionary.
Got: r   r   zAMissing keys after loading example using input_mapper.
Expected: z. Got: rH   zExample inputs missing expected chain input keys. Please provide an input_mapper to convert the example.inputs to a compatible format for the chain you wish to evaluate.Expected: N)
r   set
input_keys
differencer`   ra   r   r3   keysr   )r   r   r   Zfirst_inputsZmissing_keysrf   r8   r8   r9   "_validate_example_inputs_for_chainm  sB   
r   examplec                 C  sV   t |trt| | dS | }t |trt| || dS t |tr)td| dS dS )z9Validate that the example inputs are valid for the model.z Skipping input validation for %sN)r`   r   r   r+   r   r   r   debug)r   r|   r   r   r8   r8   r9   _validate_example_inputs  s   


r   exampleslist[Example]r0   "Optional[smith_eval.RunEvalConfig]	data_typer$   Optional[list[RunEvaluator]]c           	      C  s   |r>t | trd\}}d}nd}|  }t |tr|jnd}t |tr%|jnd}t||||d jr7t|d jnd||}|S d}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )r`   r   r+   r   Zoutput_keys_load_run_evaluatorsoutputsr   )	r|   r   r0   r   
run_inputsrun_outputsrun_typer   run_evaluatorsr8   r8   r9   _setup_evaluation  s&   
r   r   smith_eval.RunEvalConfigr   Optional[list[str]]Optional[str]c                 C  n   d }| j r| j }|r||vrtd|| |S |r%t|dkr%|d }|S |d ur5t|dkr5td| |S )NzZInput key %s not in chain's specified input keys %s. Evaluation behavior may be undefined.rH   r   zChain expects multiple input keys: %s, Evaluator is likely to fail. Evaluation behavior may be undefined. Specify an input_key in the RunEvalConfig to avoid this warning.)	input_keyr   warningr   )r   r   r   r8   r8   r9   _determine_input_key  s&   	r   r   c                 C  r   )Nz`Prediction key %s not in chain's specified output keys %s. Evaluation behavior may be undefined.rH   r   zChain expects multiple output keys: %s, Evaluation behavior may be undefined. Specify a prediction_key in the RunEvalConfig to avoid this warning.)prediction_keyr   r   r   )r   r   r   r8   r8   r9   _determine_prediction_key  s&   r   example_outputsc                 C  s\   | j r| j }|r||vrd| d| }t||S |r*t|dkr*tt|}|S d }|S )NzReference key z! not in Dataset example outputs: rH   )reference_keyr   r   r   r   )r   r   r   rf   r8   r8   r9   _determine_reference_key  s   r   eval_configYUnion[smith_eval_config.SINGLE_EVAL_CONFIG_TYPE, smith_eval_config.CUSTOM_EVALUATOR_TYPE]eval_llmOptional[BaseLanguageModel]r   r   r   r   r   c              	   C  sN  t | tr| S t | ttfr!t | tst| } t| |d}| j}	nDt | tjrRd|i|  }
t| j	fi |
}| j	j}	t | tj
rQ| jpF|}| jpK|}| jpP|}nt| rZt| S dt|  }t|t |tr|jr~|d u r~d|	 d| d}t|tjj|||||||	gd}|S t |trd|	 d	}t|d|	 d
}t|)N)r   r   zUnknown evaluator type: zPMust specify reference_key in smith_eval.RunEvalConfig to use evaluator of type z) with dataset with multiple output keys: r   )r   r   r   tagszRun evaluator for z is not implemented. PairwiseStringEvaluators compare the outputs of two different models rather than the output of a single model. Did you mean to use a StringEvaluator instead?
See: https://python.langchain.com/docs/guides/evaluation/string/z is not implemented)r`   r   r-   ru   r,   valuesmith_eval_configZ
EvalConfigZ
get_kwargsZevaluator_typeZSingleKeyEvalConfigr   r   r   r   run_evaluator_decr   r   r/   Zrequires_reference
smith_evalStringRunEvaluatorChainfrom_run_and_data_typer.   NotImplementedError)r   r   r   r   r   r   r   r   Z
evaluator_Zeval_type_tagrn   rf   r    r8   r8   r9   _construct_run_evaluator  sZ   







r   2tuple[Optional[str], Optional[str], Optional[str]]c                 C  s(   t | |}t| |}t| |}|||fS r   )r   r   r   )r   r   r   r   r   r   r   r8   r8   r9   	_get_keysT  s   



r   list[RunEvaluator]c                 C  s   g }d\}}}	| j s| jr!tdd | jD r!t| |||\}}}	| j D ]}
t|
| j||||	||}|| q$| jp<g }|D ]7}t|trL|| q?t|t	ra|t
jj||||||	d q?t|rm|t| q?d| d}t||S )z
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    NNNc                 s  r   r   )r`   r/   )rC   re   r8   r8   r9   r   u  r   z'_load_run_evaluators.<locals>.<genexpr>)r   r   r   zUnsupported custom evaluator: z+. Expected RunEvaluator or StringEvaluator.)
evaluatorscustom_evaluatorsanyr   r   r   rc   r`   r   r/   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r    r   Zcustom_evaluatorrf   r8   r8   r9   r   `  s\   








r   r   	callbacksr   metadatar   r   r   r   r   r   Optional[dict[str, Any]]Union[str, BaseMessage]c                  s   |dur9||}t |tst |tr/tdd |D r/| j|t||p$g |p'i ddI dH S d| d}t|zt|}| j|t||pFg |pIi ddI dH }	W |	S  tyy   t|}
| jd	i |
dt||pjg |pmi diI dH }	Y |	S w )
a  Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc                 s  r   r   r   r   r8   r8   r9   r     r   z_arun_llm.<locals>.<genexpr>r   r   r   r   z%Input mapper returned invalid format 3
Expected a single string or list of chat messages.r   r8   )	r`   ru   r   rF   ainvoker   r3   r   r   )r   r   r   r   r   r   prompt_or_messagesrf   r   
llm_output
llm_inputsr8   r8   r9   	_arun_llm  sZ   
	r   r   r   r   Union[Chain, Runnable]Union[dict, str]c          
        s   |du r|n||}t | tr;t |tr;t|dkr;| jr;tt| }| j|t	||p.g |p1i ddI dH }|S t	|p?g ||pCi d}	| j||	dI dH }|S )z%Run a chain asynchronously on inputs.NrH   r   r   r   r   r   )
r`   r+   ra   r   r   r   r   r   r   r   
r   r   r   r   r   r   Zinputs_valr@   runnable_configr8   r8   r9   _arun_chain  s4   
r   )r   r   'Union[dict, str, LLMResult, ChatResult]c          	   
     s   t |trdnd}d}z<t |tr*t|| jpi |d |d ||ddI dH }n| }t|| jp3i |d |d ||ddI dH }|}W |S  tyk } ztd|| j	| j| t
|d	}W Y d}~|S d}~ww )
a  Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    LLMr+   Nr   r   r   r   z*%s failed for example %s with inputs %s
%srW   )r`   r   r   r   r_   r   	Exceptionr   r   idrl   )	r   r   r|   r   chain_or_llmrj   r@   r   re   r8   r8   r9   _arun_llm_or_chain  sJ   
	
r  c                C  s   |dur7||}t |tst |tr-tdd |D r-| j|t||p#g |p&i dd}|S d| d}t|zt|}	| j|	t||pDg |pGi dd}W |S  tyn   t|}
| jd
i |
dt||pei d	i}Y |S w )a  
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc                 s  r   r   r   r   r8   r8   r9   r   g  r   z_run_llm.<locals>.<genexpr>r   r   z'Input mapper returned invalid format:  r   r   )r   r   r8   )	r`   ru   r   rF   invoker   r3   r   r   )r   r   r   r   r   r   r   r   rf   Zllm_promptsr   r8   r8   r9   _run_llmJ  sR   
 
r  c          
      C  s   |du r|n||}t | tr7t |tr7t|dkr7| jr7tt| }| j|t	||p-g |p0i dd}|S t	|p;g ||p?i d}	| j||	d}|S )zRun a chain on inputs.NrH   r   r   r   )
r`   r+   ra   r   r   r   r   r   r  r   r   r8   r8   r9   
_run_chain  s2   
r	  c          
      C  s   t |trdnd}d}z6t |tr&t|| jpi |d |d ||dd}n| }t|| jp/i |d |d ||dd}|}W |S  tyj } zt|j}	t	
d|| j| j|	| t|d	}W Y d}~|S d}~ww )
a  
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    r  r+   Nr   r   r   r   zC%s failed for example %s with inputs %s
Error Type: %s, Message: %sr  )r`   r   r  r   r_   r	  r  r   r4   r   r   r  rl   )
r   r   r|   r   r  rj   r@   r   re   Z
error_typer8   r8   r9   _run_llm_or_chain  sL   
	
r
  clientr   project_nameproject_metadatadataset_versionOptional[Union[str, datetime]]1tuple[MCF, TracerSession, Dataset, list[Example]]c              
   C  sj  t ||}| j|d}t| j|j|d}	|	s!d| d}
t|
dd |	D }|r.t|nd }|r6| nd }z'|p<i }t }|rJi |d|i}||d< | j	||j|rYd	|ini |d
}W n3 t
ttfy } z$dt|vrq t }d| d| d| d}d| d| }
t|
|d }~ww |jd|j  }td| d| d| d|j dd ||||	fS )N)r~   )Z
dataset_idZas_ofzDataset z has no example rows.c                 S  s   g | ]}|j r|j qS r8   )modified_at)rC   exr8   r8   r9   rE     s    z%_prepare_eval_run.<locals>.<listcomp>gitr  r   )Zreference_dataset_idZproject_extrar   zalready exists z+
run_on_dataset(
    ...
    project_name="z - z", # Update since z already exists
)
zTest project z/ already exists. Please use a different name:

z/compare?selectedSessions=z)View the evaluation results for project 'z' at:
z

View all tests for Dataset z at:
T)flush)r   Zread_datasetr   Zlist_examplesr  r   max	isoformatr   Zcreate_projectr)   r(   ru   uuiduuid4urlprint)r  r~   r|   r  r  r   r  wrapped_modeldatasetr   rf   r  Zmax_modified_atZinferred_versionZgit_infoprojectre   uidZexample_msgZcomparison_urlr8   r8   r9   _prepare_eval_run  sn   
	

r  c                   @  s*   e Zd ZU dZded< ded< ded< dS )	
_RowResultz5A dictionary of the results for a single example row.z Optional[list[EvaluationResult]]rN   zOptional[float]rX   r   rY   Nr   r8   r8   r8   r9   r   *  s
   
 r   F)totalc                   @  s   e Zd ZU dZded< ded< ded< ded	< d
ed< dZded< d>ddZd?ddZd@ddZdAd!d"Z		#dBdCd&d'Z
e				(			dDdEd<d=ZdS )F_DatasetRunContainerz3A container to help manage the state of a eval run.r   r  r'   r  r   r  r   r   zlist[RunnableConfig]configsNz6Optional[list[smith_eval_config.BATCH_EVALUATOR_LIKE]]batch_evaluatorsbatch_resultsr   all_eval_resultsdict[str, _RowResult]r;   ra   c                 C  s   i }t | j|D ]M\}}tt|t|ji }|j|dg |d|dd|t|j< t|t	r?|j
|t|j d< n	||t|j d< |jrU|j|t|j d< q|S )NrN   rX   rY   )rA   rN   rX   rY   rW   r@   r?   )zipr   r
   r   r_   ru   r  r   r`   rl   rW   r   )rK   r%  r&  rM   r   r@   Z
row_resultr8   r8   r9   _merge_test_outputs=  s   

z(_DatasetRunContainer._merge_test_outputsrunsdict[str, Run]
list[dict]c              
     s   | j }|sg S  fdd| jD }g }tj J}|D ]>}z+||| j}t|tr-| }|t	t| |j
| jjfi |d | jjd W q tyY   tdt| Y qw W d    |S 1 sew   Y  |S )Nc                   s   g | ]	} t |j qS r8   )ru   r  rC   r   r*  r8   r9   rE   W  s    z>_DatasetRunContainer._run_batch_evaluators.<locals>.<listcomp>)rY   Z
project_idz Error running batch evaluator %s)r$  r   
concurrentfuturesThreadPoolExecutorr`   r   ra   rc   r
   submitr  Zcreate_feedbackr  r  r  r   	exceptionrepr)rK   r*  r   Z	runs_listaggregate_feedbackexecutorZ	evaluatorrj   r8   r.  r9   _run_batch_evaluatorsS  s<   

z*_DatasetRunContainer._run_batch_evaluators,tuple[dict[str, _RowResult], dict[str, Run]]c                 C  s   i }i }| j D ]d}tt|d D ]Z}t|tr3|j}| D ]\\}}}|t|i 	d|i qqt|t
rj|j}	|	rH|	jrH|	j|	j  nd }
|	rQt|	jnd }|t|ji 	|
||	d |	|t|j< qqttttf ||fS )Nr   rN   )rX   rY   run)r#  r
   r   r`   r   Zlogged_eval_resultsr^   
setdefaultru   rb   r   Z
latest_runend_time
start_timetotal_secondsr  ri   ra   r   )rK   r&  all_runsccallbackZeval_results_ri   rR   r9  rX   rY   r8   r8   r9   _collect_metricsl  s<   


z%_DatasetRunContainer._collect_metrics-list[Union[dict, str, LLMResult, ChatResult]]r:   c                 C  sX   t d t  |  \}}d }| jrt d | |}| ||}t| jj	||dS )Nz#Waiting for evaluators to complete.zRunning session evaluators.)r  rM   Zaggregate_metrics)
r   r   r   rB  r$  r7  r)  r:   r  rt   )rK   r%  r&  r>  r5  rM   r8   r8   r9   _collect_test_results  s   


z*_DatasetRunContainer._collect_test_resultsFverboseboolc              
   C  s   |  |}|r,z
| }t| W n ty+ } ztjd|dd W Y d }~nd }~ww z| jj| jj	t
tjd W |S  tyX } ztjd|dd W Y d }~|S d }~ww )Nz&Failed to print aggregate feedback: %sT)exc_info)r;  zFailed to close project: %s)rD  rL   _display_aggregate_resultsr  r   r   r  Zupdate_projectr  r  r   nowr   utc)rK   r%  rE  rM   Zagg_feedbackre   r8   r8   r9   finish  s(   

z_DatasetRunContainer.finish   r~   ru   r|   r}   r  r   r0   r   r   r   r   r   concurrency_levelintr  r   revision_idr  Optional[Union[datetime, str]]c              	     s  |pt  }|
r|	si }	|	d|
i t ||||	|d\}}}p%g jdp-i  D ]\}}d| d|  q0djd i|
rM|
d< t|}t	||||j
pZtjt|d || tt| fdd	|D }|  ||||r|jd
S d d
S )NrO  )r  r   r  r  zgit:=r  r   c              
     sB   g | ]}t tj |jd tpg  |jddgdqS ))r  r  ri   r   )r   r  ri   max_concurrency)r   r   rR  r   )r   r   rt   r  r   r-  r  rM  progress_barr  r   Zrun_metadatar   r8   r9   rE     s*    z0_DatasetRunContainer.prepare.<locals>.<listcomp>)r  r  r  r   r#  r$  )r1   Zrandom_namerb   r  r   r_   r^   rc   r   r   r   r$   kvr   r2   ZProgressBarCallbackr   r$  )clsr  r~   r|   r  r0   r   r   rM  r  rO  r  r  r  r   rQ   rR   r#  r8   rS  r9   prepare  sT   	
z_DatasetRunContainer.prepare)r%  r   r&  r'  r;   ra   )r*  r+  r;   r,  )r;   r8  )r%  rC  r;   r:   )F)r%  r   rE  rF  r;   r:   )NNNrL  NNN)r  r   r~   ru   r|   r}   r  r   r0   r   r   r   r   r   rM  rN  r  r   rO  r   r  rP  r;   r"  )r4   r5   r6   r7   r   r$  r)  r7  rB  rD  rK  classmethodrW  r8   r8   r8   r9   r"  2  s.   
 



r"  rF  c                  C  sD   zddl m}  |  }|  d uodtt|v W S  ty!   Y dS w )Nr   )get_ipythonZzmqshellF)ZIPython.core.getipythonrY  ru   r   r]   )rY  resr8   r8   r9   _is_jupyter_environment  s   r[  aggregate_resultsr<   c                 C  sT   t  rddlm}m} ||d ||  d S | jdd dd}td t| d S )	Nr   )HTMLdisplayz<h3>Experiment Results:</h3>c                 S  s   | dS )Nz.2fr8   )xr8   r8   r9   r     r   z,_display_aggregate_results.<locals>.<lambda>right)Zfloat_formatjustifyz
 Experiment Results:)r[  IPython.displayr]  r^  Z	to_stringr  )r\  r]  r^  Zformatted_stringr8   r8   r9   rH    s   rH  a  The input_mapper argument is deprecated and will be removed in a future release. Please add a  RunnableLambda to your chain to map inputs to the expected format instead. Example:
def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)rL  )r0   r  rM  r  r  rE  rO  Optional[Client]rP  rM  rN  rE  rO  rn   r   c                  s   |
 dd }|rtdtdd |	d u rt d}	|
 dd }|r)tdddd |
r8tdd	|
  d
dd | p<t } tj| |||||||||	|d}t	j
|jd dgttjt|j|d|j|jR  I d H }|j||dS )Nr   0.0.305TmessagependingrO  r   0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   rf  Zremovalr  rO  r  r   rR  r|   r   rE  )r   r   _INPUT_MAPPER_DEP_WARNINGr   r_   r   r   r"  rW  runnable_utilsZgather_with_concurrencyr#  map	functoolspartialr  r  r   rK  )r  r~   r|   r0   r  rM  r  r  rE  rO  rn   r   r   	containerr%  r8   r8   r9   arun_on_dataset'  sb   
ru  c                  s"  |
 dd rtdtdd |
 dd }|rtdddd |	d u r(t d}	|
r7tdd	|
  d
dd | p;t } tj| ||||||||	|d |dkr` fddt	 j
 jD }n*t jd }t|tjt jd j
 j}W d    n1 sw   Y   j||dS )Nr   rd  Tre  r   rh  ri  rO  rj  r   rk  rl  r   c                   s"   g | ]\}}t || jd qS )rm  )r
  r  )rC   r   r   rt  r   r8   r9   rE     s    z"run_on_dataset.<locals>.<listcomp>rm  rn  )r   r   ro  r   r_   r   r   r"  rW  r(  r   r#  r   Zget_executor_for_configr   rq  rr  rs  r
  r  rK  )r  r~   r|   r0   r  rM  r  r  rE  rO  rn   r   r%  r6  r8   rv  r9   run_on_dataseth  sh   

rw  a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = smith_eval.RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = smith_eval.RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()r{   )r|   r}   r~   ru   r;   r   )r   r   r;   ru   )r   r   r;   ra   )r   r%   r   r   r;   ro   )r   r%   r   r+   r   r   r;   ro   )r   r%   r|   r   r   r   r;   ro   )
r|   r   r   r   r0   r   r   r$   r;   r   )r   r   r   r   r;   r   )r   r   r   r   r;   r   )r   r   r   r   r;   r   )r   r   r   r   r   ru   r   r$   r   r   r   r   r   r   r   r   r;   r   )
r   r   r   r   r   r   r   r   r;   r   )r   r   r   ru   r   r$   r   r   r   r   r   r   r;   r   )r   r   r   r   r   r   r   r   r   r   r   r   r;   r   )r   r   r   r   r   r   r   r   r   r   r   r   r;   r   )
r   r%   r   r   r|   r   r   r   r;   r   )r   r   r   r   r   r   r   r   r   r   r   r   r;   r   r   )r  r   r~   ru   r|   r}   r  ru   r  r   r   r   r  r  r;   r  )r;   rF  )r\  r<   r;   ro   )r  rc  r~   ru   r|   r}   r0   r   r  rP  rM  rN  r  r   r  r   rE  rF  rO  r   rn   r   r;   r   )r7   
__future__r   concurrent.futuresr/  dataclassesrr  r   loggingr  r   r   typingr   r   r   r   r	   r
   Zlangchain_core._apir   Zlangchain_core.callbacksr   Zlangchain_core.language_modelsr   Zlangchain_core.messagesr   r   Zlangchain_core.outputsr   r   Zlangchain_core.runnablesr   r   r   r   r   r   rp  Z!langchain_core.tracers.evaluationr   r   Z langchain_core.tracers.langchainr   Zlangsmith.clientr   Zlangsmith.envr   r   Zlangsmith.evaluationr   r   r    r   Zlangsmith.run_helpersr!   r"   Zlangsmith.schemasr#   r$   r%   r&   r'   Zlangsmith.utilsr(   requestsr)   Ztyping_extensionsr*   Zlangchain.chains.baser+   Zlangchain.evaluation.loadingr,   Zlangchain.evaluation.schemar-   r.   r/   Zlangchain.smithr0   r   Zlangchain.smith.evaluationr   r1   r2   r\   rd   	getLoggerr4   r   ra   r}   r   r  r3   r:   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r
  r  r   	dataclassr"  r[  rH  ro  ru  rw  Z_RUN_ON_DATASET_DOCSTRINGreplacer8   r8   r8   r9   <module>   s     	
H
=1


*
#
(





E
KI)@G)@? 
K

FMk
