o
    0 i                     @   sx  d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	 e	
eZe r*d dlZdd Z			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZeeeeeedZ		d3dedededee d ee f
d!d"Zd2d	ed ee fd#d$Zd2d	ed ee fd%d&Zd2d	ed ee fd'd(Zd2d	ed ee fd)d*Zd2d	ed ee fd+d,Z d2d	ed ee fd-d.Z!eeeee e!dZ"d2d	ed ee fd/d0Z#dS )4    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                    s,   dd dd  t  fdd}|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 S   s   t |d }t| jdr| jj}n| jj}||kr8t| ds-| j| j||d d\| _}| jd| jdd dS | j	
|| _	| jd| j	dd dS )	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr	   max_position_embeddingsrope_init_fnr
   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r	   _ r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update+   s   

z6dynamic_rope_update.<locals>.longrope_frequency_updatec                 S   s   t |d }|| jkr#| j| j||d\}| _| jd|dd || _|| jk rD| j| jkrF| j	|| _| jd| jdd | j| _dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   Zmax_seq_len_cachedr   r   Zattention_scalingr   Zoriginal_max_seq_lenr   r   )r   r   r   r   r   r   r   r   dynamic_frequency_update>   s   
z5dynamic_rope_update.<locals>.dynamic_frequency_updatec                    sB   d| j v r | ||jd n| j dkr| ||jd | ||S )Ndynamic)r   longrope)	rope_typer   )r   xr   r    r   rope_forwardr   r   wrapperQ   s
   

z$dynamic_rope_update.<locals>.wrapperr   )r&   r'   r   r%   r   dynamic_rope_update   s
   r(   r   r   ztorch.devicer   returnztorch.Tensorc           	      C   sn   | j }t| dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj|tj	d|   }||fS )	a  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r/   )

rope_thetagetattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser*   r,   dimattention_factorr   r   r   r    _compute_default_rope_parameters\   s   ,r<   c                 C   s*   | j d }t| ||\}}|| }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingr<   )r   r   r   r=   r   r;   r   r   r   '_compute_linear_scaling_rope_parameters   s   
r?   c                 C   s   | j }t| dd}t| d| j| j }t|| }| j}| jd }d}	|du r*|}nt|tj	r?t
|tj||j|jd}nt||}||| | |d  ||d    }d|tjd	|dtjd
j|tjd|   }
|
|	fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r*   r+   r,   r=   Nr/   r   r   r-   r   r.   r0   )r1   r2   r3   r4   r5   r   r>   
isinstancer   ZTensormaximumtensorr/   r   r   r6   r7   r   r8   )r   r   r   r9   r*   r,   r:   r   r=   r;   r   r   r   r   _compute_dynamic_ntk_parameters   s$   *

$,rD   c                    s  | j }t| dd}t| d| j| j }t|| }| jd }| jd}| jd}	| jd}
| jdp8| j}dd
d}|du rW|	rS|
rSt|||	|||
 }n||}| jdp^d}| jdpfd	}dd   fdd}dd }|t	
d|dj|t	jd|  }d| }d||  }| jdd}|||||||\}}d	||||d j|t	jd }|d	|  ||  }||fS )ak  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r*   r+   r,   r=   r;   mscalemscale_all_dimr	   r   c                 S   s"   | dkrdS d| t |  d S )Nr   r+   g?)mathlog)scalerE   r   r   r   
get_mscale:  s   z,_compute_yarn_parameters.<locals>.get_mscaleN	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotationsr-   )rG   rH   pi)Znum_rotationsr:   r9   r   r   r   r   find_correction_dimL  s   *z5_compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|rt |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   r   )rG   floorceilr   min)Zlow_rotZhigh_rotr:   r9   r   truncatelowhighrO   r   r   find_correction_rangeP  s   

z7_compute_yarn_parameters.<locals>.find_correction_rangec                 S   s>   | |kr|d7 }t j|t jd|  ||   }t |dd}|S )NgMbP?r.   r   r   )r   r6   float32clamp)rR   r   r:   Zlinear_funcZ	ramp_funcr   r   r   linear_ramp_factorY  s
   z4_compute_yarn_parameters.<locals>.linear_ramp_factorr   r-   r0   rS   T)r   )r1   r2   r3   r4   r5   r>   getr   r8   r   r6   r   )r   r   r   r9   r*   r,   r:   r=   r;   rE   rF   r	   rJ   rK   rM   rW   rZ   Z	pos_freqsZinv_freq_extrapolationZinv_freq_interpolationrS   rT   rU   Zinv_freq_extrapolation_factorr   r   rV   r   _compute_yarn_parameters   s>   8

	"
 
r\   c                 C   s  | j }t| dd}t| d| j| j }t|| }| jd }| jd }| jd}	| jd}
t| dd	 }r=| j| }	n| j}|
d	u rZ|	dkrKd}
nt	d
t
|	t
|  }
|rj||krjtj|tj|d}n	tj|tj|d}tjd|dtj|d | }d|||   }||
fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r*   r+   r,   long_factorshort_factorr=   r;   r	   Nr   r@   r   r-   )r1   r2   r3   r4   r5   r>   r[   r   rG   sqrtrH   r   rC   rX   r6   r7   r8   )r   r   r   r9   r*   r,   r:   r]   r^   r=   r;   r	   Zext_factorsZinv_freq_shaper   r   r   r   _compute_longrope_parameterss  s*   /

r`   c                 C   s   t | ||\}}| jd }| jd }| jd }| jd }|| }	|| }
dtj | }t||	k|| |}|| | ||  }d| | | ||  }||
k  ||	k  }t|||}||fS )ap
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r=   low_freq_factorhigh_freq_factorr	   r-   r   )r<   r>   rG   rN   r   where)r   r   r   r   r;   r=   ra   rb   Zold_context_lenZlow_freq_wavelenZhigh_freq_wavelenZwavelenZinv_freq_llamaZsmooth_factorZsmoothed_inv_freqZis_medium_freqr   r   r   _compute_llama3_parameters  s   *



rd   )defaultZlinearr!   yarnr"   Zllama3r#   received_keysrequired_keysoptional_keysignore_keysc                 C   s   d|v r|dh8 }| d |dur||8 }|| }|r&td|  d| |dur1|| | }n|| }|rDtd|  d|  dS dS )zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper#   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r#   rg   rh   ri   rj   Zmissing_keysZunused_keysr   r   r   _check_received_keys  s   	

rp   c                 C   s@   | j }|d|dd }dh}t| }t||||d d S )Nr#   rk   rj   )r>   r[   setkeysrp   )r   rj   r>   r#   rh   rg   r   r   r   !_validate_default_rope_parameters0  s
   rt   c                 C   sx   | j }|d|dd }ddh}t| }t||||d |d }|d u s0t|tr0|dk r:td|  d S d S )Nr#   rk   r=   rq   r+   8`rope_scaling`'s factor field must be a float >= 1, got 	r>   r[   rr   rs   rp   rA   r8   rn   ro   )r   rj   r>   r#   rh   rg   r=   r   r   r   (_validate_linear_scaling_rope_parameters8  s   rw   c                 C   s   | j }|d|dd }ddh}dh}t| }t|||||d |d }|d u s4t|tr4|dk r>td|  d S d S )Nr#   rk   r=   r	   rq   r+   ru   rv   )r   rj   r>   r#   rh   ri   rg   r=   r   r   r   )_validate_dynamic_scaling_rope_parametersD  s   rx   c              	   C   s  | j }|d|dd }ddh}h d}t| }t|||||d |d }|d u s5t|tr5|dk r=td|  |d}|d urWt|trO|d	k rWtd
|  |d}	|	d urmt|	tsmtd|	  |d}
|
d urt|
tstd|
  |	pd|
pdk rtd|	 d|
 d | j d}|d ur| j	| }||krt
d| d| d| d d S d S t
d d S )Nr#   rk   r=   >   rS   rK   rM   r;   rE   r	   rF   rq   r+   ru   r;   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rK   z6`rope_scaling`'s beta_fast field must be a float, got rM   z6`rope_scaling`'s beta_slow field must be a float, got rL   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r	   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)r>   r[   rr   rs   rp   rA   r8   rn   ro   r   warning_once)r   rj   r>   r#   rh   ri   rg   r=   r;   rK   rM   r	   Zimplicit_factorr   r   r   _validate_yarn_parametersR  sR   	



r{   c                 C   s  | j }|d|dd }h d}h d}t| }t|||||d t| dd}t| d| j| j }t|| }	|d	}
t	|
t
sUtd
d |
D rUtd|
  t|
|	d krltd|	d  dt|
  |d}t	|t
stdd |D rtd|  t||	d krtd|	d  dt|  t| drtd d S |d}|d u rtd nt	|tr|dk rtd|  |d}|d urt	|tr|dk rtd|  d S d S d S )Nr#   rk   >   r]   r#   r^   >   r=   r;   r	   rq   r*   r+   r,   r^   c                 s       | ]
}t |ttfV  qd S NrA   r5   r8   .0r$   r   r   r   	<genexpr>      z0_validate_longrope_parameters.<locals>.<genexpr>zC`rope_scaling`'s short_factor field must be a list of numbers, got r-   z5`rope_scaling`'s short_factor field must have length z, got r]   c                 s   r|   r}   r~   r   r   r   r   r     r   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r	   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.r=   z1Missing required keys in `rope_scaling`: 'factor'ru   r;   g        ry   )r>   r[   rr   rs   rp   r2   r3   r4   r5   rA   listallrn   ro   lenr   rz   r8   )r   rj   r>   r#   rh   ri   rg   r*   r,   r:   r^   r]   r=   r;   r   r   r   _validate_longrope_parameters  sH   




r   c           
      C   s6  | j }|d|dd }h d}t| }t||||d |d }|d u s0t|tr0|dk r8td|  |d }|d	 }|d u sIt|tsQtd
|  |d u sZt|tsbtd|  ||krqtd| d|  |d }	|	d u s~t|	t	std|	  |	| j
krtd|	 d| j
  d S d S )Nr#   rk   >   rb   ra   r	   r#   r=   rq   r=   r+   ru   ra   rb   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r	   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)r>   r[   rr   rs   rp   rA   r8   rn   ro   r5   r   )
r   rj   r>   r#   rh   rg   r=   ra   rb   r	   r   r   r   _validate_llama3_parameters  sL   
r   c                 C   sd   t | dd}|du rdS |d|dd}t|}|dur'|| |d dS td| d dS )	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    r>   Nr#   rk   re   rq   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r2   r[   ROPE_VALIDATION_FUNCTIONSrn   ro   )r   rj   r>   r#   Zvalidation_fnr   r   r   rope_config_validation  s   

r   )NNNr}   )NN)$rG   	functoolsr   typingr   Zconfiguration_utilsr   utilsr   r   Z
get_logger__name__rn   r   r(   r5   tupler8   r<   r?   rD   r\   r`   rd   ZROPE_INIT_FUNCTIONSstrrr   rp   rt   rw   rx   r{   r   r   r   r   r   r   r   r   <module>   s   
?

,

,

E

~

S

E
B2&
