
    Ei%x                        d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ  ee          Zdd
ddddZd!dZd"dZd#dZd$dZ G d d          Zd S )%u  
混合检索引擎 —— BM25 全文检索 + kNN 向量检索，通过 RRF 融合排序，支持权限过滤和内容去重。
Hybrid search engine – BM25 + kNN with RRF fusion, permission filtering,
and content deduplication.

双层检索架构:
  - 在内容层 (gov_doc_chunks) 按 acl_ids 过滤并检索
  - 按 content_hash 分组去重
  - 查询文档层 (gov_doc_meta) 获取版本信息
  - 根据权限匹配精度选择每个 content_hash 的最佳版本

Two-layer architecture:
  - Searches ``gov_doc_chunks`` (content layer) with ``acl_ids`` filter
  - Groups results by ``content_hash`` for deduplication
  - Queries ``gov_doc_meta`` (document layer) for version info
  - Selects best version per content_hash based on permission priority
    )annotations)date)Any)settings)EmbeddingService)PermissionContext)ESClientHYBRID_RRF_PIPELINE)
get_logger            )U_O_D_A_R_valuer   returnstrc                    | dv r| S dS )uH   将检索范围参数标准化为 title/content/doc_number/all 之一。>   titlecontent
doc_numberall r   s    0D:\work\zm-rag\backend\app\core\search_engine.py_normalize_search_scoper    %   s    2225    c                ~    |                      dd                               dd                               dd          S )u9   转义通配符查询中的特殊字符，防止注入。\z\\*z\*?z\?)replacer   s    r   _escape_wildcard_queryr'   ,   s6    ==v&&..sE::BB3NNNr!   
query_textsearch_scopedict[str, Any]c                    t          |          }|dk    rddd| iiiS |dk    rddd| iiiS |dk    r+ddddt          |                                            diiiS d	| d
dgddiS )uZ   根据检索范围构建对应的 ES 文本查询子句（match/wildcard/multi_match）。r   matchqueryr   r   wildcardr   r$   multi_matchztitle^3best_fields)r-   fieldstype)r    r'   strip)r(   r)   scopes      r   _build_text_query_clauser5   1   s    #L11E'GZ#89::	)gz%:;<<N!7
8H8H8J8J!K!KNNN
 	
 	 ),!
 
 r!   doc_acl_ids	list[str]user_tokensintc                    d}t          |           }|D ]9}||v r3|dd         }t          |t                              |d                    }:|S )zIReturn the best (lowest) permission priority for a doc given user tokens.c   Nr   )setmin_ACL_PREFIX_PRIORITYget)r6   r8   bestacl_settokenprefixs         r   _best_match_priorityrD   I   sc    D+G C CG2A2YFt155fbAABBDKr!   c                      e Zd ZdZd2dZddd	d
d3dZd4dZed5d            Zd6dZ	d7d Z
d8d$Zd9d%Zd:d)Zddd*d;d0Zd<d1ZdS )=SearchEngineu   混合检索引擎，支持权限感知检索和基于 content_hash 的内容去重。

    Hybrid search with permission-aware retrieval and content deduplication.	es_clientr	   embedding_servicer   c                "    || _         || _        d S )N)_es
_embedding)selfrG   rH   s      r   __init__zSearchEngine.__init__Y   s    +r!   Nr      )filterspage	page_sizer-   r   permr   rO   dict[str, Any] | NonerP   r9   rQ   r   r*   c               ^  K   t          |pi                     d                    }|dv o| j        j        }d}|rk	 | j                            |           d{V }nI# t          $ r<}	t                              dt          |	          |dd                    Y d}	~	nd}	~	ww xY w|| 
                    ||||pi ||          }
| j                            |
t          j        t                     d{V \  }}|sht                              d	           | 
                    |d||pi ||          }
| j        j                            t          j        |

           d{V }nN| 
                    |d||pi ||          }
| j        j                            t          j        |

           d{V }t%          |t&                    r|n|j        }d|
v }|r|                     |          }n|                     ||          }|r)d |D             }|                     ||           d{V }ni }|                     |||          }|                     |          }|                    di                               di                               dt5          |                    }|||||dS )u  执行混合检索：生成查询向量 → 构建 ES 查询 → 按 content_hash 分组 → 批量获取版本 → 选择最佳版本。

        Execute a hybrid search with permission filtering and content dedup.

        Returns:
            {
                "total": int,         # unique content items
                "page": int,
                "page_size": int,
                "documents": [...],   # one per unique content_hash
                "aggregations": {...},
            }
        r)   >   r   r   Nembedding_failed_bm25_fallback(   )errorr-   )r(   query_vectorrR   rO   rP   rQ   )indexpipelinesearch_hybrid_to_bm25_fallbackrY   bodycollapsec                    g | ]
}|d          S )content_hashr   ).0gs     r   
<listcomp>z'SearchEngine.search.<locals>.<listcomp>   s    HHHAa/HHHr!   aggregationstotal_contentr   )totalrP   rQ   	documentsrd   )r    r?   rJ   should_use_hybridrK   embed_single	Exceptionloggerwarningr   _build_queryhybrid_searchr   es_chunk_indexr
   inforawsearch
isinstancedictr]   _extract_collapse_groups_aggregate_by_content_hash_fetch_versions_build_document_results_extract_aggregationslen)rL   r-   rR   rO   rP   rQ   r)   need_vectorrX   emb_erres_bodyresponseokrq   use_collapsecontent_groupscontent_hashesversions_maprg   rd   re   s                        r   rr   zSearchEngine.search]   s     , /2/B/B>/R/RSS
 .. +* 	 ,0 	%)_%A%A%%H%HHHHHHH   4g,,*          #'' )2# (  G "&!7!7-, "8 " "      LHb
  <===++$!%#Mr' ,   "&!4!4"1  "5 " "       '' !2# (  G "X\00- 1        H
 %Xt44Ghh(- "W, 	M!::3??NN!<<S)LLN
  	HHHHHN!%!5!5nd!K!KKKKKKKLLL 00L$
 
	
 11#66 GGNB''S"%%S#i..)) 	 #""(
 
 	
s    A 
B#'2BB#r(   rX   list[float] | Nonec                   |                      ||          }t          |                    d                    }t          ||          }	d|	g|gdi}
ddddgdgd	d
dgdgddi}dddiidddddddiiiddddddddiiiddddddddiiiddddddddiiidddddddid ddddiiid!d"}|#|d$v r|dz  }|d%d&|
d'd(|||d)iigiid*d(gi||d+}n||dz
  |z  |
d*d(gidd,dd-|d.d/|d0}|S )1u   构建 OpenSearch 查询体：有向量时走 hybrid RRF 模式，无向量时走 BM25 + collapse 模式。

        Build the OpenSearch hybrid query with permission + facet filters.r)   boolmustfilterr1      r   z<em>z</em>)fragment_sizenumber_of_fragmentspre_tags	post_tagsr   )r   r   r   )r   r   cardinalityfieldr`   issuing_orgrN   )r   sizeunique_content)termsaggsdoc_type
   knowledge_category   subject_wordspublish_dateyearyyyyr   _keydesc)r   calendar_intervalformatmin_doc_countorder)date_histogramr   )re   by_orgby_typeby_categoryby_subject_wordsby_yearN>   r   r   hybridqueriesknncontent_vectorvectorkr   excludes)r   r-   _source	highlightr   best_chunksF)namer   r   r   )r   
inner_hits)r   fromr-   r   r^   r   )_build_combined_filterr    r?   r5   )rL   r(   rX   rR   rO   rP   rQ   combined_filterr)   query_clause
bm25_queryhighlight_cfgr   
fetch_sizer]   s                  r   rm   zSearchEngine._build_query   s    55dGDD.w{{>/J/JKK/
LII %*+ &

 %(+,!'")	  ,-!'")  )
& 8 $1"==)MG^;T+UV 
 $.r::)MG^;T+UV 
 $8DD)MG^;T+UV 
 $3B??)MG^;T+UV! ! ,)/$%&$f-# # *MG^;T+UV	 	) 
  
B #8J(J(J"RJ"!& %$42>-72A7& 7&("$  ')9(:;*)$ $DD2 "Y.#&)9(:;+ - !#(%2	# #   D" r!   c                   |                                  }g }|                    d          rP|d         }t          |t                    r|                    dd|ii           n|                    dd|ii           |                    d          rP|d         }t          |t                    r|                    dd|ii           n|                    dd|ii           |                    d          rP|d         }t          |t                    r|                    dd|ii           n|                    dd|ii           |                    d          rP|d         }t          |t                    r|                    dd|ii           n|                    dd|ii           |                    d          rUt          |d                                                   }|r,|                    ddd	d
t          |           d
iii           |                    d          rW	 t          |d                   }n# t          t          f$ r d}Y nw xY w|r%|                    dd| d|dz    ddii           |                    d          rQ|                    ddd	d
t          t          |d                                                              d
iii           |                    d          s|                    d          ri }|                    d          rC|d         }t          |t                    r|                                nt          |          |d<   |                    d          rC|d         }	t          |	t                    r|	                                nt          |	          |d<   |                    dd|ii           |                    d          r.d |d         D             }
|
r|                    d|
ddi           |g|z   }t          |          dk    rdd|iin|d         S )u   构建 ACL 权限过滤与业务维度（机构/类型/类目/日期/主题词）联合过滤条件。

        Build the ACL + facet combined filter clause.r   r   termr   r   document_scene_typesignerr.   r   r$   publish_yearNranger   z-01-01r   )gteltr   	date_fromdate_tor   lter   c           
         g | ]A}|                                 d dddt          |                                            diiiBS )r.   r   r   r$   )r3   r'   )ra   ws     r   rc   z7SearchEngine._build_combined_filter.<locals>.<listcomp>  sl     
 
 
 7799
'#%M)?		)J)J%M%M%M*!
 
 
r!   r   )shouldminimum_should_matchr   r   )build_es_filterr?   rs   listappendr   r3   r'   r9   	TypeError
ValueErrorr   	isoformatrz   )rR   rO   
acl_filterfacet_filtersvalr   r   
date_rangedfdt
sw_clausesall_filterss               r   r   z#SearchEngine._build_combined_filtera  s    ))++
.0;;}%% 	E-(C#t$$ E$$gs/C%DEEEE$$f}c.B%CDDD;;z"" 	B*%C#t$$ B$$g
C/@%ABBBB$$fz3.?%@AAA;;+,, 	L./C#t$$ L$$g0Dc/J%KLLLL$$f/CS.I%JKKK;;,-- 	M/0C#t$$ M$$g0Es/K%LMMMM$$f/Dc.J%KLLL;;x   		*++1133F $$ #%J)?)G)G%J%J%J#!&    ;;~&& 	7>233z*    $$&&*???%)AX"5"5"5) )&    ;;|$$ 	   !b%;C@U<V<V<\<\<^<^%_%_!b!b!b#"    ;;{## 	Jw{{9'='= 	J)+J{{;'' X[)6@T6J6J$WBLLNNNPSTVPWPW
5!{{9%% XY'6@T6J6J$WBLLNNNPSTVPWPW
5!  'NJ+G!HIII;;'' 	
 
 !1
 
 
J  $$
ANNO   "l]2/2;/?/?!/C/CVfk*++UV	
s   +I IIr   dict[str, float]c                  K   |                      ||          }|dt          ||                    d                    g|gdidgd}	 | j        j                            t          j        |           d{V }t          |t                    r|n|j
        }nB# t          $ r5}	t                              dt          |	          	           i cY d}	~	S d}	~	ww xY wi }
|                    d
i                               d
g           D ]Z}|                    di                               dd          }|                    d          pd}|r||
vs||
|         k    r||
|<   [|
S )u   Run a lightweight BM25-only query, return {content_hash: best_score}.

        内部调试用方法，不在主搜索路径中调用。
        Debug-only helper – no longer invoked in the main search path.
        r   r)   r   r`   r   r-   r   r\   Nbm25_score_query_failedrW   hitsr    _score        )r   r5   r?   rJ   rq   rr   r   ro   rs   rt   r]   rj   rk   rl   r   )rL   r(   rR   rO   r   r   r]   resprq   escoreshitchss                 r   _score_bm25zSearchEngine._score_bm25  s      55dGDD5j'++nB]B]^^_./  ''	 
 	 
	,,83JQU,VVVVVVVVD$T400?$$diCC 	 	 	NN4CFFNCCCIIIIII	 $&7762&&**6266 	 	CB''++NB??B!!(SA r''1vbz>>r
s   	AB 
C#*CCClist[float]c                  K   |                      ||          }|dd|||diidgd}	 | j        j                            t          j        |           d{V }t          |t                    r|n|j        }nB# t          $ r5}	t                              dt          |	          	           i cY d}	~	S d}	~	ww xY wi }
|                    d
i                               d
g           D ]Z}|                    di                               dd          }|                    d          pd}|r||
vs||
|         k    r||
|<   [|
S )u   Run a lightweight kNN-only query, return {content_hash: best_score}.

        内部调试用方法，不在主搜索路径中调用。
        Debug-only helper – no longer invoked in the main search path.
        r   r   r   r`   r   r\   Nknn_score_query_failedr   r   r   r   r   r   )r   rJ   rq   rr   r   ro   rs   rt   r]   rj   rk   rl   r   r?   )rL   rX   rR   rO   r   r   r]   r   rq   r   r   r   r   r   s                 r   
_score_knnzSearchEngine._score_knn  s      55dGDD$".!"1' ' '' 
  
	,,83JQU,VVVVVVVVD$T400?$$diCC 	 	 	NN33q66NBBBIIIIII	 $&7762&&**6266 	 	CB''++NB??B!!(SA r''1vbz>>r
s   AA9 9
B8*B3-B83B8raw_responsetop_nlist[dict[str, Any]]c                   |                     di                                dg           }i }|D ]}|                     di           }|                     dd          }|                     dd          }|                     di           }	|	                     dg           }
|	                     d	g           }||vr||                     d
g           |r|d         n|                     d	d          |                     d          |                     d          |                     d          |                     d          |                     d          |g d
||<   ||         }|r||d         pdk    r||d<   |
D ]@}t          |d                   dk     r%||d         vr|d                             |           At          |                                d d          d|         S )zGroup chunk-level hits by content_hash (RRF mode).

        Returns list of dicts with best score, highlights, and chunk metadata
        per unique content_hash.
        r   r   r`   r   r   r   r   r   r   doc_idsr   r   r   r   r   r   
r`   r   r   r   r   r   r   r   score
highlightsr   r   r   c                0    |                      d          pdS )Nr   r   r?   )rb   s    r   <lambda>z9SearchEngine._aggregate_by_content_hash.<locals>.<lambda>I  s    !%%..-A r!   T)keyreverseN)r?   rz   r   sortedvalues)rL   r   r   r   groupsr   sourcer`   r   r   
content_hltitle_hlgrouphls                 r   rv   z'SearchEngine._aggregate_by_content_hash  s    ++//;;,. 	3 	3CWWY++F!::nb99LGGHc**ER00I"y"55J }}Wb11H6))$0%zz)R88,4QXa[[&**Wb:Q:Q"(**\":":*0**5I*J*J#)::m#<#< &

: 6 6$*JJ~$>$>""$( (|$ <(E '%."5A66!&g  3 3u\*++a//BeL>Q4Q4Q,'..r2223 MMOO--
 
 
 5&	 	r!   c                   |                     di                                dg           }g }|D ]}|                     di           }|                     d          }|                     di                                di                                di                                dg           }d}g }	|D ]}
|
                     di           }|s#|                     d          r|d         d	         }|                     d
g           D ].}t          |	          dk     r||	vr|	                    |           /|                    |                     dd          |                     dg           |p|                     dd          |                     d          |                     d          |                     d          |                     d          |                     d          ||	d
           |S )z@Extract content groups from a collapse-on-content_hash response.r   r   r   r   r   Nr   r   r   r   r   r`   r   r   r   r   r   r   r   r   )r?   rz   r   )rL   r   r   r   r   r   r   
inner_listr  content_hlsinnerihfrags                r   ru   z%SearchEngine._extract_collapse_groupsM  s    ++//;;') 	 	CWWY++FGGH%%E b))]B''VRVR	  $(H%'K# 1 1YY{B// .BFF7OO .!'{1~HFF9b11 1 1D;''!++K0G0G#**40001 MM &

>2 > >!::i44!<VZZ%<%<$jj66&,jj1E&F&F%zz-88"JJz22 &

> : :)      r!   r   r7   dict[str, list[dict[str, Any]]]c                  K   |si S |                                 }| j        j                            t          j        ddd|iidddiig|gdit          |          dz  g d	d
           d{V }t          |t                    r|n|j	        }i }|
                    di           
                    dg           D ]I}|d         }|
                    dd          }	|                    |	g                               |           J|S )u   批量获取多个 content_hash 对应的文档版本列表，按用户权限过滤。

        Batch-fetch meta docs for content_hashes, filtered by user's ACL.

        Returns ``{content_hash: [meta_source, ...]}``.
        r   r   r`   r   status	completedr   rN   )doc_idr`   r   r   r   r   r   acl_idsr   r   r   )r-   r   r   r\   Nr   r   r   )r   rJ   rq   rr   r   es_meta_indexrz   rs   rt   r]   r?   
setdefaultr   )
rL   r   rR   r   r   rq   r   r   r   r   s
             r   rw   zSearchEngine._fetch_versionsy  sd       	I))++
X\((( $~~&FG#h%<=! $.,  N++b0    ) 
 
 
 
 
 
 
 
( !t,,;dd$)8:7762&&**6266 	; 	;C^FNB//B##B++226::::r!   )bm25_scores
knn_scoresr   r   r  dict[str, float] | Noner  c                  g }|D ]}|d         }|                     |g           }	|	rt          |	fd          }
|
d         }|                     d          p|
                     dd          }|
                     d          }|
                     d          }|
                     d	          }|
                     d
          }|
                     d          }t          |	          }n|                     dg           }|r|d         nd}|                     dd          }|                     d          }|                     d          }|                     d	          }|                     d
          }|                     d          }d}|                    ||||||||||                     d          |pi                      |          |pi                      |          |                     dg           d           |S )u  构建最终结果：对每个 content_hash 选择权限匹配度最高的文档版本。

        Build final results with best-version selection per content_hash.

        Picks the version with the most direct permission match:
        U_ > O_ > D_ > A_ > R_
        r`   c                V    t          |                     dg           j                  S )Nr  )rD   r?   
acl_tokens)vrR   s    r   r   z6SearchEngine._build_document_results.<locals>.<lambda>  s'    "6i,,do# # r!   )r   r  r   r   r   r   r   r   r   r   r   r   r   r   )r  r`   version_countr   r   r   r   r   r   r   
bm25_score	knn_scorer   )r?   r=   rz   r   )rL   r   r   rR   r  r  resultsr  r`   versionsbest_versionr  r   r   r   r   r   r   r  r   s      `                r   rx   z$SearchEngine._build_document_results  s4     )+# .	 .	E 0L#''b99H ""         &h/		'**Kl.>.>w.K.K)--l;;
%1%5%56J%K%K"*..}=='++J77+//?? #H  ))Ir22'.6B		'2.."YY|44
%*YY/C%D%D"#ii66 99Z00$yy88 !NN  ,!.(&8*$ ,7++*0b55lCC(.B33LAA#iib99       r!   c                   |                     di           }i }d|v r)d |d                              dg           D             |d<   d|v r)d |d                              dg           D             |d<   d|v r)d |d                              dg           D             |d<   d	|v r)d
 |d	                              dg           D             |d	<   d|v r)d |d                              dg           D             |d<   |S )u   提取聚合结果（按机构/类型/类目/年份），使用内容级去重计数。

        Extract aggregation buckets with content-level unique counts.rd   r   c                    g | ];}|d          |                     di                                d|d                   d<S r   r   r   	doc_countr   countr   ra   bs     r   rc   z6SearchEngine._extract_aggregations.<locals>.<listcomp>  s^          
  U8UU#3R88<<WanUU      r!   bucketsr   c                    g | ];}|d          |                     di                                d|d                   d<S r!  r   r%  s     r   rc   z6SearchEngine._extract_aggregations.<locals>.<listcomp>   s^     ! ! !
  U8UU#3R88<<WanUU ! ! !r!   r   c                    g | ]P}|                     d           |d          |                     di                                d|d                   dQS r!  r   r%  s     r   rc   z6SearchEngine._extract_aggregations.<locals>.<listcomp>	  sn     % % %
 55<<%U8UU#3R88<<WanUU % % %r!   r   c                    g | ]P}|                     d           |d          |                     di                                d|d                   dQS r!  r   r%  s     r   rc   z6SearchEngine._extract_aggregations.<locals>.<listcomp>  sn     * * *
 55<<*U8UU#3R88<<WanUU * * *r!   r   c                    g | ];}|d          |                     di                                d|d                   d<S )key_as_stringr   r   r"  r#  r   r%  s     r   rc   z6SearchEngine._extract_aggregations.<locals>.<listcomp>  s_     ! ! !
  _-UU#3R88<<WanUU ! ! !r!   r   )rL   r   r   results       r   ry   z"SearchEngine._extract_aggregations  s    3324t   
 h++Ir::     F8 ! !
 i,,Y;;! ! !F9 D  % %
 m,00B??% % %F=! %%* *
 0155iDD* * *F%& ! !
 i,,Y;;! ! !F9 r!   )rG   r	   rH   r   )r-   r   rR   r   rO   rS   rP   r9   rQ   r9   r   r*   )r(   r   rX   r   rR   r   rO   r*   rP   r9   rQ   r9   r   r*   )rR   r   rO   r*   r   r*   )
r(   r   rR   r   rO   r*   r   r9   r   r   )
rX   r   rR   r   rO   r*   r   r9   r   r   )r   r*   r   r9   r   r   )r   r*   r   r   )r   r7   rR   r   r   r
  )r   r   r   r
  rR   r   r  r  r  r  r   r   )r   r*   r   r
  )__name__
__module____qualname____doc__rM   rr   rm   staticmethodr   r   r   rv   ru   rw   rx   ry   r   r!   r   rF   rF   T   sL       P P, , , , *.B
 B
 B
 B
 B
 B
Lz z z z| g
 g
 g
 \g
R$ $ $ $L' ' ' 'V/ / / /b( ( ( (X, , , ,h 04.2B B B B B BL8 8 8 8 8 8r!   rF   N)r   r   r   r   )r   r   r   r   )r(   r   r)   r   r   r*   )r6   r7   r8   r7   r   r9   )r1  
__future__r   datetimer   typingr   
app.configr   app.core.embeddingr   app.core.permissionr   app.infrastructure.es_clientr	   r
   app.utils.loggerr   r.  rk   r>   r    r'   r5   rD   rF   r   r!   r   <module>r;     s\   $ # " " " " "                   / / / / / / 1 1 1 1 1 1 F F F F F F F F ' ' ' ' ' '	H		  q!DD    O O O O
   0   Q Q Q Q Q Q Q Q Q Qr!   