o
    |qiF>                     @   s|   d dl Z d dlmZmZmZmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ G dd	 d	eZdS )
    N)AnyAsyncIteratorIteratorListOptionalSetUnion)urlparse)BeautifulSoup)Document)
BaseLoader)WebBaseLoaderc                   @   s  e Zd ZdZ					d1ddddeded	ee d
edededee deee  fddZdedefddZ		d2de
e dededefddZdeee
e f defddZdedefddZdede
e fd d!Z	d3ded"ee d#ee de
e fd$d%Z	d3ded	ed"ee d#ee de
e f
d&d'Zdee fd(d)Zdee fd*d+Z	d3ded,ee dee fd-d.Zdede
e fd/d0ZdS )4GitbookLoadera   Load `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the sitemap, handling nested sitemap indexes.

    When `load_all_paths=True`, the loader parses XML sitemaps and requires the
    `lxml` package to be installed (`pip install lxml`).
    FNmainT)sitemap_urlallowed_domainsweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressr   r   c          
      C   s   |p|| _ | j dr| j dd | _ || _|| _|| _|| _|| _|| _| jdu r5t|j	}	|	r5|	h| _|rA|p>| j  d| _
n|| _
| | j
sVtd| j
 d| j dS )aT  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`. Requires `lxml` package.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
                Defaults to "{base_url}/sitemap.xml".
            allowed_domains: Optional set of allowed domains to fetch from.
                If None (default), the loader will restrict crawling to the domain
                of the `web_page` URL to prevent potential SSRF vulnerabilities.
                Provide an explicit set (e.g., {"example.com", "docs.example.com"})
                to allow crawling across multiple domains. Use with caution in
                server environments where users might control the input URLs.
        /Nz/sitemap.xmlz
Domain in z% is not in the allowed domains list: )r   endswithr   r   r   r   r   r   r	   netloc	start_url_is_url_allowed
ValueError)
selfr   r   r   r   r   r   r   r   Zinitial_domain r    s/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/langchain_community/document_loaders/gitbook.py__init__   s.   
%


zGitbookLoader.__init__urlreturnc                 C   sV   | j du rdS zt|}|jdvrW dS |jsW dS |j| j v W S  ty*   Y dS w )z0Check if a URL has an allowed scheme and domain.NF)httphttps)r   r	   schemer   	Exception)r   r#   parsedr    r    r!   r   Y   s   

zGitbookLoader._is_url_allowedURLurl_listurl_typec                 C   s2   |  |r|| dS td| d|  dS )aB  Safely add a URL to a list if it's from an allowed domain.

        Args:
            url_list: The list to add the URL to
            url: The URL to add
            url_type: Type of URL for warning message (e.g., "sitemap", "content")

        Returns:
            bool: True if URL was added, False if skipped
        TzSkipping disallowed z URL: F)r   appendwarningswarn)r   r+   r#   r,   r    r    r!   _safe_add_urlp   s
   

zGitbookLoader._safe_add_urlurl_or_urlsc                 C   s   t || j| jdS )zCreate a new WebBaseLoader instance for the given URL(s).

        This ensures each operation gets its own isolated WebBaseLoader.
        )Zweb_pathr   r   )r   r   r   )r   r1   r    r    r!   _create_web_loader   s
   z GitbookLoader._create_web_loadersoupc                 C   s   | dduS )z+Check if the soup contains a sitemap index.ZsitemapindexN)find)r   r3   r    r    r!   _is_sitemap_index   s   zGitbookLoader._is_sitemap_indexc                 C   s@   | d}g }|D ]}|d}|r|jr| ||jd q	|S )z*Extract sitemap URLs from a sitemap index.sitemaploc)find_allr4   textr0   )r   r3   Zsitemap_tagsurlsr6   r7   r    r    r!   _extract_sitemap_urls   s   


z#GitbookLoader._extract_sitemap_urlsprocessed_urls
web_loaderc                 C   s   |du r
|  | j}| |ro| |}g }|D ]T}||v r'td|  q|| z|j}|g|_|jdd}||_| 	|||}	|
|	 W q tyl }
 z| jratd| d|
  n W Y d}
~
qd}
~
ww |S | |S )aO  Process a sitemap, handling both direct content URLs and sitemap indexes.

        Args:
            soup: The BeautifulSoup object of the sitemap
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        Nz(Skipping already processed sitemap URL: lxml-xmlparserError processing sitemap : )r2   r   r5   r;   r.   r/   add	web_pathsscrape_process_sitemapextendr(   r   
_get_paths)r   r3   r<   r=   sitemap_urlsall_content_urlsr   original_web_pathssitemap_soupcontent_urlser    r    r!   rF      s<   



zGitbookLoader._process_sitemapc                    s  |du r|  | j}| |r|| |}g } fdd|D }|s$g S |j}||_|j|ddI dH }	||_t||	D ]=\}
} |
 z| || |I dH }|	| W q< t
yy } z| jrntd|
 d|  n W Y d}~q<d}~ww |S | |S )a^  Async version of _process_sitemap.

        Args:
            soup: The BeautifulSoup object of the sitemap
            base_url: The base URL for relative paths
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        Nc                    s   g | ]}| vr|qS r    r    ).0r#   r<   r    r!   
<listcomp>   s    z3GitbookLoader._aprocess_sitemap.<locals>.<listcomp>r>   r?   rA   rB   )r2   r   r5   r;   rD   ascrape_allziprC   _aprocess_sitemaprG   r(   r   r.   r/   rH   )r   r3   r   r<   r=   rI   rJ   Znew_urlsrK   soupsr   rL   rM   rN   r    rP   r!   rT      s:   




zGitbookLoader._aprocess_sitemapc                 c   s    | j s| | j}| }| || j}|r|V  dS dS | | j}|jdd}t }| ||}|sA| jrAt	
d| j  g }|D ]	}| ||d qE|sSdS | |}	|	|}
t|
|D ]\}}| ||}|rq|V  qbdS )zDFetch text from one single GitBook page or recursively from sitemap.r>   r?   $No content URLs found in sitemap at contentN)r   r2   r   rE   _get_documentr   setrF   r   r.   r/   r0   Z
scrape_allrS   )r   temp_loaderr3   doc	soup_infor<   relative_pathsr:   r#   content_loader
soup_infosr    r    r!   	lazy_load  s6   



zGitbookLoader.lazy_loadc                 C  s$  | j s(| | j}|| jgI dH }|d }| || j}|r&|V  dS dS | | j}|j| jgddI dH }|d }t }| || j|I dH }|sZ| j	rZt
d| j  g }|D ]	}| ||d q^|sldS | |}	|	|I dH }
t|
|D ]\}}| ||}|dur|V  q~dS )z/Asynchronously fetch text from GitBook page(s).Nr   r>   r?   rV   rW   )r   r2   r   rR   rX   r   rY   rT   r   r   r.   r/   r0   rS   )r   rZ   rU   r\   r[   r<   r]   r:   r#   r^   r_   Z	maybe_docr    r    r!   
alazy_load?  s>   



zGitbookLoader.alazy_load
custom_urlc                 C   sX   | | j}|s
dS |jdd }| d}|r|jnd}|p"| j|d}t||dS )z,Fetch content from page and return Document.N
)	separatorZh1 )sourcetitle)Zpage_contentmetadata)r4   r   Zget_textstripr9   r   r   )r   r3   rb   Zpage_content_rawrW   Ztitle_if_existsrg   rh   r    r    r!   rX   m  s   
zGitbookLoader._get_documentc                 C   s*   g }| dD ]}|jr||j q|S )zFetch all URLs in the sitemap.r7   )r8   r9   r-   )r   r3   r:   r7   r    r    r!   rH   z  s   zGitbookLoader._get_paths)FNr   FT)r*   )N)__name__
__module____qualname____doc__strboolr   r   r"   r   r   r0   r   r   r2   r
   r5   r;   rF   rT   r   r   r`   r   ra   r   rX   rH   r    r    r    r!   r      s    		


C

@
=*/
r   )r.   typingr   r   r   r   r   r   r   urllib.parser	   Zbs4r
   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z-langchain_community.document_loaders.web_baser   r   r    r    r    r!   <module>   s    $