
    i7                         d dl mZmZmZmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ d dlZd	ed
efdZdeee                  d
ee	         fdZdeeef         d
eeef         fdZded	ed
efdZ	 d dddededee         dee          d
ef
dZ!dddededee          d
efdZ"	 d dddededee	         dee         dee          d
ee	         fdZ#deded
e$fdZ%ded	e
d
efdZ&deded
efdZ'ded
efdZ(dS )!    )OptionalDictAnyList   )CrawlRequestCrawlJobCrawlResponseDocumentCrawlParamsRequestCrawlParamsDataWebhookConfigCrawlErrorsResponseActiveCrawlsResponseActiveCrawlPaginationConfig)handle_response_error)prepare_scrape_options)AsyncHttpClient)normalize_document_inputNrequestreturnc                 b   | j         r| j                                         st          d          d| j         i}| j        r
| j        |d<   | j        t          | j                  }|r||d<   | j        Ct          | j        t                    r| j        |d<   n| j        	                    d          |d<   | 	                    dd          }|
                    dd            |
                    dd            |
                    d	d            d
ddddddddddddddd}|                                D ]!\  }}||v r|
                    |          ||<   "|                    |           t          | dd           2t          t          | d                                                    |d<   |S )NURL cannot be emptyurlpromptscrapeOptionswebhookT)exclude_none)r   exclude_unsetscrape_optionsincludePathsexcludePathsmaxDiscoveryDepthsitemapignoreQueryParametersdeduplicateSimilarURLscrawlEntireDomainallowExternalLinksallowSubdomainsignoreRobotsTxtrobotsUserAgentdelaymaxConcurrencyregexOnFullURLzeroDataRetention)include_pathsexclude_pathsmax_discovery_depthr%   ignore_query_parametersdeduplicate_similar_urlscrawl_entire_domainallow_external_linksallow_subdomainsignore_robots_txtrobots_user_agentr-   max_concurrencyregex_on_full_urlzero_data_retentionintegration)r   strip
ValueErrorr   r!   r   r   
isinstancestr
model_dumppopitemsupdategetattr)r   dataoptsrequest_datafield_mappingssnakecamels          h/home/agentuser/.hermes/hermes-agent/venv/lib/python3.11/site-packages/firecrawl/v2/methods/aio/crawl.py_prepare_crawl_requestrO      s   ; 0gk//11 0.///7;D~ ( X)%g&<== 	)$(D!"gos++ 	L%oDOO%o88d8KKDO%%4t%LLLUD!!!Xt$$$%t,,,''2#:$<2 4-..+-2 N" ',,.. 2 2uL  &**511DKKKwt,,8!''="A"ABBHHJJ]K    	data_listc                     g }| pg D ]E}t          |t                    r.t          |          }|                    t	          di |           F|S )N )rA   dictr   appendr   )rQ   	documentsdoc_data
normalizeds       rN   _parse_crawl_documentsrY   D   sb     "IO 5 5h%% 	51(;;JX33
33444rP   bodyc                    |                      d          s#t          |                      dd                    |                      d          |                      dd          |                      dd          |                      dd          |                      d	          |                      d
          t          |                      dg                     dS )NsuccesserrorUnknown error occurredstatus	completedr   totalcreditsUsed	expiresAtnextrH   r_   r`   ra   credits_used
expires_atrd   rH   )get	ExceptionrY   )rZ   s    rN   _parse_crawl_status_responserj   M   s    88I E*BCCDDD ((8$$XXk1--'1%%22hh{++  &txx';';<<  rP   clientc                   K   t          |          }|                     d|           d{V }|j        dk    rt          |d           |                                }|                    d          r7t          |                    d          |                    d                    S t          |                    d	d
                    )aV  
    Start a crawl job for a website.
    
    Args:
        client: Async HTTP client instance
        request: CrawlRequest containing URL and options
        
    Returns:
        CrawlResponse with job information
        
    Raises:
        ValueError: If request is invalid
        Exception: If the crawl operation fails to start
    z	/v2/crawlN  zstart crawlr\   idr   )rn   r   r]   r^   )rO   poststatus_coder   jsonrh   r
   ri   )rk   r   payloadresponserZ   s        rN   start_crawlrt   \   s       %W--G[[g66666666Hs""h666==??Dxx	 EDHHUOODDDD
DHHW&>??
@
@@rP   request_timeoutjob_idpagination_configrv   c          	        K   |                      d| |           d{V }|j        dk    rt          |d           |                                }t	          |          }|d         }|r|j        nd}|r(|d         r t          | |d         |||	           d{V }t          |d
         |d         |d         |d         |d         |s|d         nd|          S )a[  
    Get the status of a crawl job.
    
    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job
        pagination_config: Optional configuration for pagination limits
        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination 
            is enabled (default) and there are multiple pages of results, this timeout applies to 
            each page request separately, not to the entire operation
        
    Returns:
        CrawlJob with job information
        
    Raises:
        Exception: If the status check fails
    
/v2/crawl/timeoutNrm   zget crawl statusrH   Trd   ru   r_   r`   ra   rf   rg   re   )rh   rp   r   rq   rj   auto_paginate_fetch_all_pages_asyncr	   )	rk   rw   rx   rv   rs   rZ   rr   rV   r}   s	            rN   get_crawl_statusr   u   s<     0 ZZ 5V 5 5ZOOOOOOOOHs""h(:;;;==??D*400GI 8IR%33dM 
 
0FO+
 
 
 
 
 
 
 
 
	 x +&g^,<($1;WV__t   rP   next_urlc          
      <  K   |                      ||           d{V }|j        dk    rt          |d           |                                }t	          |          }t          |d         |d         |d         |d         |d	         |d
         |d                   S )a  
    Fetch a single page of crawl results using the provided next URL.

    Args:
        client: Async HTTP client instance
        next_url: Opaque next URL from a prior crawl status response
        request_timeout: Timeout (in seconds) for the HTTP request

    Returns:
        CrawlJob with the page data and next URL (if any)

    Raises:
        Exception: If the request fails or returns an error response
    r{   Nrm   zget crawl status pager_   r`   ra   rf   rg   rd   rH   re   )rh   rp   r   rq   rj   r	   )rk   r   rv   rs   rZ   rr   s         rN   get_crawl_status_pager      s      ( ZZ/ZBBBBBBBBHs""h(?@@@==??D*400Gx +&g^,<(V_V_   rP   initial_documentsc                  K   |                                 }|}d}|r|j        nd}|r|j        nd}	|r|j        nd}
t	          j                    }|r|||k    rn|
t	          j                    |z
  |
k    rn|                     ||           d{V }|j        dk    r8ddl}|	                    d          }|
                    dd|j        i           n|                                }	 t          |          }n# t          $ r Y naw xY w|d	         D ].}|	t          |          |	k    r n|                    |           /|	t          |          |	k    rn|d
         }|dz  }||S )a  
    Fetch all pages of crawl results asynchronously.
    
    Args:
        client: Async HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits
        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
        
    Returns:
        List of all documents from all pages
    r   Nr{   rm   	firecrawlzFailed to fetch next pagerp   )extrarH   rd      )copy	max_pagesmax_resultsmax_wait_timetime	monotonicrh   rp   logging	getLoggerwarningrq   rj   ri   lenrU   )rk   r   r   rx   rv   rV   current_url
page_countr   r   r   
start_timers   r   logger	page_datapage_payloaddocuments                     rN   r~   r~      s     * "&&((IKJ 0AJ!++dI3DN#//$K7HR%33dM!!J
 %!zY'>'>%DN,<,<z,I]+Z+Z  KIIIIIIII3&&NNN&&{33FNN6}hNb>cNdddMMOO		7	BBLL 	 	 	E	 %V, 	' 	'H'c)nn.K.KX&&&& ##i..K*G*G #6*a
K  %N s   .C> >
D
Dc                    K   |                      d|            d{V }|j        dk    rt          |d           |                                }|                    d          dk    S )a	  
    Cancel a crawl job.
    
    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job
        
    Returns:
        True if cancellation was successful
        
    Raises:
        Exception: If the cancellation operation fails
    rz   Nrm   zcancel crawlr_   	cancelled)deleterp   r   rq   rh   )rk   rw   rs   rZ   s       rN   cancel_crawlr     sv       ]]#8#8#899999999Hs""h777==??D88H,,rP   c                   K   |j         r|j                                         st          d          |j        r|j                                        st          d          |j         |j        d}|                     d|           d{V }|j        dk    rt          |d           |                                }|                    d          s#t          |                    d	d
                    |                    di           }i }ddddddddddddddd}|
                                D ]\  }}	||v r||         ||	<   d|v r|d         }
|
|d<   d|v r|d         |d<   t          di |S )au  
    Preview crawl parameters before starting a crawl job.
    
    Args:
        client: Async HTTP client instance
        request: CrawlParamsRequest containing URL and prompt
        
    Returns:
        CrawlParamsData containing crawl configuration
        
    Raises:
        ValueError: If request is invalid
        Exception: If the parameter preview fails
    r   zPrompt cannot be empty)r   r   z/v2/crawl/params-previewNrm   zcrawl params previewr\   r]   r^   rH   r1   r2   r3   r%   r4   r5   r6   r7   r8   r9   r:   r;   r!   r=   )r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r.   r   r0   r   r   rS   )r   r?   r@   r   ro   rp   r   rq   rh   ri   rE   r   )rk   r   rr   rs   rZ   params_data	convertedmappingrM   rL   wks              rN   crawl_params_previewr   .  s      ; 0gk//11 0.///> 3!5!5!7!7 31222kW^<<G[[!;WEEEEEEEEHs""h(>???==??D88I E*BCCDDD((62&&K "I''2!:"<24-..+)2 G    2 2uK*51IeK#!	)D#I	)''Y'''rP   crawl_idc                 h  K   |                      d| d           d{V }|j        dk    rt          |d           |                                }|                     d|          }|                     dg           |                     d|                     d	g                     d
}t	          di |S )a"  
    Get errors from a crawl job.
    
    Args:
        client: Async HTTP client instance
        crawl_id: ID of the crawl job
        
    Returns:
        CrawlErrorsResponse with errors and robots blocked
        
    Raises:
        Exception: If the error check operation fails
    rz   z/errorsNrm   zcheck crawl errorsrH   errorsrobotsBlockedrobots_blocked)r   r   rS   )rh   rp   r   rq   r   )rk   r   rs   rZ   rr   rX   s         rN   get_crawl_errorsr   e  s       ZZ >X > > >????????Hs""h(<=====??Dhhvt$$G++h++!++ow{{CSUW7X7XYY J ,,,,,rP   c           
        K   |                      d           d{V }|j        dk    rt          |d           |                                }|                     d          s#t	          |                     dd                    |                     dg           }g }|D ]}t          |t                    rz|                    |                     d	          |                     d
|                     d                    |                     d          |                     d          d           t          dd |D                       S )z
    Get active crawl jobs.
    
    Args:
        client: Async HTTP client instance
        
    Returns:
        ActiveCrawlsResponse with active crawl jobs
        
    Raises:
        Exception: If the active crawl jobs operation fails
    z/v2/crawl/activeNrm   zget active crawlsr\   r]   r^   crawlsrn   teamIdteam_idr   options)rn   r   r   r   Tc                 &    g | ]}t          d i |S )rS   )r   ).0ncs     rN   
<listcomp>z%get_active_crawls.<locals>.<listcomp>  s&    5]5]5]Bk6G6GB6G6G5]5]5]rP   )r\   r   )	rh   rp   r   rq   ri   rA   rT   rU   r   )rk   rs   rZ   	crawls_inrX   cs         rN   get_active_crawlsr     sQ      ZZ 233333333Hs""h(;<<<==??D88I E*BCCDDD2&&IJ  a 	eeDkk55155+;+;<<uuU||55++	      5]5]R\5]5]5]^^^^rP   )N))typingr   r   r   r   typesr   r	   r
   r   r   r   r   r   r   r   r   utils.error_handlerr   utils.validationr   utils.http_client_asyncr   utils.normalizer   r   rT   rO   rY   rB   rj   rt   floatr   r   r~   boolr   r   r   r   rS   rP   rN   <module>r      s   , , , , , , , , , , , ,                          9 8 8 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 +L +T + + + +\htCy&9 d8n    tCH~ $sCx.    Ao A A A A A A8 593
 (,3 3 333   013
 e_3 3 3 3 3t (,	! ! !!! e_	!
 ! ! ! !P 59	G (,G G GGG H~G   01	G e_G 
(^G G G GT- - - - - - -*4( 4(AS 4(Xg 4( 4( 4( 4(n-? -c -FY - - - -4_O _8L _ _ _ _ _ _rP   