
    i2T                        d Z ddlZddlmZmZmZmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZmZmZmZ ddlmZ ded	dfd
Zded	efdZdeee                  d	ee         fdZdeeef         d	eeef         fdZ deded	e
fdZ!	 d%dddededee         dee"         d	e	f
dZ#dddededee"         d	e	fdZ$	 d%dddededee         dee         dee"         d	ee         fdZ%deded	e&fdZ'	 	 d&dddedede(dee(         dee"         d	e	fdZ)	 	 d&dddedede(dee(         dee"         d	e	fdZ*deded	efd Z+d!ed"ed	efd#Z,ded	efd$Z-dS )'z.
Crawling functionality for Firecrawl v2 API.
    N)OptionalDictAnyList   )CrawlRequestCrawlJobCrawlResponseDocumentCrawlParamsRequestCrawlParamsResponseCrawlParamsDataWebhookConfigCrawlErrorsResponseActiveCrawlsResponseActiveCrawlPaginationConfig)
HttpClienthandle_response_errorvalidate_scrape_optionsprepare_scrape_options)normalize_document_inputrequestreturnc                     | j         r| j                                         st          d          | j        | j        dk    rt          d          | j        t          | j                   dS dS )z
    Validate crawl request parameters.
    
    Args:
        request: CrawlRequest to validate
        
    Raises:
        ValueError: If request is invalid
    URL cannot be emptyNr   zLimit must be positive)urlstrip
ValueErrorlimitscrape_optionsr   )r   s    d/home/agentuser/.hermes/hermes-agent/venv/lib/python3.11/site-packages/firecrawl/v2/methods/crawl.py_validate_crawl_requestr#      s     ; 0gk//11 0.///} W]a%7%71222 ) 677777 *)    c                    t          |            d| j        i}| j        r
| j        |d<   | j        t	          | j                  }|r||d<   |                     dd          }|                    dd           |                    dd           |                    dd           | j        Ct          | j        t                    r| j        |d<   n| j                            d	          |d<   d
ddddddddddddddd}|
                                D ]!\  }}||v r|                    |          ||<   "|                    |           d|v r8t          |d         t                    r|d                                         |d<   |S )z
    Prepare crawl request for API submission.
    
    Args:
        request: CrawlRequest to prepare
        
    Returns:
        Dictionary ready for API submission
    r   promptNscrapeOptionsT)exclude_noneexclude_unsetr!   webhook)r(   includePathsexcludePathsmaxDiscoveryDepthsitemapignoreQueryParametersdeduplicateSimilarURLscrawlEntireDomainallowExternalLinksallowSubdomainsignoreRobotsTxtrobotsUserAgentdelaymaxConcurrencyregexOnFullURLzeroDataRetention)include_pathsexclude_pathsmax_discovery_depthr.   ignore_query_parametersdeduplicate_similar_urlscrawl_entire_domainallow_external_linksallow_subdomainsignore_robots_txtrobots_user_agentr6   max_concurrencyregex_on_full_urlzero_data_retentionintegration)r#   r   r&   r!   r   
model_dumppopr*   
isinstancestritemsupdater   )r   datascrape_datarequest_datafield_mappings
snake_case
camel_cases          r"   _prepare_crawl_requestrT   &   s    G$$$ 7;D ~ ( X ),W-CDD 	0$/D! %%4t%LLL UD!!!Xt$$$%t,,, "gos++ 	L%oDOO &o88d8KKDO ('2#:$<2 4-..+-2 N& #1"6"6"8"8 < <
J%%+//
;;D 	KKD,?!E!E"=17799]Kr$   	data_listc           
          g }| pg D ]C}t          |t                    r,|                    t          di t	          |                     D|S )N )rJ   dictappendr   r   )rU   	documentsdoc_datas      r"   _parse_crawl_documentsr\   q   sa     "IO M Mh%% 	MXKK(@(J(JKKLLLr$   response_datac                    |                      d          s#t          |                      dd                    |                      d          |                      dd          |                      dd          |                      dd          |                      d	          |                      d
          t          |                      dg                     dS )NsuccesserrorUnknown error occurredstatus	completedr   totalcreditsUsed	expiresAtnextrN   rb   rc   rd   credits_used
expires_atrg   rN   )get	Exceptionr\   )r]   s    r"   _parse_crawl_status_responserm   y   s    Y'' N))'3KLLMMM  ##H--"&&{A66""7A..%))-;;#''44!!&))&}'8'8'D'DEE  r$   clientc                    t          |          }|                     d|          }|j        st          |d           |                                }|                    d          r7|                    d          |                    d          d}t          d
i |S t          |                    dd                    )aP  
    Start a crawl job for a website.
    
    Args:
        client: HTTP client instance
        request: CrawlRequest containing URL and options
        
    Returns:
        CrawlResponse with job information
        
    Raises:
        ValueError: If request is invalid
        Exception: If the crawl operation fails to start
    z	/v2/crawlzstart crawlr_   idr   )rp   r   r`   ra   NrW   )rT   postokr   jsonrk   r
   rl   )rn   r   rP   responser]   job_datas         r"   start_crawlrv      s     *'22L{{;55H; 7h666MMOOM## N##D)) $$U++
 

 ((x((())'3KLLMMMr$   request_timeoutjob_idpagination_configrx   c          	         |                      d| |          }|j        st          |d           |                                }t	          |          }|d         }|r|j        nd}|rC|d         r;|r|j        t          |          |j        k    st          | |d         |||          }t          |d	         |d
         |d         |d         |d         |s|d         nd|          S )aK  
    Get the status of a crawl job.

    Args:
        client: HTTP client instance
        job_id: ID of the crawl job
        pagination_config: Optional configuration for pagination behavior
        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination 
            is enabled (default) and there are multiple pages of results, this timeout applies to 
            each page request separately, not to the entire operation

    Returns:
        CrawlJob with current status and data

    Raises:
        Exception: If the status check fails
    
/v2/crawl/timeoutzget crawl statusrN   Trg   Nrw   rb   rc   rd   ri   rj   rh   )
rk   rr   r   rs   rm   auto_paginatemax_resultslen_fetch_all_pagesr	   )	rn   ry   rz   rx   rt   r]   payloadrZ   r   s	            r"   get_crawl_statusr      s.   2 zz/v//zIIH ; <h(:;;; MMOOM*=99GI 8IR%33dM 
 

)5	NN/;;;$FO+
 
 
	 x +&g^,<($1;WV__t   r$   next_urlc          
      $   |                      ||          }|j        st          |d           |                                }t	          |          }t          |d         |d         |d         |d         |d         |d         |d	         
          S )a  
    Fetch a single page of crawl results using the provided next URL.

    Args:
        client: HTTP client instance
        next_url: Opaque next URL from a prior crawl status response
        request_timeout: Timeout (in seconds) for the HTTP request

    Returns:
        CrawlJob with the page data and next URL (if any)

    Raises:
        Exception: If the request fails or returns an error response
    r}   zget crawl status pagerb   rc   rd   ri   rj   rg   rN   rh   )rk   rr   r   rs   rm   r	   )rn   r   rx   rt   r]   r   s         r"   get_crawl_status_pager      s    ( zz(Oz<<H; Ah(?@@@MMOOM*=99Gx +&g^,<(V_V_   r$   initial_documentsc                   |                                 }|}d}|r|j        nd}|r|j        nd}	|r|j        nd}
t	          j                    }|r|||k    rn|
t	          j                    |z
  |
k    rn|                     ||          }|j        s8ddl}|	                    d          }|
                    dd|j        i           n|                                }	 t          |          }n# t          $ r Y naw xY w|d         D ].}|	t          |          |	k    r n|                    |           /|	t          |          |	k    rn|d	         }|d
z  }||S )a  
    Fetch all pages of crawl results.

    Args:
        client: HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits
        request_timeout: Optional timeout (in seconds) for the underlying HTTP request

    Returns:
        List of all documents from all pages
    r   Nr}   	firecrawlzFailed to fetch next pagestatus_code)extrarN   rg      )copy	max_pagesr   max_wait_timetime	monotonicrk   rr   logging	getLoggerwarningr   rs   rm   rl   r   rY   )rn   r   r   rz   rx   rZ   current_url
page_countr   r   r   
start_timert   r   logger	page_datapage_payloaddocuments                     r"   r   r     s   * "&&((IKJ 0AJ!++dI3DN#//$K7HR%33dM!!J
 &!zY'>'>%DN,<,<z,I]+Z+Z ::k?:CC{ 	NNN&&{33FNN6}hNb>cNdddMMOO		7	BBLL 	 	 	E	 %V, 	' 	'H&3y>>[+H+HX&&&& "s9~~'D'D #6*a
M  &P s   "C2 2
C?>C?c                     |                      d|           }|j        st          |d           |                                }|                    d          dk    S )a  
    Cancel a running crawl job.
    
    Args:
        client: HTTP client instance
        job_id: ID of the crawl job to cancel
        
    Returns:
        bool: True if the crawl was cancelled, False otherwise
        
    Raises:
        Exception: If the cancellation fails
    r|   zcancel crawlrb   	cancelled)deleterr   r   rs   rk   )rn   ry   rt   r]   s       r"   cancel_crawlr   ^  s^     }}2&2233H; 8h777MMOOMX&&+55r$   poll_intervalr~   c                    t          j                    }	 t          | ||          }|j        dv r|S |0t          j                    |z
  |k    rt	          d| d| d          t          j        |           d)a  
    Wait for a crawl job to complete, polling for status updates.
    
    Args:
        client: HTTP client instance
        job_id: ID of the crawl job
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)
        request_timeout: Optional timeout (in seconds) for each status request
        
    Returns:
        CrawlJob when job completes
        
    Raises:
        Exception: If the job fails
        TimeoutError: If timeout is reached
    Trw   )rc   failedr   Nz
Crawl job z did not complete within z seconds)r   r   r   rb   TimeoutErrorsleep)rn   ry   r   r~   rx   r   	crawl_jobs          r"   wait_for_crawl_completionr   u  s    2 !!J"$+
 
 
	 CCC DN$4$4z$AW#L#L^F^^W^^^___ 	
=!!!!"r$   c                d    t          | |          }|j        }||n|}t          | ||||          S )a  
    Start a crawl job and wait for it to complete.
    
    Args:
        client: HTTP client instance
        request: CrawlRequest containing URL and options
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
        request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination 
            requests when fetching results. If there are multiple pages, each page request gets this timeout
        
    Returns:
        CrawlJob when job completes
        
    Raises:
        ValueError: If request is invalid
        Exception: If the crawl fails to start or complete
        TimeoutError: If timeout is reached
    Nrw   )rv   rp   r   )rn   r   r   r~   rx   r   ry   effective_request_timeouts           r"   crawlr     sU    8 FG,,I\F 4C3NT[ %1   r$   c                    |j         r|j                                         st          d          |j        r|j                                        st          d          |j         |j        d}|                     d|          }|j        st          |d           |                                }|                    d          r{|                    di           }i }dd	d
dddddddddddd}d|v r2|d         }t          |t                    rt          d(i ||d<   n||d<   |                                D ]\  }	}
|	|v r|	dk    r||	         ||	         }i }ddddddd }|                                D ]\  }}||v r||         ||<   d!|v r8|d!         }t          |t                    rd"d#lm}  ||$          |d!<   n||d!<   |                                D ]\  }}||vr|d!k    r|||<   |||
<   ||	         ||
<   |                                D ]\  }}||vr|||<   d%|v r|d%         |d%<   t!          d(i |S t#          |                    d&d'                    ))al  
    Get crawl parameters from LLM based on URL and prompt.
    
    Args:
        client: HTTP client instance
        request: CrawlParamsRequest containing URL and prompt
        
    Returns:
        CrawlParamsData containing suggested crawl options
        
    Raises:
        ValueError: If request is invalid
        Exception: If the operation fails
    r   zPrompt cannot be empty)r   r&   z/v2/crawl/params-previewzcrawl params previewr_   rN   r:   r;   r<   r.   r=   r>   r?   r@   rA   rB   rC   rD   r!   rF   )r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7   r'   r9   r*   r'   Ninclude_tagsexclude_tagsonly_main_contentwait_forskip_tls_verificationremove_base64_images)includeTagsexcludeTagsonlyMainContentwaitForskipTlsVerificationremoveBase64Imagesformatsr   )ScrapeFormats)r   r   r`   ra   rW   )r   r   r   r&   rq   rr   r   rs   rk   rJ   rX   r   rL   listtypesr   r   rl   )rn   r   rP   rt   r]   params_dataconverted_paramsrQ   webhook_datarS   rR   scrape_opts_dataconverted_scrape_optsscrape_field_mappingsscrape_camelscrape_snakeformats_datar   keyvalues                       r"   crawl_params_previewr     sz     ; 0gk//11 0.///> 3!5!5!7!7 31222 {. L {{5|DDH ; @h(>??? MMOOM## ON#''33 ++!6 %>&@!6"8122/-!6
 
$ ##&y1L,-- ;.;.K.Kl.K.K ++.: +&4&:&:&<&< $	K $	K"J
[((00[5L5X'2:'>$,.)'5'5+>#-/F.D- -) 7L6Q6Q6S6S a a2l'+;;;BRS_B`1,? !$444'7	'B%lD99 L======?L}Ua?b?b?b1)<<?K1)< '7&<&<&>&> ? ?
U&;;;y@P@P9>1#63H$Z003>z3J$Z0 &++-- 	. 	.JC.(((- % %%*7	*BY'22!1222))'3KLLMMMr$   http_clientcrawl_idc                    |                      d| d          }|j        st          |d           	 |                                }|                     d|          }|                     dg           |                     d|                     dg                     d}t	          di |S # t
          $ r}t          d	|           d
}~ww xY w)a-  
    Get errors from a crawl job.
    
    Args:
        http_client: HTTP client for making requests
        crawl_id: The ID of the crawl job
        
    Returns:
        CrawlErrorsResponse containing errors and robots blocked URLs
        
    Raises:
        Exception: If the request fails
    r|   z/errorszcheck crawl errorsrN   errorsrobotsBlockedrobots_blocked)r   r   z'Failed to parse crawl errors response: NrW   )rk   rr   r   rs   r   rl   )r   r   rt   bodyr   
normalizedes          r"   get_crawl_errorsr   G  s     =H===>>H; >h(<===
G}}((64(( kk(B//%kk/7;;GWY[;\;\]]
 

 #00Z000 G G GE!EEFFFGs   A6B) )
C
3CC
c           
         |                      d          }|j        st          |d           |                                }|                     d          s#t	          |                     dd                    |                     dg           }g }|D ]}t          |t                    rz|                    |                     d          |                     d|                     d	                    |                     d
          |                     d          d           t          dd |D                       S )z
    Get a list of currently active crawl jobs.
    
    Args:
        client: HTTP client instance
        
    Returns:
        ActiveCrawlsResponse containing a list of active crawl jobs
        
    Raises:
        Exception: If the request fails
    z/v2/crawl/activezget active crawlsr_   r`   ra   crawlsrp   teamIdteam_idr   options)rp   r   r   r   Tc                 &    g | ]}t          d i |S )rW   )r   ).0ncs     r"   
<listcomp>z%get_active_crawls.<locals>.<listcomp>  s&    5d5d5dBk6G6GB6G6G5d5d5dr$   )r_   r   )	rk   rr   r   rs   rl   rJ   rX   rY   r   )rn   rt   r   	crawls_innormalized_crawlscs         r"   get_active_crawlsr   g  s8    zz,--H; =h(;<<<==??D88I E*BCCDDD2&&I  a 	$$eeDkk55155+;+;<<uuU||55++	& &     5d5dRc5d5d5deeeer$   )N)r   N).__doc__r   typingr   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   utils.normalizer   r#   rX   rT   r\   rK   rm   rv   floatr   r   r   boolr   intr   r   r   r   r   rW   r$   r"   <module>r      s     , , , , , , , , , , , ,                            g f f f f f f f f f f f 6 6 6 6 6 68\ 8d 8 8 8 8*HL HT H H H HVhtCy&9 d8n    S#X 4S>     N
  N\  Nm  N  N  N  NL 59>
 (,> > >>>   01>
 e_> > > > >J (,	$ $ $$$ e_	$
 $ $ $ $V 59	H (,H H HHH H~H   01	H e_H 
(^H H H HV6 6S 6T 6 6 6 64 !	+" (,+" +" +"+"+" +" c]	+" e_+" +" +" +" +"b !	) (,) ) ))) ) c]	) e_) ) ) ) )XuN uN6H uN_ uN uN uN uNpG* G G@S G G G G@ fj  f-A  f  f  f  f  f  fr$   