
    i@                        d Z ddlZddlmZmZmZmZmZmZ ddl	m
Z
mZmZmZmZmZmZ ddlmZmZmZmZ ddlmZ ddl	mZ d	eee                  d
ee         fdZdeeef         d
eeef         fdZddddddddddedee         dee         deeeef                  dee         dee         dee         dee         dee         dee         d
efdZ	 d1dededee         d
efdZdddededee          d
efd Z!	 d1deded!ee         dee         d
ee         f
d"Z"deded
efd#Z#	 	 d2deded$ed%ee         d
ef
d&Z$ddddddddddd'
dedee         dee         deeeef                  dee         dee         dee         dee         dee         dee         d$ed%ee         d
efd(Z%dee         d
ee         fd)Z&dddddddd*dee         dee         deeeef                  dee         dee         dee         dee         dee         d
e'fd+Z(d3dee         d-ed
eee                  fd.Z)	 	 	 	 d4dedee         dee         d-ed$ed%ee         d
ee         fd/Z*deded
efd0Z+dS )5z4
Batch scraping functionality for Firecrawl v2 API.
    N)OptionalListCallableDictAnyUnion   )BatchScrapeRequestBatchScrapeResponseBatchScrapeJobScrapeOptionsDocumentWebhookConfigPaginationConfig)
HttpClienthandle_response_errorvalidate_scrape_optionsprepare_scrape_options)normalize_document_input)CrawlErrorsResponse	data_listreturnc                     g }| pg D ]E}t          |t                    r.t          |          }|                    t	          di |           F|S )N )
isinstancedictr   appendr   )r   	documentsdoc
normalizeds       d/home/agentuser/.hermes/hermes-agent/venv/lib/python3.11/site-packages/firecrawl/v2/methods/batch.py_parse_batch_scrape_documentsr"      sb     "IB 5 5c4   	51#66JX33
33444    bodyc                    |                      d          s#t          |                      dd                    |                      d          |                      dd          |                      dd          |                      d          |                      d	          |                      d
          t          |                      dg           pg           dS )NsuccesserrorUnknown error occurredstatus	completedr   totalcreditsUsed	expiresAtnextdatar)   r*   r+   credits_used
expires_atr.   r/   )get	Exceptionr"   )r$   s    r!   #_parse_batch_scrape_status_responser5      s    88I E*BCCDDD ((8$$XXk1--'1%%//hh{++  -dhhvr.B.B.HbII  r#   optionswebhookappend_to_idignore_invalid_urlsmax_concurrencyzero_data_retentionintegrationidempotency_keyclienturlsr7   r8   r9   r:   r;   r<   r=   r>   c          
         t          ||||||||          }
|                     |	          }|                     d|
|          }|j        st	          |d           |                                }|                    d          s#t          |                    dd                    t          |                    d          |                    d	          |                    d
          pd          S )aW  
    Start a batch scrape job for multiple URLs.
    
    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options
        
    Returns:
        BatchScrapeResponse containing job information
        
    Raises:
        FirecrawlError: If the batch scrape operation fails to start
    r7   r8   r9   r:   r;   r<   r=   z/v2/batch/scrape)headerszstart batch scraper&   r'   r(   idurlinvalidURLsN)rD   rE   invalid_urls)	prepare_batch_scrape_request_prepare_headerspostokr   jsonr3   r4   r   )r?   r@   r7   r8   r9   r:   r;   r<   r=   r>   request_datarC   responser$   s                 r!   start_batch_scraperO   -   s    8 0!/'/	 	 	L %%o66G{{-|W{MMH ; >h(<=== ==??D88I E*BCCDDD88D>>HHUOOXXm,,4   r#   job_idpagination_configc           	         |                      d|           }|j        st          |d           |                                }t	          |          }|d         }|r|j        nd}|r |d         rt          | |d         ||          }t          |d         |d         |d         |d	         |d
         |s|d         nd|          S )ak  
    Get the status of a batch scrape job.
    
    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job
        pagination_config: Optional configuration for pagination behavior
        
    Returns:
        BatchScrapeJob containing job status and data
        
    Raises:
        FirecrawlError: If the status check fails
    /v2/batch/scrape/zget batch scrape statusr/   Tr.   r)   r*   r+   r1   r2   Nr0   )r3   rK   r   rL   r5   auto_paginate_fetch_all_batch_pagesr   )r?   rP   rQ   rN   r$   payloadr   rT   s           r!   get_batch_scrape_statusrW   g   s    ( zz6f6677H ; Ch(ABBB ==??D1$77GI 8IR%33dM 
 
*FO	
 
	 x +&g^,<($1;WV__t   r#   )request_timeoutnext_urlrX   c          
      $   |                      ||          }|j        st          |d           |                                }t	          |          }t          |d         |d         |d         |d         |d         |d         |d	         
          S )a  
    Fetch a single page of batch scrape results using the provided next URL.

    Args:
        client: HTTP client instance
        next_url: Opaque next URL from a prior batch scrape status response
        request_timeout: Timeout (in seconds) for the HTTP request

    Returns:
        BatchScrapeJob with the page data and next URL (if any)

    Raises:
        Exception: If the request fails or returns an error response
    )timeoutzget batch scrape status pager)   r*   r+   r1   r2   r.   r/   r0   )r3   rK   r   rL   r5   r   )r?   rY   rX   rN   r$   rV   s         r!   get_batch_scrape_status_pager\      s    ( zz(Oz<<H; Hh(FGGG==??D1$77Gx +&g^,<(V_V_   r#   initial_documentsc                    |                                 }|}d}|r|j        nd}|r|j        nd}|r|j        nd}	t	          j                    }
|r|||k    rn|	t	          j                    |
z
  |	k    rn|                     |          }|j        s8ddl}|	                    d          }|
                    dd|j        i           n|                                }	 t          |          }n# t          $ r Y naw xY w|d         D ].}|t          |          |k    r n|                    |           /|t          |          |k    rn|d         }|d	z  }||S )
aR  
    Fetch all pages of batch scrape results.
    
    Args:
        client: HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits
        
    Returns:
        List of all documents from all pages
    r   N	firecrawlzFailed to fetch next pagestatus_code)extrar/   r.      )copy	max_pagesmax_resultsmax_wait_timetime	monotonicr3   rK   logging	getLoggerwarningr`   rL   r5   r4   lenr   )r?   rY   r]   rQ   r   current_url
page_countrd   re   rf   
start_timerN   ri   logger	page_datapage_payloaddocuments                    r!   rU   rU      s   $ "&&((IKJ 0AJ!++dI3DN#//$K7HR%33dM!!J
 %!zY'>'>%DN,<,<z,I]+Z+Z ::k**{ 	NNN&&{33FNN6}hNb>cNdddMMOO		>yIILL 	 	 	E	 %V, 	' 	'H&3y>>[+H+HX&&&& "s9~~'D'D #6*a
K  %N s    C0 0
C=<C=c                     |                      d|           }|j        st          |d           |                                }|                    d          dk    S )a(  
    Cancel a running batch scrape job.
    
    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job to cancel
        
    Returns:
        BatchScrapeStatusResponse with updated status
        
    Raises:
        FirecrawlError: If the cancellation fails
    rS   zcancel batch scraper)   	cancelled)deleterK   r   rL   r3   )r?   rP   rN   r$   s       r!   cancel_batch_scraperw   	  sa    $ }}999::H ; ?h(=>>> ==??D88H,,r#   poll_intervalr[   c                     t          j                    }	 t          | |          }|j        dv r|S |r0t          j                    |z
  |k    rt	          d| d| d          t          j        |           b)a  
    Wait for a batch scrape job to complete, polling for status updates.
    
    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)
        
    Returns:
        BatchScrapeStatusResponse when job completes
        
    Raises:
        FirecrawlError: If the job fails or timeout is reached
        TimeoutError: If timeout is reached
    T)r*   failedru   zBatch scrape job z did not complete within z seconds)rg   rh   rW   r)   TimeoutErrorsleep)r?   rP   rx   r[   ro   
status_jobs         r!   wait_for_batch_completionr~   &  s    , !!J",VV<<
  DDD  	g((:5@@e6eeT[eeefff 	
=!!!"r#   )
r7   r8   r9   r:   r;   r<   r=   r>   rx   r[   c       
         f    t          | |||||||||	
  
        }|j        }t          | ||
|          S )a  
    Start a batch scrape job and wait for it to complete.
    
    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)
        
    Returns:
        BatchScrapeStatusResponse when job completes
        
    Raises:
        FirecrawlError: If the batch scrape fails to start or complete
        TimeoutError: If timeout is reached
    r6   )rO   rD   r~   )r?   r@   r7   r8   r9   r:   r;   r<   r=   r>   rx   r[   startrP   s                 r!   batch_scraper   M  s]    B !/'/'  E XF %w  r#   c                 N   | st          d          g }| D ]}|rt          |t                    st          d|           |                    d          s'|                    d          st          d|           |                    |                                           |S )z
    Validate and normalize a list of URLs for batch scraping.
    
    Args:
        urls: List of URLs to validate
        
    Returns:
        Validated list of URLs
        
    Raises:
        ValueError: If URLs are invalid
    zURLs list cannot be emptyzInvalid URL: zhttp://zhttps://z)URL must start with http:// or https://: )
ValueErrorr   str
startswithr   strip)r@   validated_urlsrE   s      r!   validate_batch_urlsr     s      64555N + + 	4*S#.. 	42S22333 y)) 	PS^^J-G-G 	PNNNOOOciikk****r#   rB   c                l   t          |           }d|i}	|r&t          |          }
|
r|	                    |
           |4t          |t                    r||	d<   n|                    d          |	d<   |||	d<   |||	d<   |||	d<   |||	d	<   |$t	          |                                          |	d
<   |	S )z
    Prepare a batch scrape request payload.
    
    Args:
        urls: List of URLs to scrape
        options: Scraping options
        
    Returns:
        Request payload dictionary
    r@   Nr8   T)exclude_none
appendToIdignoreInvalidURLsmaxConcurrencyzeroDataRetentionr=   )r   r   updater   r   
model_dumpr   )r@   r7   r8   r9   r:   r;   r<   r=   r   rM   scrape_datas              r!   rH   rH     s    * )..N$*N#;L  -,W55 	-,,, gs## 	L&-L##&-&8&8d&8&K&KL#%1\"&,?()")8%&&,?()&)+&6&6&<&<&>&>]#r#   d   
chunk_sizec                     g }t          dt          |           |          D ]"}|                    | |||z                       #|S )z
    Split a large list of URLs into smaller chunks for batch processing.
    
    Args:
        urls: List of URLs to chunk
        chunk_size: Maximum size of each chunk
        
    Returns:
        List of URL chunks
    r   )rangerl   r   )r@   r   chunksis       r!   
chunk_urlsr     sR     F1c$ii,, . .d1Q^+,----Mr#   c                     t          ||          }g }d}|D ]<}	t          | |	|||          }
|
j        r|                    |
j                   |dz  }=|S )a  
    Process a large batch of URLs by splitting into smaller chunks.
    
    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options
        chunk_size: Size of each batch chunk
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait per chunk
        
    Returns:
        List of all scraped documents
        
    Raises:
        FirecrawlError: If any chunk fails
    r   )r7   rx   r[   rb   )r   r   r/   extend)r?   r@   r7   r   rx   r[   
url_chunksall_documentscompleted_chunkschunkresults              r!   process_large_batchr     s    2 D*--JM  '
 
 
 ; 	.  ---Ar#   c                 P   |                      d| d          }|j        st          |d           |                                }|                     d|          }|                     dg           |                     d|                     dg                     d}t	          d	i |S )
z
    Get errors for a batch scrape job.

    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job

    Returns:
        CrawlErrorsResponse with errors and robots-blocked URLs
    rS   z/errorszget batch scrape errorsr/   errorsrobotsBlockedrobots_blocked)r   r   r   )r3   rK   r   rL   r   )r?   rP   rN   r$   rV   r    s         r!   get_batch_scrape_errorsr     s     zz=f===>>H; Ch(ABBB==??Dhhvt$$G++h++!++ow{{CSUW7X7XYY J ,,,,,r#   )N)r	   N)r   )Nr   r	   N),__doc__rg   typingr   r   r   r   r   r   typesr
   r   r   r   r   r   r   utilsr   r   r   r   utils.normalizer   r   r"   r   r5   boolintrO   rW   floatr\   rU   rw   r~   r   r   r   rH   r   r   r   r   r#   r!   <module>r      s_     = = = = = = = = = = = = = = = =                  g f f f f f f f f f f f 6 6 6 6 6 6 ' ' ' ' ' 'Xd3i-@ T(^    d38n c3h    & (,37"&*.%)*.!%%)7 7 77
s)7 m$	7
 eC./07 3-7 "$7 c]7 "$7 #7 c]7 7 7 7 7z 591 111   011 	1 1 1 1p (,	$ $ $$$ e_	$
 $ $ $ $V 59	D DDD H~D   01	D
 
(^D D D DN--- 
- - - -@ !	$" $"$"$" $" c]	$"
 $" $" $" $"V (,37"&*.%)*.!%%)!3 3 33
s)3 m$	3
 eC./03 3-3 "$3 c]3 "$3 #3 c]3 3 c]3 3 3 3 3ld3i DI    B (,37"&*.%)*.!%/ / /
s)/ m$/ eC./0	/
 3-/ "$/ c]/ "$/ #/ 
/ / / /d T#Y C $tCy/    ( (,!- --
s)- m$- 	-
 - c]- 
(^- - - -`-J - -@S - - - - - -r#   