extractDataBatch method
Extracts data from multiple URLs concurrently
urls is the list of URLs to fetch
selector is the CSS selector to use
attribute is the attribute to extract (optional)
asText is whether to extract the text content (default: true)
headers are additional headers to send with the request
timeout is the timeout for the request in milliseconds
retries is the number of retry attempts
ignoreRobotsTxt is whether to ignore robots.txt rules (default: false)
onProgress is a callback for progress updates
Implementation
/// Extracts data from every URL in [urls] by scheduling one task per URL on
/// the task queue.
///
/// Each task fetches the page HTML (forwarding [headers], [timeout],
/// [retries] and [ignoreRobotsTxt] to the fetch) and then runs the
/// extraction with [selector], [attribute] and [asText].
///
/// Returns a map from URL to the values extracted for that URL. URLs whose
/// task failed are logged and left out of the map; the returned future never
/// completes with an error because of a failed URL. If the same URL appears
/// more than once, the entry is overwritten by whichever task finishes last.
/// An empty [urls] list yields an empty map immediately.
///
/// [onProgress] is invoked once per finished task, successful or failed,
/// with the number of finished tasks, the total number of tasks, and the URL
/// that just finished.
Future<Map<String, List<String>>> extractDataBatch({
  required List<String> urls,
  required String selector,
  String? attribute,
  bool asText = true,
  Map<String, String>? headers,
  int? timeout,
  int? retries,
  bool ignoreRobotsTxt = false,
  void Function(int completed, int total, String url)? onProgress,
}) async {
  _logger.info('Extracting data batch: ${urls.length} URLs');

  final results = <String, List<String>>{};

  // Snapshot the task count once: the completion check must not depend on
  // the caller leaving the list untouched while the batch is running.
  final total = urls.length;

  // With no URLs no task callback would ever fire, so the completer below
  // would never complete and the caller would wait forever.
  if (total == 0) {
    _logger.success('Batch completed successfully');
    return results;
  }

  final errors = <String, Object?>{};
  final completer = Completer<Map<String, List<String>>>();
  var completed = 0;

  // Records one finished task (success or failure), reports progress, and
  // completes the batch once every task has reported back.
  void markFinished(String url) {
    completed++;
    onProgress?.call(completed, total, url);

    // The isCompleted guard is defensive: completing twice would throw.
    if (completed < total || completer.isCompleted) {
      return;
    }

    if (errors.isNotEmpty) {
      _logger.warning(
        'Batch completed with ${errors.length} errors: ${errors.keys.join(', ')}',
      );
    } else {
      _logger.success('Batch completed successfully');
    }
    completer.complete(results);
  }

  // Add each URL as a task.
  for (final url in urls) {
    _taskQueue.addTask<List<String>>(
      task: () async {
        // First fetch the HTML.
        final html = await _webScraper.fetchHtml(
          url: url,
          headers: headers,
          timeout: timeout,
          retries: retries,
          ignoreRobotsTxt: ignoreRobotsTxt,
        );
        // Then extract the data from the HTML.
        return _webScraper.extractData(
          html: html,
          selector: selector,
          attribute: attribute,
          asText: asText,
        );
      },
      priority: 0,
      taskName: 'ExtractData-$url',
      onStart: () {
        _logger.info('Starting extraction for URL: $url');
      },
      onComplete: (result) {
        _logger.success('Extraction completed for URL: $url');
        results[url] = result;
        markFinished(url);
      },
      onError: (error, stackTrace) {
        _logger.error('Extraction failed for URL: $url - $error');
        errors[url] = error;
        markFinished(url);
      },
    );
  }

  return completer.future;
}