import * as cheerio from 'cheerio'
import logger from '@adonisjs/core/services/logger'
import { AuctionSchema, type AuctionData } from '../schemas/auction_schema.js'

/**
 * Configuration for the scraper
 */
const SCRAPER_CONFIG = {
  baseUrl: 'https://icetrade.by',
  requestDelay: 1000, // 1 second between requests
  timeout: 30000, // 30 seconds timeout
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  maxRetries: 3,
  retryDelay: 2000,
} as const

/**
 * Error thrown when scraping fails
 */
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly cause?: unknown
  ) {
    super(message)
    this.name = 'ScraperError'
  }
}

/**
 * ScraperService - Handles fetching and parsing auction data from icetrade.by
 *
 * Features:
 * - Rate limiting (1s delay between requests)
 * - Retry logic with linearly increasing backoff
 * - Comprehensive error handling
 * - Data validation with Zod
 * - Structured logging
 */
export class ScraperService {
  /**
   * Builds the URL for fetching auctions with all required parameters
   */
  private buildUrl(pageNumber: number): string {
    const params = new URLSearchParams({
      search_text: '',
      'zakup_type[1]': '1',
      'zakup_type[2]': '1',
      onPage: '100',
      sort: 'num:desc',
      p: pageNumber.toString(),
      // Status flags
      'r[1]': '1',
      'r[2]': '1',
      'r[3]': '1',
      'r[4]': '1',
      'r[5]': '1',
      'r[6]': '1',
      'r[7]': '1',
      // Trade type flags
      't[Trade]': '1',
      't[contest]': '1',
      't[request]': '1',
      't[qualification]': '1',
      't[negotiations]': '1',
    })

    return `${SCRAPER_CONFIG.baseUrl}/trades/index?${params.toString()}`
  }
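  // Illustrative result of buildUrl() above (values taken from SCRAPER_CONFIG; the
  // bracketed keys such as 'r[1]' are percent-encoded by URLSearchParams):
  //   buildUrl(2) =>
  //   'https://icetrade.by/trades/index?search_text=&zakup_type%5B1%5D=1&...&p=2&...&t%5Bnegotiations%5D=1'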
  /**
   * Fetches HTML content from the specified page with retry logic
   */
  async fetchPage(pageNumber: number): Promise<string> {
    const url = this.buildUrl(pageNumber)
    let lastError: Error | undefined

    logger.info(`Preparing to fetch URL: ${url}`)

    for (let attempt = 1; attempt <= SCRAPER_CONFIG.maxRetries; attempt++) {
      try {
        logger.info(
          `Fetching page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`,
          { url }
        )

        const controller = new AbortController()
        const timeoutId = setTimeout(() => controller.abort(), SCRAPER_CONFIG.timeout)

        const response = await fetch(url, {
          headers: {
            'User-Agent': SCRAPER_CONFIG.userAgent,
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            Connection: 'keep-alive',
          },
          signal: controller.signal,
        })

        clearTimeout(timeoutId)

        logger.info(`Received response: status=${response.status} ${response.statusText}`)

        if (!response.ok) {
          const bodyText = await response.text().catch(() => 'Unable to read response body')
          throw new Error(
            `HTTP ${response.status}: ${response.statusText}. Body: ${bodyText.substring(0, 200)}`
          )
        }

        const html = await response.text()

        if (!html || html.trim().length === 0) {
          throw new Error('Received empty response')
        }

        logger.info(`Successfully fetched page ${pageNumber}: ${html.length} bytes`)
        return html
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error))

        // Log detailed error information
        const errorDetails: Record<string, unknown> = {
          message: lastError.message,
          name: lastError.name,
          url,
        }

        // Add stack trace for non-HTTP errors
        if (!lastError.message.startsWith('HTTP ')) {
          errorDetails.stack = lastError.stack
        }

        // Check for specific error types
        if (lastError.name === 'AbortError') {
          errorDetails.reason = 'Request timeout after 30s'
        } else if (lastError.message.includes('fetch failed')) {
          errorDetails.reason = 'Network error - check DNS, firewall, or connectivity'
        } else if (lastError.message.includes('ENOTFOUND')) {
          errorDetails.reason = 'DNS resolution failed - domain not found'
        } else if (lastError.message.includes('ECONNREFUSED')) {
          errorDetails.reason = 'Connection refused - server not reachable'
        } else if (lastError.message.includes('ETIMEDOUT')) {
          errorDetails.reason = 'Connection timeout - server too slow or unreachable'
        }

        logger.warn(
          errorDetails,
          `Failed to fetch page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`
        )

        if (attempt < SCRAPER_CONFIG.maxRetries) {
          const delay = SCRAPER_CONFIG.retryDelay * attempt
          logger.info(`Retrying in ${delay}ms...`)
          await this.delay(delay)
        }
      }
    }

    const finalError = new ScraperError(
      `Failed to fetch page ${pageNumber} after ${SCRAPER_CONFIG.maxRetries} attempts: ${lastError?.message}`,
      lastError
    )

    logger.error('All fetch attempts failed', {
      pageNumber,
      url,
      lastErrorMessage: lastError?.message,
      lastErrorName: lastError?.name,
    })

    throw finalError
  }
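  /*
   * The parser below assumes result rows shaped roughly like this sketch
   * (illustrative only; the live icetrade.by markup may differ in details):
   *
   *   <table class="auctions w100">
   *     <tbody>
   *       <tr>
   *         <td>12345</td>                                      <!-- auction number -->
   *         <td><a href="/relative/link/to/auction">Title</a></td> <!-- title + link -->
   *         <td>Organization</td>                               <!-- organizer -->
   *         <td>Status</td>                                     <!-- current status -->
   *         <td>01.01.2025</td>                                 <!-- deadline (optional) -->
   *       </tr>
   *     </tbody>
   *   </table>
   */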
  /**
   * Parses HTML content and extracts auction data
   */
  parsePage(html: string): AuctionData[] {
    try {
      const $ = cheerio.load(html)
      const auctions: AuctionData[] = []

      // Find the auctions table
      const auctionsTable = $('table.auctions.w100')

      if (auctionsTable.length === 0) {
        logger.warn('No auctions table found in HTML')
        return []
      }

      // Parse each auction row
      const rows = auctionsTable.find('tbody tr')
      logger.info(`Found ${rows.length} auction rows to parse`)

      rows.each((index, element) => {
        try {
          const row = $(element)

          // Extract auction data from table cells
          const cells = row.find('td')

          if (cells.length < 4) {
            logger.warn(`Row ${index} has insufficient cells, skipping`)
            return
          }

          // Extract auction number (typically in first cell)
          const auctionNumCell = $(cells[0])
          const auctionNum = auctionNumCell.text().trim()

          // Extract title and link (typically in second cell within an <a> tag)
          const titleCell = $(cells[1])
          const titleLink = titleCell.find('a').first()
          const title = titleLink.text().trim()
          const link = titleLink.attr('href')?.trim() || ''

          // Skip if link is empty or missing
          if (!link) {
            logger.warn(`Row ${index} has missing or empty link, skipping`)
            return
          }

          // Make link absolute if it's relative
          const absoluteLink = link.startsWith('http')
            ? link
            : `${SCRAPER_CONFIG.baseUrl}${link.startsWith('/') ? link : `/${link}`}`

          // Extract organization (typically in third cell)
          const organizationCell = $(cells[2])
          const organization = organizationCell.text().trim()

          // Extract status (typically in fourth cell)
          const statusCell = $(cells[3])
          const status = statusCell.text().trim()

          // Extract deadline if available (typically in fifth cell)
          const deadlineCell = $(cells[4])
          const deadline = deadlineCell.text().trim() || null

          // Description can be extracted from title cell's additional text or separate element
          const description = titleCell.find('.description').text().trim() || null

          // Validate with Zod schema
          const result = AuctionSchema.safeParse({
            auctionNum,
            title,
            organization,
            status,
            deadline,
            link: absoluteLink,
            description,
          })

          if (!result.success) {
            logger.warn(`Validation failed for auction at row ${index}`, {
              errors: result.error.issues,
              data: { auctionNum, title },
            })
            return
          }

          auctions.push(result.data)
        } catch (error) {
          logger.error(`Error parsing auction row ${index}`, {
            error: error instanceof Error ? error.message : String(error),
          })
        }
      })

      logger.info(`Successfully parsed ${auctions.length} valid auctions`)
      return auctions
    } catch (error) {
      throw new ScraperError(
        'Failed to parse HTML content',
        error instanceof Error ? error : new Error(String(error))
      )
    }
  }

  /**
   * Scrapes multiple pages of auctions with rate limiting
   *
   * @param maxPages - Maximum number of pages to scrape (default: 1)
   * @returns Array of all parsed auction data
   */
  async scrapeAuctions(maxPages: number = 1): Promise<AuctionData[]> {
    if (maxPages < 1) {
      throw new Error('maxPages must be at least 1')
    }

    logger.info(`Starting scrape of ${maxPages} page(s)`)

    const allAuctions: AuctionData[] = []

    for (let page = 1; page <= maxPages; page++) {
      try {
        // Fetch page HTML
        const html = await this.fetchPage(page)

        // Parse auctions from HTML
        const auctions = this.parsePage(html)
        allAuctions.push(...auctions)

        logger.info(`Page ${page}/${maxPages}: Found ${auctions.length} auctions`)

        // Rate limiting: wait before next request (except for last page)
        if (page < maxPages) {
          logger.debug(`Waiting ${SCRAPER_CONFIG.requestDelay}ms before next request`)
          await this.delay(SCRAPER_CONFIG.requestDelay)
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error))
        logger.error(`Error scraping page ${page}`, {
          message: err.message,
          name: err.name,
          stack: err.stack,
          cause: err.cause,
        })

        // Continue to next page instead of failing completely
        // This ensures partial data is still returned
        continue
      }
    }

    logger.info(
      `Scraping completed: ${allAuctions.length} total auctions from ${maxPages} page(s)`
    )

    return allAuctions
  }

  /**
   * Helper method for delays (rate limiting, retries)
   */
  private delay(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }
}
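
/*
 * Example usage (a minimal sketch; assumes the service is instantiated directly
 * rather than resolved through the AdonisJS IoC container):
 *
 *   const scraper = new ScraperService()
 *   const auctions = await scraper.scrapeAuctions(3) // fetch and parse pages 1-3
 *   logger.info(`Collected ${auctions.length} auctions`)
 */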