import * as cheerio from 'cheerio'
import logger from '@adonisjs/core/services/logger'
import { AuctionSchema, type AuctionData } from '../schemas/auction_schema.js'

/**
 * Configuration for the scraper
 */
const SCRAPER_CONFIG = {
  baseUrl: 'https://icetrade.by',
  requestDelay: 1000, // 1 second between requests
  timeout: 30000, // 30 seconds timeout
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  maxRetries: 3,
  retryDelay: 2000,
} as const

/**
 * Error thrown when scraping fails
 */
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly cause?: unknown
  ) {
    super(message)
    this.name = 'ScraperError'
  }
}

/**
 * ScraperService - Handles fetching and parsing auction data from icetrade.by
 *
 * Features:
 * - Rate limiting (1s delay between requests)
 * - Retry logic with a linearly increasing delay between attempts
 * - Comprehensive error handling
 * - Data validation with Zod
 * - Structured logging
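 *
 * Usage sketch (illustrative only; how this service is wired into a command or
 * controller is outside this file, and the call sites below are assumptions):
 * @example
 * const scraper = new ScraperService()
 * // Pages that fail are logged and skipped, so this resolves with whatever was parsed:
 * const auctions = await scraper.scrapeAuctions(3)
 * // Lower-level calls surface failures as ScraperError, with the original error in `cause`:
 * try {
 *   const firstPage = scraper.parsePage(await scraper.fetchPage(1))
 * } catch (error) {
 *   if (error instanceof ScraperError) console.error(error.message, error.cause)
 * }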
 */
export class ScraperService {
  /**
   * Builds the URL for fetching auctions with all required parameters
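   *
   * For illustration, buildUrl(1) produces a URL of the form below (brackets are
   * percent-encoded by URLSearchParams; the remaining flag parameters are elided here):
   * https://icetrade.by/trades/index?search_text=&zakup_type%5B1%5D=1&zakup_type%5B2%5D=1&onPage=100&sort=num%3Adesc&p=1&...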
   */
  private buildUrl(pageNumber: number): string {
    const params = new URLSearchParams({
      search_text: '',
      'zakup_type[1]': '1',
      'zakup_type[2]': '1',
      onPage: '100',
      sort: 'num:desc',
      p: pageNumber.toString(),
      // Status flags
      'r[1]': '1',
      'r[2]': '1',
      'r[3]': '1',
      'r[4]': '1',
      'r[5]': '1',
      'r[6]': '1',
      'r[7]': '1',
      // Trade type flags
      't[Trade]': '1',
      't[contest]': '1',
      't[request]': '1',
      't[qualification]': '1',
      't[negotiations]': '1',
    })

    return `${SCRAPER_CONFIG.baseUrl}/trades/index?${params.toString()}`
  }

  /**
   * Fetches HTML content from the specified page with retry logic
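   *
   * Makes up to SCRAPER_CONFIG.maxRetries attempts; each attempt is aborted after
   * SCRAPER_CONFIG.timeout ms, and the wait before a retry grows linearly
   * (retryDelay * attempt, i.e. 2 s and then 4 s with the current config).
   * Throws a ScraperError once every attempt has failed.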
   */
  async fetchPage(pageNumber: number): Promise<string> {
    const url = this.buildUrl(pageNumber)
    let lastError: Error | undefined

    logger.info(`Preparing to fetch URL: ${url}`)

    for (let attempt = 1; attempt <= SCRAPER_CONFIG.maxRetries; attempt++) {
      try {
        logger.info(
          { url },
          `Fetching page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`
        )

        const controller = new AbortController()
        const timeoutId = setTimeout(() => controller.abort(), SCRAPER_CONFIG.timeout)

        const response = await fetch(url, {
          headers: {
            'User-Agent': SCRAPER_CONFIG.userAgent,
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            Connection: 'keep-alive',
          },
          signal: controller.signal,
        })

        clearTimeout(timeoutId)

        logger.info(`Received response: status=${response.status} ${response.statusText}`)

        if (!response.ok) {
          const bodyText = await response.text().catch(() => 'Unable to read response body')
          throw new Error(
            `HTTP ${response.status}: ${response.statusText}. Body: ${bodyText.substring(0, 200)}`
          )
        }

        const html = await response.text()

        if (!html || html.trim().length === 0) {
          throw new Error('Received empty response')
        }

        logger.info(`Successfully fetched page ${pageNumber}: ${html.length} bytes`)
        return html
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error))

        // Log detailed error information
        const errorDetails: Record<string, any> = {
          message: lastError.message,
          name: lastError.name,
          url,
        }

        // Add stack trace for non-HTTP errors
        if (!lastError.message.startsWith('HTTP ')) {
          errorDetails.stack = lastError.stack
        }

        // Check for specific error types
        if (lastError.name === 'AbortError') {
          errorDetails.reason = 'Request timeout after 30s'
        } else if (lastError.message.includes('fetch failed')) {
          errorDetails.reason = 'Network error - check DNS, firewall, or connectivity'
        } else if (lastError.message.includes('ENOTFOUND')) {
          errorDetails.reason = 'DNS resolution failed - domain not found'
        } else if (lastError.message.includes('ECONNREFUSED')) {
          errorDetails.reason = 'Connection refused - server not reachable'
        } else if (lastError.message.includes('ETIMEDOUT')) {
          errorDetails.reason = 'Connection timeout - server too slow or unreachable'
        }

        logger.warn(errorDetails, `Failed to fetch page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`)

        if (attempt < SCRAPER_CONFIG.maxRetries) {
          const delay = SCRAPER_CONFIG.retryDelay * attempt
          logger.info(`Retrying in ${delay}ms...`)
          await this.delay(delay)
        }
      }
    }

    const finalError = new ScraperError(
      `Failed to fetch page ${pageNumber} after ${SCRAPER_CONFIG.maxRetries} attempts: ${lastError?.message}`,
      lastError
    )

    logger.error(
      {
        pageNumber,
        url,
        lastErrorMessage: lastError?.message,
        lastErrorName: lastError?.name,
      },
      'All fetch attempts failed'
    )

    throw finalError
  }

  /**
   * Parses HTML content and extracts auction data
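   *
   * The selectors below assume listing markup of roughly this shape (an
   * illustrative sketch, not an exact copy of the icetrade.by page):
   *
   *   <table class="auctions w100">
   *     <tbody>
   *       <tr>
   *         <td>12345678</td>                              <!-- auction number -->
   *         <td>
   *           <a href="/auction/12345678">Title</a>        <!-- title + link -->
   *           <div class="description">Optional text</div>
   *         </td>
   *         <td>Organization</td>
   *         <td>Status</td>
   *         <td>Deadline</td>
   *       </tr>
   *     </tbody>
   *   </table>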
   */
  parsePage(html: string): AuctionData[] {
    try {
      const $ = cheerio.load(html)
      const auctions: AuctionData[] = []

      // Find the auctions table
      const auctionsTable = $('table.auctions.w100')

      if (auctionsTable.length === 0) {
        logger.warn('No auctions table found in HTML')
        return []
      }

      // Parse each auction row
      const rows = auctionsTable.find('tbody tr')
      logger.info(`Found ${rows.length} auction rows to parse`)

      rows.each((index, element) => {
        try {
          const row = $(element)

          // Extract auction data from table cells
          const cells = row.find('td')

          if (cells.length < 4) {
            logger.warn(`Row ${index} has insufficient cells, skipping`)
            return
          }

          // Extract auction number (typically in first cell)
          const auctionNumCell = $(cells[0])
          const auctionNum = auctionNumCell.text().trim()

          // Extract title and link (typically in second cell with <a> tag)
          const titleCell = $(cells[1])
          const titleLink = titleCell.find('a').first()
          const title = titleLink.text().trim()
          const link = titleLink.attr('href')?.trim() || ''

          // Skip if link is empty or missing
          if (!link) {
            logger.warn(`Row ${index} has missing or empty link, skipping`)
            return
          }

          // Make link absolute if it's relative
          const absoluteLink = link.startsWith('http')
            ? link
            : `${SCRAPER_CONFIG.baseUrl}${link.startsWith('/') ? link : `/${link}`}`

          // Extract organization (typically in third cell)
          const organizationCell = $(cells[2])
          const organization = organizationCell.text().trim()

          // Extract status (typically in fourth cell)
          const statusCell = $(cells[3])
          const status = statusCell.text().trim()

          // Extract deadline if available (typically in fifth cell)
          const deadlineCell = $(cells[4])
          const deadline = deadlineCell.text().trim() || null

          // Description can be extracted from title cell's additional text or separate element
          const description = titleCell.find('.description').text().trim() || null

          // Validate with Zod schema
          const result = AuctionSchema.safeParse({
            auctionNum,
            title,
            organization,
            status,
            deadline,
            link: absoluteLink,
            description,
          })

          if (!result.success) {
            logger.warn(
              {
                errors: result.error.issues,
                data: { auctionNum, title },
              },
              `Validation failed for auction at row ${index}`
            )
            return
          }

          auctions.push(result.data)
        } catch (error) {
          logger.error(
            { error: error instanceof Error ? error.message : String(error) },
            `Error parsing auction row ${index}`
          )
        }
      })

      logger.info(`Successfully parsed ${auctions.length} valid auctions`)
      return auctions
    } catch (error) {
      throw new ScraperError(
        'Failed to parse HTML content',
        error instanceof Error ? error : new Error(String(error))
      )
    }
  }

  /**
   * Scrapes multiple pages of auctions with rate limiting
   *
   * @param maxPages - Maximum number of pages to scrape (default: 1)
   * @returns Array of all parsed auction data
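   *
   * Pages that fail to fetch or parse are logged and skipped, so the result
   * contains whatever could be scraped rather than rejecting the whole run.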
   */
  async scrapeAuctions(maxPages: number = 1): Promise<AuctionData[]> {
    if (maxPages < 1) {
      throw new Error('maxPages must be at least 1')
    }

    logger.info(`Starting scrape of ${maxPages} page(s)`)
    const allAuctions: AuctionData[] = []

    for (let page = 1; page <= maxPages; page++) {
      try {
        // Fetch page HTML
        const html = await this.fetchPage(page)

        // Parse auctions from HTML
        const auctions = this.parsePage(html)

        allAuctions.push(...auctions)

        logger.info(`Page ${page}/${maxPages}: Found ${auctions.length} auctions`)

        // Rate limiting: wait before next request (except for last page)
        if (page < maxPages) {
          logger.debug(`Waiting ${SCRAPER_CONFIG.requestDelay}ms before next request`)
          await this.delay(SCRAPER_CONFIG.requestDelay)
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error))
        logger.error(
          {
            message: err.message,
            name: err.name,
            stack: err.stack,
            cause: err.cause,
          },
          `Error scraping page ${page}`
        )

        // Continue to next page instead of failing completely
        // This ensures partial data is still returned
        continue
      }
    }

    logger.info(
      `Scraping completed: ${allAuctions.length} total auctions from ${maxPages} page(s)`
    )

    return allAuctions
  }

  /**
   * Helper method for delays (rate limiting, retries)
   */
  private delay(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }
}