import * as cheerio from 'cheerio'
import logger from '@adonisjs/core/services/logger'
import { AuctionSchema, type AuctionData } from '../schemas/auction_schema.js'

/**
 * Configuration for the scraper
 */
const SCRAPER_CONFIG = {
  baseUrl: 'https://icetrade.by',
  requestDelay: 1000, // 1 second between requests
  timeout: 30000, // 30 seconds timeout
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  maxRetries: 3,
  retryDelay: 2000,
} as const

/**
 * Error thrown when scraping fails
 */
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly cause?: unknown
  ) {
    super(message)
    this.name = 'ScraperError'
  }
}

/**
 * ScraperService - Handles fetching and parsing auction data from icetrade.by
 *
 * Features:
 * - Rate limiting (1s delay between requests)
 * - Retry logic with a linearly increasing delay between attempts
 * - Comprehensive error handling
 * - Data validation with Zod
 * - Structured logging
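 *
 * Usage sketch (illustrative only; how this service is wired into a command or
 * controller is outside this file, and the call sites below are assumptions):
 * @example
 * const scraper = new ScraperService()
 * // Pages that fail are logged and skipped, so this resolves with whatever was parsed:
 * const auctions = await scraper.scrapeAuctions(3)
 * // Lower-level calls surface failures as ScraperError, with the original error in `cause`:
 * try {
 *   const firstPage = scraper.parsePage(await scraper.fetchPage(1))
 * } catch (error) {
 *   if (error instanceof ScraperError) console.error(error.message, error.cause)
 * }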
 */
export class ScraperService {
  /**
   * Builds the URL for fetching auctions with all required parameters
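   *
   * For illustration, buildUrl(1) produces a URL of the form below (brackets are
   * percent-encoded by URLSearchParams; the remaining flag parameters are elided here):
   * https://icetrade.by/trades/index?search_text=&zakup_type%5B1%5D=1&zakup_type%5B2%5D=1&onPage=100&sort=num%3Adesc&p=1&...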
   */
  private buildUrl(pageNumber: number): string {
    const params = new URLSearchParams({
      search_text: '',
      'zakup_type[1]': '1',
      'zakup_type[2]': '1',
      onPage: '100',
      sort: 'num:desc',
      p: pageNumber.toString(),
      // Status flags
      'r[1]': '1',
      'r[2]': '1',
      'r[3]': '1',
      'r[4]': '1',
      'r[5]': '1',
      'r[6]': '1',
      'r[7]': '1',
      // Trade type flags
      't[Trade]': '1',
      't[contest]': '1',
      't[request]': '1',
      't[qualification]': '1',
      't[negotiations]': '1',
    })

    return `${SCRAPER_CONFIG.baseUrl}/trades/index?${params.toString()}`
  }

  /**
   * Fetches HTML content from the specified page with retry logic
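   *
   * Makes up to SCRAPER_CONFIG.maxRetries attempts; each attempt is aborted after
   * SCRAPER_CONFIG.timeout ms, and the wait before a retry grows linearly
   * (retryDelay * attempt, i.e. 2 s and then 4 s with the current config).
   * Throws a ScraperError once every attempt has failed.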
   */
  async fetchPage(pageNumber: number): Promise<string> {
    const url = this.buildUrl(pageNumber)
    let lastError: Error | undefined

    logger.info(`Preparing to fetch URL: ${url}`)

    for (let attempt = 1; attempt <= SCRAPER_CONFIG.maxRetries; attempt++) {
      try {
        logger.info(
          { url },
          `Fetching page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`
        )

        const controller = new AbortController()
        const timeoutId = setTimeout(() => controller.abort(), SCRAPER_CONFIG.timeout)

        const response = await fetch(url, {
          headers: {
            'User-Agent': SCRAPER_CONFIG.userAgent,
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            Connection: 'keep-alive',
          },
          signal: controller.signal,
        })

        clearTimeout(timeoutId)

        logger.info(`Received response: status=${response.status} ${response.statusText}`)

        if (!response.ok) {
          const bodyText = await response.text().catch(() => 'Unable to read response body')
          throw new Error(
            `HTTP ${response.status}: ${response.statusText}. Body: ${bodyText.substring(0, 200)}`
          )
        }

        const html = await response.text()

        if (!html || html.trim().length === 0) {
          throw new Error('Received empty response')
        }

        logger.info(`Successfully fetched page ${pageNumber}: ${html.length} bytes`)
        return html
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error))

        // Log detailed error information
        const errorDetails: Record<string, any> = {
          message: lastError.message,
          name: lastError.name,
          url,
        }

        // Add stack trace for non-HTTP errors
        if (!lastError.message.startsWith('HTTP ')) {
          errorDetails.stack = lastError.stack
        }

        // Check for specific error types
        if (lastError.name === 'AbortError') {
          errorDetails.reason = 'Request timeout after 30s'
        } else if (lastError.message.includes('fetch failed')) {
          errorDetails.reason = 'Network error - check DNS, firewall, or connectivity'
        } else if (lastError.message.includes('ENOTFOUND')) {
          errorDetails.reason = 'DNS resolution failed - domain not found'
        } else if (lastError.message.includes('ECONNREFUSED')) {
          errorDetails.reason = 'Connection refused - server not reachable'
        } else if (lastError.message.includes('ETIMEDOUT')) {
          errorDetails.reason = 'Connection timeout - server too slow or unreachable'
        }

        logger.warn(errorDetails, `Failed to fetch page ${pageNumber} (attempt ${attempt}/${SCRAPER_CONFIG.maxRetries})`)

        if (attempt < SCRAPER_CONFIG.maxRetries) {
          const delay = SCRAPER_CONFIG.retryDelay * attempt
          logger.info(`Retrying in ${delay}ms...`)
          await this.delay(delay)
        }
      }
    }

    const finalError = new ScraperError(
      `Failed to fetch page ${pageNumber} after ${SCRAPER_CONFIG.maxRetries} attempts: ${lastError?.message}`,
      lastError
    )

    logger.error(
      {
        pageNumber,
        url,
        lastErrorMessage: lastError?.message,
        lastErrorName: lastError?.name,
      },
      'All fetch attempts failed'
    )

    throw finalError
  }

  /**
   * Parses HTML content and extracts auction data
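   *
   * The selectors below assume listing markup of roughly this shape (an
   * illustrative sketch, not an exact copy of the icetrade.by page):
   *
   *   <table class="auctions w100">
   *     <tbody>
   *       <tr>
   *         <td>12345678</td>                              <!-- auction number -->
   *         <td>
   *           <a href="/auction/12345678">Title</a>        <!-- title + link -->
   *           <div class="description">Optional text</div>
   *         </td>
   *         <td>Organization</td>
   *         <td>Status</td>
   *         <td>Deadline</td>
   *       </tr>
   *     </tbody>
   *   </table>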
   */
  parsePage(html: string): AuctionData[] {
    try {
      const $ = cheerio.load(html)
      const auctions: AuctionData[] = []

      // Find the auctions table
      const auctionsTable = $('table.auctions.w100')

      if (auctionsTable.length === 0) {
        logger.warn('No auctions table found in HTML')
        return []
      }

      // Parse each auction row
      const rows = auctionsTable.find('tbody tr')
      logger.info(`Found ${rows.length} auction rows to parse`)

      rows.each((index, element) => {
        try {
          const row = $(element)

          // Extract auction data from table cells
          const cells = row.find('td')

          if (cells.length < 4) {
            logger.warn(`Row ${index} has insufficient cells, skipping`)
            return
          }

          // Extract auction number (typically in first cell)
          const auctionNumCell = $(cells[0])
          const auctionNum = auctionNumCell.text().trim()

          // Extract title and link (typically in second cell with <a> tag)
          const titleCell = $(cells[1])
          const titleLink = titleCell.find('a').first()
          const title = titleLink.text().trim()
          const link = titleLink.attr('href')?.trim() || ''

          // Skip if link is empty or missing
          if (!link) {
            logger.warn(`Row ${index} has missing or empty link, skipping`)
            return
          }

          // Make link absolute if it's relative
          const absoluteLink = link.startsWith('http')
            ? link
            : `${SCRAPER_CONFIG.baseUrl}${link.startsWith('/') ? link : `/${link}`}`

          // Extract organization (typically in third cell)
          const organizationCell = $(cells[2])
          const organization = organizationCell.text().trim()

          // Extract status (typically in fourth cell)
          const statusCell = $(cells[3])
          const status = statusCell.text().trim()

          // Extract deadline if available (typically in fifth cell)
          const deadlineCell = $(cells[4])
          const deadline = deadlineCell.text().trim() || null

          // Description can be extracted from title cell's additional text or separate element
          const description = titleCell.find('.description').text().trim() || null

          // Validate with Zod schema
          const result = AuctionSchema.safeParse({
            auctionNum,
            title,
            organization,
            status,
            deadline,
            link: absoluteLink,
            description,
          })

          if (!result.success) {
            logger.warn(
              {
                errors: result.error.issues,
                data: { auctionNum, title },
              },
              `Validation failed for auction at row ${index}`
            )
            return
          }

          auctions.push(result.data)
        } catch (error) {
          logger.error(
            { error: error instanceof Error ? error.message : String(error) },
            `Error parsing auction row ${index}`
          )
        }
      })

      logger.info(`Successfully parsed ${auctions.length} valid auctions`)
      return auctions
    } catch (error) {
      throw new ScraperError(
        'Failed to parse HTML content',
        error instanceof Error ? error : new Error(String(error))
      )
    }
  }

  /**
   * Scrapes multiple pages of auctions with rate limiting
   *
   * @param maxPages - Maximum number of pages to scrape (default: 1)
   * @returns Array of all parsed auction data
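   *
   * Pages that fail to fetch or parse are logged and skipped, so the result
   * contains whatever could be scraped rather than rejecting the whole run.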
   */
  async scrapeAuctions(maxPages: number = 1): Promise<AuctionData[]> {
    if (maxPages < 1) {
      throw new Error('maxPages must be at least 1')
    }

    logger.info(`Starting scrape of ${maxPages} page(s)`)
    const allAuctions: AuctionData[] = []

    for (let page = 1; page <= maxPages; page++) {
      try {
        // Fetch page HTML
        const html = await this.fetchPage(page)

        // Parse auctions from HTML
        const auctions = this.parsePage(html)

        allAuctions.push(...auctions)

        logger.info(`Page ${page}/${maxPages}: Found ${auctions.length} auctions`)

        // Rate limiting: wait before next request (except for last page)
        if (page < maxPages) {
          logger.debug(`Waiting ${SCRAPER_CONFIG.requestDelay}ms before next request`)
          await this.delay(SCRAPER_CONFIG.requestDelay)
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error))
        logger.error(
          {
            message: err.message,
            name: err.name,
            stack: err.stack,
            cause: err.cause,
          },
          `Error scraping page ${page}`
        )

        // Continue to next page instead of failing completely
        // This ensures partial data is still returned
        continue
      }
    }

    logger.info(
      `Scraping completed: ${allAuctions.length} total auctions from ${maxPages} page(s)`
    )

    return allAuctions
  }

  /**
   * Helper method for delays (rate limiting, retries)
   */
  private delay(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms))
  }
}