diff --git a/sync/src/scraper.ts b/sync/src/scraper.ts index 45d681a..3330ccf 100644 --- a/sync/src/scraper.ts +++ b/sync/src/scraper.ts @@ -217,148 +217,96 @@ async function scrapeMembers(frame: Frame): Promise { log(`After form submit: ${frame.url()}`); } - // Collect rows across all pages + // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count --- type ParsedRow = Awaited>[number]; - const allRows: ParsedRow[] = []; - let pageNum = 1; - while (true) { - // Log tables found for diagnostics - const tableInfo = await frame.$$eval('table', (ts) => - ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`), - ); - log(`Page ${pageNum} tables: ${tableInfo.join(', ') || 'none'}`); + await frame.waitForSelector('table.FdcLayList', { timeout: 20000 }); + const firstRows = await parseRowsFromTable(frame); + log(`Initial fetch: ${firstRows.length} rows`); + for (const row of firstRows) { + log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`); + } - await frame.waitForSelector('table.FdcLayList', { timeout: 20000 }); + const pagination = await frame.evaluate(() => + document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? '' + ); + log(`Pagination: "${pagination}"`); - const pageRows = await parseRowsFromTable(frame); - log(`Page ${pageNum}: parsed ${pageRows.length} rows`); - for (const row of pageRows) { - log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`); - } - allRows.push(...pageRows); + const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i); + const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null; + const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null; - // Check pagination status from "Datensatz X-Y von Z" text - const pagination = await frame.evaluate(() => { - const nav = document.querySelector('table.FdcLayListNav'); - return nav?.textContent?.trim() ?? ''; - }); - log(`Pagination: "${pagination}"`); + const seenStNrs = new Set(firstRows.map(r => r.standesbuchNr).filter(Boolean)); + const allRows: ParsedRow[] = [...firstRows]; - // Parse "Datensatz X-Y von Z" to check if more pages exist - const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i); - if (pagMatch) { - const from = parseInt(pagMatch[1], 10); - const to = parseInt(pagMatch[2], 10); - const total = parseInt(pagMatch[3], 10); - if (to >= total) { - log(`All ${total} records loaded across ${pageNum} page(s)`); - break; - } - log(`Loaded ${to} of ${total} — navigating to next page`); + // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries --- + if (totalExpected && shownSoFar && shownSoFar < totalExpected) { + log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`); - // Calculate next page number to use as a fallback click target - const pageSize = to - from + 1; - const nextPageNum = Math.floor(to / pageSize) + 1; + const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit + const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members + let startNr = 1; + let consecutiveEmpty = 0; - // Click the "next page" control in FdcLayListNav. - // FDISK renders pagination as plain inside (no wrappers). - // Use Playwright's click() which properly triggers JS event listeners attached via addEventListener. - // Try in order: b_next img → b_last img → any with ">" text → page-number link. - let nextClicked = false; + while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) { + const endNr = startNr + BATCH - 1; - const nextImg = frame.locator('table.FdcLayListNav td.Right img[src*="b_next"]'); - if (await nextImg.count() > 0) { - await nextImg.first().click({ timeout: 5000 }).catch(() => {}); - nextClicked = true; - } else { - // Fallback via evaluate for text/title/page-number patterns - const clicked = await frame.evaluate((nextPg: number) => { - const nav = document.querySelector('table.FdcLayListNav'); - if (!nav) return false; - const clickable = Array.from(nav.querySelectorAll('a, input[type="button"], input[type="submit"], td')); - for (const el of clickable) { - const text = ((el as HTMLElement).textContent ?? '').trim(); - const title = ((el as HTMLElement).getAttribute('title') ?? '').toLowerCase(); - const onclick = ((el as HTMLElement).getAttribute('onclick') ?? '').toLowerCase(); - if (text === '>' || text === '>>' || - title.includes('nächst') || title.includes('weiter') || title.includes('next') || - onclick.includes('next') || onclick.includes('weiter') || - text === String(nextPg)) { - (el as HTMLElement).click(); - return true; - } - } - return false; - }, nextPageNum); - nextClicked = clicked; - } + // Set StNr range in the search form and submit + const formOk = await frame.evaluate((s: number, e: number) => { + const form = (document as any).forms['frmsearch']; + if (!form) return false; + const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement; + const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement; + if (!fromFld || !toFld) return false; + fromFld.value = String(s); + toFld.value = String(e); + return true; + }, startNr, endNr); - if (!nextClicked) { - const navHtml = await frame.evaluate(() => { - const nav = document.querySelector('table.FdcLayListNav'); - return nav?.innerHTML?.replace(/\s+/g, ' ').trim() ?? '(not found)'; - }); - log(`WARN: could not find next-page link — stopping pagination`); - log(`FdcLayListNav HTML: ${navHtml}`); + if (!formOk) { + log('WARN: could not set StNr range fields — aborting range queries'); break; } - await frame.waitForLoadState('networkidle', { timeout: 30000 }); + await Promise.all([ + frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }), + frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }), + ]); - // Verify we actually moved to the next page — if pagination didn't advance, stop - const newPagination = await frame.evaluate(() => - document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? '' - ); - if (newPagination === pagination) { - log(`WARN: pagination did not advance after click (still "${pagination}") — stopping`); - // Try clicking the Right td directly as a last resort - const tdClicked = await frame.evaluate(() => { - const td = document.querySelector('table.FdcLayListNav td.Right') as HTMLElement | null; - if (td) { td.click(); return true; } - return false; - }); - if (tdClicked) { - await frame.waitForLoadState('networkidle', { timeout: 30000 }); - const afterTdPagination = await frame.evaluate(() => - document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? '' - ); - if (afterTdPagination === pagination) { - log('WARN: td.Right click also did not advance — pagination is disabled, stopping'); - break; - } - } else { - break; - } + const rangeRows = await parseRowsFromTable(frame); + const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr)); + newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); }); + allRows.push(...newRows); + + log(`StNr ${startNr}–${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`); + for (const row of newRows) { + log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`); } - pageNum++; - } else { - // No pagination indicator found — assume single page - log('No pagination indicator found — assuming single page'); - break; - } - } // end while - log(`Parsed ${allRows.length} rows total across ${pageNum} page(s)`); + consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0; + startNr = endNr + 1; + } + + log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`); + } + + log(`Parsed ${allRows.length} raw rows total`); const members: FdiskMember[] = []; for (const row of allRows) { - if (!row.standesbuchNr || !row.vorname || !row.zuname) { - log(` SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`); - continue; - } + if (!row.standesbuchNr || !row.vorname || !row.zuname) continue; const abmeldedatum = parseDate(row.abmeldedatum); members.push({ - standesbuchNr: row.standesbuchNr, - dienstgrad: row.dienstgrad, - vorname: row.vorname, - zuname: row.zuname, - geburtsdatum: parseDate(row.geburtsdatum), - svnr: row.svnr || null, + standesbuchNr: row.standesbuchNr, + dienstgrad: row.dienstgrad, + vorname: row.vorname, + zuname: row.zuname, + geburtsdatum: parseDate(row.geburtsdatum), + svnr: row.svnr || null, eintrittsdatum: parseDate(row.eintrittsdatum), abmeldedatum, - status: abmeldedatum ? 'ausgetreten' : 'aktiv', - detailUrl: row.href, + status: abmeldedatum ? 'ausgetreten' : 'aktiv', + detailUrl: row.href, }); } return members;