This commit is contained in:
Matthias Hochmeister
2026-03-13 20:26:33 +01:00
parent 8f454905b9
commit f009694da7

View File

@@ -217,136 +217,84 @@ async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
log(`After form submit: ${frame.url()}`); log(`After form submit: ${frame.url()}`);
} }
// Collect rows across all pages // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number]; type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
const allRows: ParsedRow[] = [];
let pageNum = 1;
while (true) {
// Log tables found for diagnostics
const tableInfo = await frame.$$eval('table', (ts) =>
ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`),
);
log(`Page ${pageNum} tables: ${tableInfo.join(', ') || 'none'}`);
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 }); await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
const firstRows = await parseRowsFromTable(frame);
const pageRows = await parseRowsFromTable(frame); log(`Initial fetch: ${firstRows.length} rows`);
log(`Page ${pageNum}: parsed ${pageRows.length} rows`); for (const row of firstRows) {
for (const row of pageRows) {
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`); log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
} }
allRows.push(...pageRows);
// Check pagination status from "Datensatz X-Y von Z" text const pagination = await frame.evaluate(() =>
const pagination = await frame.evaluate(() => { document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
const nav = document.querySelector('table.FdcLayListNav'); );
return nav?.textContent?.trim() ?? '';
});
log(`Pagination: "${pagination}"`); log(`Pagination: "${pagination}"`);
// Parse "Datensatz X-Y von Z" to check if more pages exist
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i); const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
if (pagMatch) { const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
const from = parseInt(pagMatch[1], 10); const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;
const to = parseInt(pagMatch[2], 10);
const total = parseInt(pagMatch[3], 10);
if (to >= total) {
log(`All ${total} records loaded across ${pageNum} page(s)`);
break;
}
log(`Loaded ${to} of ${total} — navigating to next page`);
// Calculate next page number to use as a fallback click target const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
const pageSize = to - from + 1; const allRows: ParsedRow[] = [...firstRows];
const nextPageNum = Math.floor(to / pageSize) + 1;
// Click the "next page" control in FdcLayListNav. // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
// FDISK renders pagination as plain <img> inside <td class="Right"> (no <a> wrappers). if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
// Use Playwright's click() which properly triggers JS event listeners attached via addEventListener. log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
// Try in order: b_next img → b_last img → any <a> with ">" text → page-number link.
let nextClicked = false;
const nextImg = frame.locator('table.FdcLayListNav td.Right img[src*="b_next"]'); const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
if (await nextImg.count() > 0) { const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
await nextImg.first().click({ timeout: 5000 }).catch(() => {}); let startNr = 1;
nextClicked = true; let consecutiveEmpty = 0;
} else {
// Fallback via evaluate for text/title/page-number patterns while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
const clicked = await frame.evaluate((nextPg: number) => { const endNr = startNr + BATCH - 1;
const nav = document.querySelector('table.FdcLayListNav');
if (!nav) return false; // Set StNr range in the search form and submit
const clickable = Array.from(nav.querySelectorAll('a, input[type="button"], input[type="submit"], td')); const formOk = await frame.evaluate((s: number, e: number) => {
for (const el of clickable) { const form = (document as any).forms['frmsearch'];
const text = ((el as HTMLElement).textContent ?? '').trim(); if (!form) return false;
const title = ((el as HTMLElement).getAttribute('title') ?? '').toLowerCase(); const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
const onclick = ((el as HTMLElement).getAttribute('onclick') ?? '').toLowerCase(); const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
if (text === '>' || text === '>>' || if (!fromFld || !toFld) return false;
title.includes('nächst') || title.includes('weiter') || title.includes('next') || fromFld.value = String(s);
onclick.includes('next') || onclick.includes('weiter') || toFld.value = String(e);
text === String(nextPg)) {
(el as HTMLElement).click();
return true; return true;
} }, startNr, endNr);
}
return false;
}, nextPageNum);
nextClicked = clicked;
}
if (!nextClicked) { if (!formOk) {
const navHtml = await frame.evaluate(() => { log('WARN: could not set StNr range fields — aborting range queries');
const nav = document.querySelector('table.FdcLayListNav');
return nav?.innerHTML?.replace(/\s+/g, ' ').trim() ?? '(not found)';
});
log(`WARN: could not find next-page link — stopping pagination`);
log(`FdcLayListNav HTML: ${navHtml}`);
break; break;
} }
await frame.waitForLoadState('networkidle', { timeout: 30000 }); await Promise.all([
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
]);
// Verify we actually moved to the next page — if pagination didn't advance, stop const rangeRows = await parseRowsFromTable(frame);
const newPagination = await frame.evaluate(() => const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? '' newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
); allRows.push(...newRows);
if (newPagination === pagination) {
log(`WARN: pagination did not advance after click (still "${pagination}") — stopping`);
// Try clicking the Right td directly as a last resort
const tdClicked = await frame.evaluate(() => {
const td = document.querySelector('table.FdcLayListNav td.Right') as HTMLElement | null;
if (td) { td.click(); return true; }
return false;
});
if (tdClicked) {
await frame.waitForLoadState('networkidle', { timeout: 30000 });
const afterTdPagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
if (afterTdPagination === pagination) {
log('WARN: td.Right click also did not advance — pagination is disabled, stopping');
break;
}
} else {
break;
}
}
pageNum++;
} else {
// No pagination indicator found — assume single page
log('No pagination indicator found — assuming single page');
break;
}
} // end while
log(`Parsed ${allRows.length} rows total across ${pageNum} page(s)`); log(`StNr ${startNr}${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
for (const row of newRows) {
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
}
consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
startNr = endNr + 1;
}
log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
}
log(`Parsed ${allRows.length} raw rows total`);
const members: FdiskMember[] = []; const members: FdiskMember[] = [];
for (const row of allRows) { for (const row of allRows) {
if (!row.standesbuchNr || !row.vorname || !row.zuname) { if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
log(` SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`);
continue;
}
const abmeldedatum = parseDate(row.abmeldedatum); const abmeldedatum = parseDate(row.abmeldedatum);
members.push({ members.push({
standesbuchNr: row.standesbuchNr, standesbuchNr: row.standesbuchNr,