This commit is contained in:
Matthias Hochmeister
2026-03-13 20:26:33 +01:00
parent 8f454905b9
commit f009694da7

View File

@@ -217,148 +217,96 @@ async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
log(`After form submit: ${frame.url()}`); log(`After form submit: ${frame.url()}`);
} }
// Collect rows across all pages // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number]; type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
const allRows: ParsedRow[] = [];
let pageNum = 1;
while (true) { await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
// Log tables found for diagnostics const firstRows = await parseRowsFromTable(frame);
const tableInfo = await frame.$$eval('table', (ts) => log(`Initial fetch: ${firstRows.length} rows`);
ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`), for (const row of firstRows) {
); log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
log(`Page ${pageNum} tables: ${tableInfo.join(', ') || 'none'}`); }
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 }); const pagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
log(`Pagination: "${pagination}"`);
const pageRows = await parseRowsFromTable(frame); const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
log(`Page ${pageNum}: parsed ${pageRows.length} rows`); const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
for (const row of pageRows) { const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
}
allRows.push(...pageRows);
// Check pagination status from "Datensatz X-Y von Z" text const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
const pagination = await frame.evaluate(() => { const allRows: ParsedRow[] = [...firstRows];
const nav = document.querySelector('table.FdcLayListNav');
return nav?.textContent?.trim() ?? '';
});
log(`Pagination: "${pagination}"`);
// Parse "Datensatz X-Y von Z" to check if more pages exist // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i); if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
if (pagMatch) { log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
const from = parseInt(pagMatch[1], 10);
const to = parseInt(pagMatch[2], 10);
const total = parseInt(pagMatch[3], 10);
if (to >= total) {
log(`All ${total} records loaded across ${pageNum} page(s)`);
break;
}
log(`Loaded ${to} of ${total} — navigating to next page`);
// Calculate next page number to use as a fallback click target const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
const pageSize = to - from + 1; const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
const nextPageNum = Math.floor(to / pageSize) + 1; let startNr = 1;
let consecutiveEmpty = 0;
// Click the "next page" control in FdcLayListNav. while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
// FDISK renders pagination as plain <img> inside <td class="Right"> (no <a> wrappers). const endNr = startNr + BATCH - 1;
// Use Playwright's click() which properly triggers JS event listeners attached via addEventListener.
// Try in order: b_next img → b_last img → any <a> with ">" text → page-number link.
let nextClicked = false;
const nextImg = frame.locator('table.FdcLayListNav td.Right img[src*="b_next"]'); // Set StNr range in the search form and submit
if (await nextImg.count() > 0) { const formOk = await frame.evaluate((s: number, e: number) => {
await nextImg.first().click({ timeout: 5000 }).catch(() => {}); const form = (document as any).forms['frmsearch'];
nextClicked = true; if (!form) return false;
} else { const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
// Fallback via evaluate for text/title/page-number patterns const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
const clicked = await frame.evaluate((nextPg: number) => { if (!fromFld || !toFld) return false;
const nav = document.querySelector('table.FdcLayListNav'); fromFld.value = String(s);
if (!nav) return false; toFld.value = String(e);
const clickable = Array.from(nav.querySelectorAll('a, input[type="button"], input[type="submit"], td')); return true;
for (const el of clickable) { }, startNr, endNr);
const text = ((el as HTMLElement).textContent ?? '').trim();
const title = ((el as HTMLElement).getAttribute('title') ?? '').toLowerCase();
const onclick = ((el as HTMLElement).getAttribute('onclick') ?? '').toLowerCase();
if (text === '>' || text === '>>' ||
title.includes('nächst') || title.includes('weiter') || title.includes('next') ||
onclick.includes('next') || onclick.includes('weiter') ||
text === String(nextPg)) {
(el as HTMLElement).click();
return true;
}
}
return false;
}, nextPageNum);
nextClicked = clicked;
}
if (!nextClicked) { if (!formOk) {
const navHtml = await frame.evaluate(() => { log('WARN: could not set StNr range fields — aborting range queries');
const nav = document.querySelector('table.FdcLayListNav');
return nav?.innerHTML?.replace(/\s+/g, ' ').trim() ?? '(not found)';
});
log(`WARN: could not find next-page link — stopping pagination`);
log(`FdcLayListNav HTML: ${navHtml}`);
break; break;
} }
await frame.waitForLoadState('networkidle', { timeout: 30000 }); await Promise.all([
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
]);
// Verify we actually moved to the next page — if pagination didn't advance, stop const rangeRows = await parseRowsFromTable(frame);
const newPagination = await frame.evaluate(() => const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? '' newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
); allRows.push(...newRows);
if (newPagination === pagination) {
log(`WARN: pagination did not advance after click (still "${pagination}") — stopping`); log(`StNr ${startNr}${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
// Try clicking the Right td directly as a last resort for (const row of newRows) {
const tdClicked = await frame.evaluate(() => { log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
const td = document.querySelector('table.FdcLayListNav td.Right') as HTMLElement | null;
if (td) { td.click(); return true; }
return false;
});
if (tdClicked) {
await frame.waitForLoadState('networkidle', { timeout: 30000 });
const afterTdPagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
if (afterTdPagination === pagination) {
log('WARN: td.Right click also did not advance — pagination is disabled, stopping');
break;
}
} else {
break;
}
} }
pageNum++;
} else {
// No pagination indicator found — assume single page
log('No pagination indicator found — assuming single page');
break;
}
} // end while
log(`Parsed ${allRows.length} rows total across ${pageNum} page(s)`); consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
startNr = endNr + 1;
}
log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
}
log(`Parsed ${allRows.length} raw rows total`);
const members: FdiskMember[] = []; const members: FdiskMember[] = [];
for (const row of allRows) { for (const row of allRows) {
if (!row.standesbuchNr || !row.vorname || !row.zuname) { if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
log(` SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`);
continue;
}
const abmeldedatum = parseDate(row.abmeldedatum); const abmeldedatum = parseDate(row.abmeldedatum);
members.push({ members.push({
standesbuchNr: row.standesbuchNr, standesbuchNr: row.standesbuchNr,
dienstgrad: row.dienstgrad, dienstgrad: row.dienstgrad,
vorname: row.vorname, vorname: row.vorname,
zuname: row.zuname, zuname: row.zuname,
geburtsdatum: parseDate(row.geburtsdatum), geburtsdatum: parseDate(row.geburtsdatum),
svnr: row.svnr || null, svnr: row.svnr || null,
eintrittsdatum: parseDate(row.eintrittsdatum), eintrittsdatum: parseDate(row.eintrittsdatum),
abmeldedatum, abmeldedatum,
status: abmeldedatum ? 'ausgetreten' : 'aktiv', status: abmeldedatum ? 'ausgetreten' : 'aktiv',
detailUrl: row.href, detailUrl: row.href,
}); });
} }
return members; return members;