This commit is contained in:
Matthias Hochmeister
2026-03-13 20:26:33 +01:00
parent 8f454905b9
commit f009694da7

View File

@@ -217,148 +217,96 @@ async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
log(`After form submit: ${frame.url()}`);
}
// Collect rows across all pages
// --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
const allRows: ParsedRow[] = [];
let pageNum = 1;
while (true) {
// Log tables found for diagnostics
const tableInfo = await frame.$$eval('table', (ts) =>
ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`),
);
log(`Page ${pageNum} tables: ${tableInfo.join(', ') || 'none'}`);
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
const firstRows = await parseRowsFromTable(frame);
log(`Initial fetch: ${firstRows.length} rows`);
for (const row of firstRows) {
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
}
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
const pagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
log(`Pagination: "${pagination}"`);
const pageRows = await parseRowsFromTable(frame);
log(`Page ${pageNum}: parsed ${pageRows.length} rows`);
for (const row of pageRows) {
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
}
allRows.push(...pageRows);
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;
// Check pagination status from "Datensatz X-Y von Z" text
const pagination = await frame.evaluate(() => {
const nav = document.querySelector('table.FdcLayListNav');
return nav?.textContent?.trim() ?? '';
});
log(`Pagination: "${pagination}"`);
const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
const allRows: ParsedRow[] = [...firstRows];
// Parse "Datensatz X-Y von Z" to check if more pages exist
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
if (pagMatch) {
const from = parseInt(pagMatch[1], 10);
const to = parseInt(pagMatch[2], 10);
const total = parseInt(pagMatch[3], 10);
if (to >= total) {
log(`All ${total} records loaded across ${pageNum} page(s)`);
break;
}
log(`Loaded ${to} of ${total} — navigating to next page`);
// --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
// Calculate next page number to use as a fallback click target
const pageSize = to - from + 1;
const nextPageNum = Math.floor(to / pageSize) + 1;
const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
let startNr = 1;
let consecutiveEmpty = 0;
// Click the "next page" control in FdcLayListNav.
// FDISK renders pagination as plain <img> inside <td class="Right"> (no <a> wrappers).
// Use Playwright's click() which properly triggers JS event listeners attached via addEventListener.
// Try in order: b_next img → b_last img → any <a> with ">" text → page-number link.
let nextClicked = false;
while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
const endNr = startNr + BATCH - 1;
const nextImg = frame.locator('table.FdcLayListNav td.Right img[src*="b_next"]');
if (await nextImg.count() > 0) {
await nextImg.first().click({ timeout: 5000 }).catch(() => {});
nextClicked = true;
} else {
// Fallback via evaluate for text/title/page-number patterns
const clicked = await frame.evaluate((nextPg: number) => {
const nav = document.querySelector('table.FdcLayListNav');
if (!nav) return false;
const clickable = Array.from(nav.querySelectorAll('a, input[type="button"], input[type="submit"], td'));
for (const el of clickable) {
const text = ((el as HTMLElement).textContent ?? '').trim();
const title = ((el as HTMLElement).getAttribute('title') ?? '').toLowerCase();
const onclick = ((el as HTMLElement).getAttribute('onclick') ?? '').toLowerCase();
if (text === '>' || text === '>>' ||
title.includes('nächst') || title.includes('weiter') || title.includes('next') ||
onclick.includes('next') || onclick.includes('weiter') ||
text === String(nextPg)) {
(el as HTMLElement).click();
return true;
}
}
return false;
}, nextPageNum);
nextClicked = clicked;
}
// Set StNr range in the search form and submit
const formOk = await frame.evaluate((s: number, e: number) => {
const form = (document as any).forms['frmsearch'];
if (!form) return false;
const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
if (!fromFld || !toFld) return false;
fromFld.value = String(s);
toFld.value = String(e);
return true;
}, startNr, endNr);
if (!nextClicked) {
const navHtml = await frame.evaluate(() => {
const nav = document.querySelector('table.FdcLayListNav');
return nav?.innerHTML?.replace(/\s+/g, ' ').trim() ?? '(not found)';
});
log(`WARN: could not find next-page link — stopping pagination`);
log(`FdcLayListNav HTML: ${navHtml}`);
if (!formOk) {
log('WARN: could not set StNr range fields — aborting range queries');
break;
}
await frame.waitForLoadState('networkidle', { timeout: 30000 });
await Promise.all([
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
]);
// Verify we actually moved to the next page — if pagination didn't advance, stop
const newPagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
if (newPagination === pagination) {
log(`WARN: pagination did not advance after click (still "${pagination}") — stopping`);
// Try clicking the Right td directly as a last resort
const tdClicked = await frame.evaluate(() => {
const td = document.querySelector('table.FdcLayListNav td.Right') as HTMLElement | null;
if (td) { td.click(); return true; }
return false;
});
if (tdClicked) {
await frame.waitForLoadState('networkidle', { timeout: 30000 });
const afterTdPagination = await frame.evaluate(() =>
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
);
if (afterTdPagination === pagination) {
log('WARN: td.Right click also did not advance — pagination is disabled, stopping');
break;
}
} else {
break;
}
const rangeRows = await parseRowsFromTable(frame);
const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
allRows.push(...newRows);
log(`StNr ${startNr}${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
for (const row of newRows) {
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
}
pageNum++;
} else {
// No pagination indicator found — assume single page
log('No pagination indicator found — assuming single page');
break;
}
} // end while
log(`Parsed ${allRows.length} rows total across ${pageNum} page(s)`);
consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
startNr = endNr + 1;
}
log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
}
log(`Parsed ${allRows.length} raw rows total`);
const members: FdiskMember[] = [];
for (const row of allRows) {
if (!row.standesbuchNr || !row.vorname || !row.zuname) {
log(` SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`);
continue;
}
if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
const abmeldedatum = parseDate(row.abmeldedatum);
members.push({
standesbuchNr: row.standesbuchNr,
dienstgrad: row.dienstgrad,
vorname: row.vorname,
zuname: row.zuname,
geburtsdatum: parseDate(row.geburtsdatum),
svnr: row.svnr || null,
standesbuchNr: row.standesbuchNr,
dienstgrad: row.dienstgrad,
vorname: row.vorname,
zuname: row.zuname,
geburtsdatum: parseDate(row.geburtsdatum),
svnr: row.svnr || null,
eintrittsdatum: parseDate(row.eintrittsdatum),
abmeldedatum,
status: abmeldedatum ? 'ausgetreten' : 'aktiv',
detailUrl: row.href,
status: abmeldedatum ? 'ausgetreten' : 'aktiv',
detailUrl: row.href,
});
}
return members;