update
This commit is contained in:
@@ -217,136 +217,84 @@ async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
|
|||||||
log(`After form submit: ${frame.url()}`);
|
log(`After form submit: ${frame.url()}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect rows across all pages
|
// --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
|
||||||
type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
|
type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
|
||||||
const allRows: ParsedRow[] = [];
|
|
||||||
let pageNum = 1;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
// Log tables found for diagnostics
|
|
||||||
const tableInfo = await frame.$$eval('table', (ts) =>
|
|
||||||
ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`),
|
|
||||||
);
|
|
||||||
log(`Page ${pageNum} tables: ${tableInfo.join(', ') || 'none'}`);
|
|
||||||
|
|
||||||
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
|
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
|
||||||
|
const firstRows = await parseRowsFromTable(frame);
|
||||||
const pageRows = await parseRowsFromTable(frame);
|
log(`Initial fetch: ${firstRows.length} rows`);
|
||||||
log(`Page ${pageNum}: parsed ${pageRows.length} rows`);
|
for (const row of firstRows) {
|
||||||
for (const row of pageRows) {
|
|
||||||
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
|
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
|
||||||
}
|
}
|
||||||
allRows.push(...pageRows);
|
|
||||||
|
|
||||||
// Check pagination status from "Datensatz X-Y von Z" text
|
const pagination = await frame.evaluate(() =>
|
||||||
const pagination = await frame.evaluate(() => {
|
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
|
||||||
const nav = document.querySelector('table.FdcLayListNav');
|
);
|
||||||
return nav?.textContent?.trim() ?? '';
|
|
||||||
});
|
|
||||||
log(`Pagination: "${pagination}"`);
|
log(`Pagination: "${pagination}"`);
|
||||||
|
|
||||||
// Parse "Datensatz X-Y von Z" to check if more pages exist
|
|
||||||
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
|
const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
|
||||||
if (pagMatch) {
|
const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
|
||||||
const from = parseInt(pagMatch[1], 10);
|
const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;
|
||||||
const to = parseInt(pagMatch[2], 10);
|
|
||||||
const total = parseInt(pagMatch[3], 10);
|
|
||||||
if (to >= total) {
|
|
||||||
log(`All ${total} records loaded across ${pageNum} page(s)`);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
log(`Loaded ${to} of ${total} — navigating to next page`);
|
|
||||||
|
|
||||||
// Calculate next page number to use as a fallback click target
|
const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
|
||||||
const pageSize = to - from + 1;
|
const allRows: ParsedRow[] = [...firstRows];
|
||||||
const nextPageNum = Math.floor(to / pageSize) + 1;
|
|
||||||
|
|
||||||
// Click the "next page" control in FdcLayListNav.
|
// --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
|
||||||
// FDISK renders pagination as plain <img> inside <td class="Right"> (no <a> wrappers).
|
if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
|
||||||
// Use Playwright's click() which properly triggers JS event listeners attached via addEventListener.
|
log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
|
||||||
// Try in order: b_next img → b_last img → any <a> with ">" text → page-number link.
|
|
||||||
let nextClicked = false;
|
|
||||||
|
|
||||||
const nextImg = frame.locator('table.FdcLayListNav td.Right img[src*="b_next"]');
|
const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
|
||||||
if (await nextImg.count() > 0) {
|
const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
|
||||||
await nextImg.first().click({ timeout: 5000 }).catch(() => {});
|
let startNr = 1;
|
||||||
nextClicked = true;
|
let consecutiveEmpty = 0;
|
||||||
} else {
|
|
||||||
// Fallback via evaluate for text/title/page-number patterns
|
while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
|
||||||
const clicked = await frame.evaluate((nextPg: number) => {
|
const endNr = startNr + BATCH - 1;
|
||||||
const nav = document.querySelector('table.FdcLayListNav');
|
|
||||||
if (!nav) return false;
|
// Set StNr range in the search form and submit
|
||||||
const clickable = Array.from(nav.querySelectorAll('a, input[type="button"], input[type="submit"], td'));
|
const formOk = await frame.evaluate((s: number, e: number) => {
|
||||||
for (const el of clickable) {
|
const form = (document as any).forms['frmsearch'];
|
||||||
const text = ((el as HTMLElement).textContent ?? '').trim();
|
if (!form) return false;
|
||||||
const title = ((el as HTMLElement).getAttribute('title') ?? '').toLowerCase();
|
const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
|
||||||
const onclick = ((el as HTMLElement).getAttribute('onclick') ?? '').toLowerCase();
|
const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
|
||||||
if (text === '>' || text === '>>' ||
|
if (!fromFld || !toFld) return false;
|
||||||
title.includes('nächst') || title.includes('weiter') || title.includes('next') ||
|
fromFld.value = String(s);
|
||||||
onclick.includes('next') || onclick.includes('weiter') ||
|
toFld.value = String(e);
|
||||||
text === String(nextPg)) {
|
|
||||||
(el as HTMLElement).click();
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}, startNr, endNr);
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}, nextPageNum);
|
|
||||||
nextClicked = clicked;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nextClicked) {
|
if (!formOk) {
|
||||||
const navHtml = await frame.evaluate(() => {
|
log('WARN: could not set StNr range fields — aborting range queries');
|
||||||
const nav = document.querySelector('table.FdcLayListNav');
|
|
||||||
return nav?.innerHTML?.replace(/\s+/g, ' ').trim() ?? '(not found)';
|
|
||||||
});
|
|
||||||
log(`WARN: could not find next-page link — stopping pagination`);
|
|
||||||
log(`FdcLayListNav HTML: ${navHtml}`);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
await frame.waitForLoadState('networkidle', { timeout: 30000 });
|
await Promise.all([
|
||||||
|
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
|
||||||
|
frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
|
||||||
|
]);
|
||||||
|
|
||||||
// Verify we actually moved to the next page — if pagination didn't advance, stop
|
const rangeRows = await parseRowsFromTable(frame);
|
||||||
const newPagination = await frame.evaluate(() =>
|
const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
|
||||||
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
|
newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
|
||||||
);
|
allRows.push(...newRows);
|
||||||
if (newPagination === pagination) {
|
|
||||||
log(`WARN: pagination did not advance after click (still "${pagination}") — stopping`);
|
|
||||||
// Try clicking the Right td directly as a last resort
|
|
||||||
const tdClicked = await frame.evaluate(() => {
|
|
||||||
const td = document.querySelector('table.FdcLayListNav td.Right') as HTMLElement | null;
|
|
||||||
if (td) { td.click(); return true; }
|
|
||||||
return false;
|
|
||||||
});
|
|
||||||
if (tdClicked) {
|
|
||||||
await frame.waitForLoadState('networkidle', { timeout: 30000 });
|
|
||||||
const afterTdPagination = await frame.evaluate(() =>
|
|
||||||
document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
|
|
||||||
);
|
|
||||||
if (afterTdPagination === pagination) {
|
|
||||||
log('WARN: td.Right click also did not advance — pagination is disabled, stopping');
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pageNum++;
|
|
||||||
} else {
|
|
||||||
// No pagination indicator found — assume single page
|
|
||||||
log('No pagination indicator found — assuming single page');
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} // end while
|
|
||||||
|
|
||||||
log(`Parsed ${allRows.length} rows total across ${pageNum} page(s)`);
|
log(`StNr ${startNr}–${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
|
||||||
|
for (const row of newRows) {
|
||||||
|
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
|
||||||
|
}
|
||||||
|
|
||||||
|
consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
|
||||||
|
startNr = endNr + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
log(`Parsed ${allRows.length} raw rows total`);
|
||||||
|
|
||||||
const members: FdiskMember[] = [];
|
const members: FdiskMember[] = [];
|
||||||
for (const row of allRows) {
|
for (const row of allRows) {
|
||||||
if (!row.standesbuchNr || !row.vorname || !row.zuname) {
|
if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
|
||||||
log(` SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const abmeldedatum = parseDate(row.abmeldedatum);
|
const abmeldedatum = parseDate(row.abmeldedatum);
|
||||||
members.push({
|
members.push({
|
||||||
standesbuchNr: row.standesbuchNr,
|
standesbuchNr: row.standesbuchNr,
|
||||||
|
|||||||
Reference in New Issue
Block a user