Files
dashboard/sync/src/scraper.ts
Matthias Hochmeister d276e45248 update
2026-03-16 16:04:05 +01:00

1314 lines
53 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chromium, Page, Frame } from '@playwright/test';
import * as fs from 'fs';
import * as path from 'path';
import {
FdiskMember,
FdiskAusbildung,
FdiskBefoerderung,
FdiskUntersuchung,
FdiskFahrgenehmigung,
} from './types';
// Root URL of the FDISK web application; overridable via the FDISK_BASE_URL environment variable.
const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';
// Fire-brigade ID, overridable via FDISK_ID_FEUERWEHREN.
// NOTE(review): not referenced in the visible part of this file — confirm where it is consumed.
const ID_FEUERWEHREN = process.env.FDISK_ID_FEUERWEHREN ?? '164';
// Fallback value for the `id_instanzen` URL parameter when a detail-page URL does not carry one;
// overridable via FDISK_ID_INSTANZEN.
const ID_INSTANZEN = process.env.FDISK_ID_INSTANZEN ?? '2853';
// Debug switch: HTML dumps are written only when FDISK_DEBUG_HTML is exactly '1' or 'true'.
const DEBUG_HTML = process.env.FDISK_DEBUG_HTML === '1' || process.env.FDISK_DEBUG_HTML === 'true';
// Login form page.
const LOGIN_URL = `${BASE_URL}/fdisk/module/vws/logins/logins.aspx`;
// Member list page ("meine Mitglieder") that hosts the `frmsearch` search form.
const MEMBERS_URL = `${BASE_URL}/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx`;
/**
 * Save the frame's current HTML to the `debug/` folder (relative to the
 * working directory) when FDISK_DEBUG_HTML is enabled.
 *
 * Best-effort: any failure is logged and swallowed so that a debug dump can
 * never abort a scrape run.
 *
 * @param frame Frame whose content is dumped.
 * @param label Used as the file name; every character outside
 *              `[a-zA-Z0-9_-]` is replaced by `_`.
 */
async function dumpHtml(frame: Frame, label: string): Promise<void> {
  if (!DEBUG_HTML) return;
  try {
    const debugDir = path.resolve(process.cwd(), 'debug');
    fs.mkdirSync(debugDir, { recursive: true });
    const html = await frame.content();
    const safeName = label.replace(/[^a-zA-Z0-9_-]/g, '_');
    const filePath = path.join(debugDir, `${safeName}.html`);
    fs.writeFileSync(filePath, html, 'utf-8');
    log(` [debug] saved HTML → ${filePath} (${(html.length / 1024).toFixed(1)} KB)`);
  } catch (err: unknown) {
    // Anything can be thrown, not only Error instances; narrow before reading
    // `.message` so the log never prints "undefined".
    const reason = err instanceof Error ? err.message : String(err);
    log(` [debug] failed to save HTML for "${label}": ${reason}`);
  }
}
/** Print one scraper log line to stdout: `[scraper] <ISO timestamp> <msg>`. */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log(['[scraper]', timestamp, msg].join(' '));
}
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored. Returns null if the input is empty,
 * does not match the two-digit-day / two-digit-month / four-digit-year
 * pattern, or does not denote a real calendar date (e.g. `31.02.2024` or
 * `15.13.2024`), so that no invalid ISO string is handed on.
 */
function parseDate(raw: string | null | undefined): string | null {
  if (!raw) return null;
  const match = /^(\d{2})\.(\d{2})\.(\d{4})$/.exec(raw.trim());
  if (!match) return null;

  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);
  if (month < 1 || month > 12 || day < 1) return null;

  // Gregorian leap-year rule: every 4th year, except centuries not divisible by 400.
  const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
  let daysInMonth = 31;
  if (month === 2) {
    daysInMonth = isLeapYear ? 29 : 28;
  } else if (month === 4 || month === 6 || month === 9 || month === 11) {
    daysInMonth = 30;
  }
  if (day > daysInMonth) return null;

  return `${match[3]}-${match[2]}-${match[1]}`;
}
/**
 * Normalise a cell's text: trim it and map missing or blank input to null.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) return null;
  const trimmed = text.trim();
  return trimmed.length > 0 ? trimmed : null;
}
/**
 * Targeted member fetch: instead of walking the whole member list, only the
 * members the caller already knows about are looked up.
 *
 * Phase 1 runs one search per entry of `knownStNrs`, with the "from" and
 *         "to" StNr filter fields both set to that value.
 * Phase 2 runs only when `knownNames` is non-empty: the StNr filter is
 *         cleared, the form is submitted once, and rows of the resulting
 *         page (page 1 only) are kept when their lower-cased
 *         `vorname::zuname` key is contained in `knownNames`.
 *
 * Rows are deduplicated by StNr across both phases. Rows lacking StNr,
 * Vorname or Zuname are dropped when the result objects are built.
 */
async function scrapeKnownMembers(
  frame: Frame,
  knownStNrs: Set<string>,
  knownNames: Set<string>,
): Promise<FdiskMember[]> {
  type ListRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
  const collected: ListRow[] = [];
  const takenStNrs = new Set<string>();

  // Submit the search form; the navigation wait is armed together with the submit.
  const submitSearch = async (): Promise<void> => {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
  };

  // --- Phase 1: one exact-match query per known StNr ---
  log(`scrapeKnownMembers: fetching ${knownStNrs.size} known StNrs`);
  for (const stNr of knownStNrs) {
    const filterApplied = await frame.evaluate((wanted) => {
      const searchForm = (document as any).forms['frmsearch'];
      if (!searchForm) return false;
      const lowerBound = searchForm.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const upperBound = searchForm.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (!lowerBound || !upperBound) return false;
      lowerBound.value = wanted;
      upperBound.value = wanted;
      return true;
    }, stNr);

    if (!filterApplied) {
      log(` WARN: search form not usable for StNr ${stNr}`);
      continue;
    }

    await submitSearch();
    const found = await parseRowsFromTable(frame);
    for (const row of found) {
      if (!row.standesbuchNr || takenStNrs.has(row.standesbuchNr)) continue;
      takenStNrs.add(row.standesbuchNr);
      collected.push(row);
    }
    log(` StNr ${stNr}: ${found.length} row(s)`);

    // Short pause between queries to go easy on the server.
    await frame.page().waitForTimeout(300);
  }

  // --- Phase 2: one unfiltered query, matched by name ---
  if (knownNames.size > 0) {
    log(`scrapeKnownMembers: unfiltered fetch for ${knownNames.size} name-based matches`);

    // Reset both StNr filter fields.
    await frame.evaluate(() => {
      const searchForm = (document as any).forms['frmsearch'];
      if (!searchForm) return;
      const lowerBound = searchForm.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const upperBound = searchForm.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (lowerBound) lowerBound.value = '';
      if (upperBound) upperBound.value = '';
    });
    await submitSearch();

    const pageRows = await parseRowsFromTable(frame);
    let nameHits = 0;
    for (const row of pageRows) {
      const alreadyTaken = !row.standesbuchNr || takenStNrs.has(row.standesbuchNr);
      if (alreadyTaken) continue;
      const first = (row.vorname || '').toLowerCase();
      const last = (row.zuname || '').toLowerCase();
      if (!knownNames.has(`${first}::${last}`)) continue;
      takenStNrs.add(row.standesbuchNr);
      collected.push(row);
      nameHits++;
    }
    log(` Unfiltered page: ${pageRows.length} total rows, ${nameHits} name-matched`);
  }

  log(`scrapeKnownMembers: ${collected.length} members collected`);

  // Convert raw rows into FdiskMember objects; profile fields are filled in later.
  return collected
    .filter((row) => Boolean(row.standesbuchNr && row.vorname && row.zuname))
    .map((row): FdiskMember => {
      const abmeldedatum = parseDate(row.abmeldedatum);
      return {
        standesbuchNr: row.standesbuchNr,
        dienstgrad: row.dienstgrad,
        vorname: row.vorname,
        zuname: row.zuname,
        geburtsdatum: parseDate(row.geburtsdatum),
        svnr: row.svnr || null,
        eintrittsdatum: parseDate(row.eintrittsdatum),
        abmeldedatum,
        status: abmeldedatum ? 'ausgetreten' : 'aktiv',
        detailUrl: row.href,
        geburtsort: null,
        geschlecht: null,
        beruf: null,
        wohnort: null,
        plz: null,
      };
    });
}
/**
 * Run a full scrape: log in, open the member list, fetch the known members
 * and then, per member, visit the detail page and its sub-sections
 * (Ausbildungen, Beförderungen, Untersuchungen, Fahrgenehmigungen).
 *
 * A failure while processing one member is logged and does not stop the
 * remaining members. The browser is always closed, also on error.
 */
export async function scrapeAll(username: string, password: string, knownStNrs: Set<string>, knownNames: Set<string>): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
  befoerderungen: FdiskBefoerderung[];
  untersuchungen: FdiskUntersuchung[];
  fahrgenehmigungen: FdiskFahrgenehmigung[];
}> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });
  const context = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  });
  const page = await context.newPage();

  try {
    await login(page, username, password);

    // Once logged in the page shows Start.aspx (a frameset). Opening
    // MitgliedschaftenList.aspx directly triggers a server-side BLError because
    // the org context comes from session variables that the menu sets, so the
    // member list is reached through the main frame instead.
    const mainFrame = await navigateToMemberList(page);
    const members = await scrapeKnownMembers(mainFrame, knownStNrs, knownNames);
    log(`Found ${members.length} members (targeted query)`);
    if (DEBUG_HTML) log(`[debug] HTML dump mode ON — saving pages to debug/`);

    const ausbildungen: FdiskAusbildung[] = [];
    const befoerderungen: FdiskBefoerderung[] = [];
    const untersuchungen: FdiskUntersuchung[] = [];
    const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];

    for (const member of members) {
      try {
        // Reach the detail page: direct URL when the list row gave us one,
        // otherwise fall back to search + click.
        let reachedDetail: boolean;
        if (member.detailUrl) {
          await frame_goto(mainFrame, member.detailUrl);
          reachedDetail = true;
        } else {
          reachedDetail = await navigateToMemberDetailBySearch(mainFrame, member.standesbuchNr);
        }
        if (!reachedDetail) {
          log(` SKIP ${member.vorname} ${member.zuname} (${member.standesbuchNr}): could not reach detail page`);
          continue;
        }

        // Copy the extra profile fields from the detail form onto the member.
        const profile = await scrapeDetailProfileFields(mainFrame);
        member.geburtsort = profile.geburtsort;
        member.geschlecht = profile.geschlecht;
        member.beruf = profile.beruf;
        member.wohnort = profile.wohnort;
        member.plz = profile.plz;

        // Debug dump of the member detail page.
        await dumpHtml(mainFrame, `detail_StNr${member.standesbuchNr}`);

        // The detail URL carries the IDs needed to build the sub-section URLs.
        // Pattern: ?search=1&searchid_mitgliedschaften=X&id_personen=Y&id_mitgliedschaften=X&searchid_personen=Y&searchid_maskmode=
        const query = new URL(mainFrame.url()).searchParams;
        const idMitgliedschaft = query.get('id_mitgliedschaften');
        const idPersonen = query.get('id_personen');
        const idInstanzen = query.get('id_instanzen') ?? ID_INSTANZEN;

        // Ausbildungen
        const quals = await scrapeAusbildungenFromDetailPage(mainFrame, member, idMitgliedschaft, idPersonen);
        ausbildungen.push(...quals);

        // Beförderungen
        const befos = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberBefoerderungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen)
          : [];
        befoerderungen.push(...befos);

        // Untersuchungen
        const unters = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberUntersuchungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen)
          : [];
        untersuchungen.push(...unters);

        // Fahrgenehmigungen
        const fahrg = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberFahrgenehmigungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen, idInstanzen)
          : [];
        fahrgenehmigungen.push(...fahrg);

        log(` ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen, ${befos.length} Beförderungen, ${unters.length} Untersuchungen, ${fahrg.length} Fahrgenehmigungen`);
        await page.waitForTimeout(500);
      } catch (err) {
        log(` WARN: could not scrape detail for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen, befoerderungen, untersuchungen, fahrgenehmigungen };
  } finally {
    await browser.close();
  }
}
/** Point `frame` at `url` and resolve once the network has gone idle. */
async function frame_goto(frame: Frame, url: string): Promise<void> {
  const untilIdle = { waitUntil: 'networkidle' } as const;
  await frame.goto(url, untilIdle);
}
/**
 * Log in through the FDISK login form.
 *
 * Returns immediately when opening the login URL lands on a page whose URL
 * does not contain "login" (treated as already logged in). Throws when the
 * URL still contains "login" after the form has been submitted.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const isLoginUrl = (address: string): boolean => address.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // A URL without "login" means there is nothing left to do.
  const currentUrlBefore = page.url();
  if (!isLoginUrl(currentUrlBefore)) {
    log(`Already logged in, on: ${currentUrlBefore}`);
    return;
  }

  // Element IDs taken from the known login form markup.
  const userInput = page.locator('#login');
  const passInput = page.locator('#password');
  const submit = page.locator('#Submit2');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await passInput.fill(password);
  await submit.click();

  // Give the redirect away from the login page up to 15s.
  try {
    await page.waitForURL(
      (url) => !isLoginUrl(url.toString()),
      { waitUntil: 'networkidle', timeout: 15000 },
    );
  } catch {
    // Timed out — the URL check below decides whether the login worked.
  }

  // Final verdict based on where we ended up.
  const currentUrl = page.url();
  if (isLoginUrl(currentUrl)) {
    throw new Error(`Login failed — still on login page: ${currentUrl}`);
  }
  log(`Logged in successfully, redirected to: ${currentUrl}`);
}
/**
 * Reach a member's detail page without a direct URL: open the member list,
 * filter it to exactly `standesbuchNr`, and click the first result row.
 *
 * @returns true when the frame URL afterwards no longer looks like a list
 *          page, false otherwise (form unusable, no result row, or the click
 *          did not leave the list).
 */
async function navigateToMemberDetailBySearch(frame: Frame, standesbuchNr: string): Promise<boolean> {
  // Open the member list.
  await frame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await frame.waitForLoadState('networkidle');

  // Put the StNr into both the "from" and the "to" filter field.
  const filterApplied = await frame.evaluate((wanted) => {
    const searchForm = (document as any).forms['frmsearch'];
    if (!searchForm) return false;
    const lowerBound = searchForm.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
    const upperBound = searchForm.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
    if (!lowerBound || !upperBound) return false;
    lowerBound.value = wanted;
    upperBound.value = wanted;
    return true;
  }, standesbuchNr);
  if (!filterApplied) {
    log(` WARN navigateToMemberDetailBySearch: search form not usable for StNr ${standesbuchNr}`);
    return false;
  }

  await Promise.all([
    frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
    frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
  ]);

  // Clicking the first data row is expected to open the detail page.
  const clickTarget = await frame.$('table.FdcLayList tbody tr:first-child a, table.FdcLayList tbody tr:first-child td');
  if (!clickTarget) {
    log(` WARN navigateToMemberDetailBySearch: no result row for StNr ${standesbuchNr}`);
    return false;
  }

  try {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }),
      clickTarget.click(),
    ]);
  } catch {
    // The wait can time out when the click does not navigate (e.g. onclick
    // vs href); the URL inspection below is what decides the outcome.
  }

  const url = frame.url();
  const stillOnList = url.includes('MitgliedschaftenList') || url.includes('meine_Mitglieder');
  if (stillOnList) {
    log(` WARN navigateToMemberDetailBySearch: still on list page after click for StNr ${standesbuchNr}`);
    return false;
  }
  log(` Navigated to detail via search+click: ${url}`);
  return true;
}
/**
 * Load the member list inside the frame named `mainFrame` and return that frame.
 *
 * Throws when the frame does not exist or when the resulting URL contains one
 * of the error-page markers (`BLError`, `support.aspx`, `Error`).
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const mainFrame = page.frame({ name: 'mainFrame' });
  if (mainFrame === null) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await mainFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await mainFrame.waitForLoadState('networkidle');

  const url = mainFrame.url();
  const title = await mainFrame.title();
  log(`mainFrame loaded: ${url} — title: "${title}"`);

  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  if (errorMarkers.some((marker) => url.includes(marker))) {
    throw new Error(`Member list returned error page: ${url}`);
  }
  return mainFrame;
}
/**
 * Scrape the complete member list shown in `frame`.
 *
 * Steps:
 *  1. If the `frmsearch` form is present, clear every visible field whose
 *     name or id contains "standesbuch", try to select the largest page
 *     size, and submit the form.
 *  2. Parse the first result page and read the "X-Y von Z" pagination text.
 *  3. If fewer rows are shown than the total, walk the StNr space in fixed
 *     ranges (15 numbers per query) until all members are collected, the
 *     upper bound 9999 is passed, or five consecutive ranges bring no new rows.
 *
 * Rows lacking StNr, Vorname or Zuname are dropped. `status` is derived
 * from the presence of a parseable Abmeldedatum.
 *
 * @throws when `table.FdcLayList` does not appear within 20s, or when a
 *         form submit does not lead to a navigation within 30s.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // Clear the Standesbuchnummer filter if the search form is present.
  // FDISK pre-fills the logged-in user's own Standesbuchnummer, which limits results to 1 member.
  // We clear it before submitting so all members of the fire station are returned.
  const hasForm = await frame.$('form[name="frmsearch"]') !== null;
  if (hasForm) {
    const fieldDump = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return { cleared: [], pageSizeSet: null as string | null, allFields: [] };
      const cleared: string[] = [];
      const allFields: string[] = [];
      let pageSizeSet: string | null = null;
      for (const el of Array.from(form.elements) as HTMLInputElement[]) {
        if (el.type === 'hidden') continue;
        const name = (el.name ?? '').toLowerCase();
        const id = (el.id ?? '').toLowerCase();
        // Record the pre-clear value of every non-empty field for the log.
        if (el.value) allFields.push(`${el.name || el.id}=${el.value}`);
        if (name.includes('standesbuch') || id.includes('standesbuch')) {
          el.value = '';
          cleared.push(el.name || el.id);
        }
        // Maximize page size: look for a select AND its paired hidden input
        // FDISK uses a custom Dd widget where <select name="xDd_dd"> is the visible dropdown
        // but the actual POST value comes from <input type="hidden" name="xDd_id"> or similar.
        if ((name.includes('anzahl') || id.includes('anzahl') ||
          name.includes('pagesize') || id.includes('pagesize') ||
          name.includes('rows') || id.includes('rows')) &&
          el.tagName === 'SELECT') {
          const select = el as unknown as HTMLSelectElement;
          // Pick the largest numeric option value, or the last option as fallback
          let bestOption: HTMLOptionElement | null = null;
          let bestVal = -1;
          for (const opt of Array.from(select.options)) {
            const n = parseInt(opt.value, 10);
            if (!isNaN(n) && n > bestVal) { bestVal = n; bestOption = opt; }
          }
          if (!bestOption && select.options.length > 0) {
            bestOption = select.options[select.options.length - 1];
          }
          if (bestOption) {
            select.value = bestOption.value;
            pageSizeSet = `${el.name || el.id}=${bestOption.value}`;
            // Also update the paired hidden field used by the Dd custom widget.
            // Common patterns: xDd_dd → xDd_id or xDd_hd
            const baseName = (el.name || el.id).replace(/_dd$/i, '');
            for (const suffix of ['_id', '_hd', '_val']) {
              const hidden = form.elements[baseName + suffix] as HTMLInputElement | undefined;
              if (hidden && hidden.type === 'hidden') {
                hidden.value = bestOption.value;
                pageSizeSet += ` (also set ${baseName + suffix})`;
              }
            }
          }
        }
      }
      return { cleared, pageSizeSet, allFields };
    });

    if (fieldDump.allFields.length > 0) {
      log(`Search form active filters before clear: ${fieldDump.allFields.join(', ')}`);
    }
    if (fieldDump.cleared.length > 0) {
      log(`Cleared Standesbuchnummer filter fields: ${fieldDump.cleared.join(', ')}`);
    } else {
      log('Search form found — no Standesbuchnummer field detected, submitting as-is');
    }
    if (fieldDump.pageSizeSet) {
      log(`Set page size: ${fieldDump.pageSizeSet}`);
    } else {
      log('No page size field found — will paginate through all results');
    }

    // Use Promise.all to start waiting for navigation BEFORE triggering the submit,
    // otherwise waitForLoadState resolves against the already-idle current page.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
    log(`After form submit: ${frame.url()}`);
  }

  // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
  const firstRows = await parseRowsFromTable(frame);
  log(`Initial fetch: ${firstRows.length} rows`);

  // Log href debug info for the first row to diagnose URL extraction
  const rowDebug = await frame.evaluate(() => (window as any).__fdiskFirstRowDebug ?? 'no debug info');
  log(`Row href debug: ${rowDebug}`);
  for (const row of firstRows) {
    log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
  }

  // Pagination text has the form "<first>-<last> von <total>".
  const pagination = await frame.evaluate(() =>
    document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
  );
  log(`Pagination: "${pagination}"`);
  const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
  const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
  const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;

  const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
  const allRows: ParsedRow[] = [...firstRows];

  // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
  if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
    log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
    const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
    const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
    let startNr = 1;
    let consecutiveEmpty = 0;
    while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
      const endNr = startNr + BATCH - 1;

      // Set StNr range in the search form and submit
      const formOk = await frame.evaluate(({ s, e }: { s: number; e: number }) => {
        const form = (document as any).forms['frmsearch'];
        if (!form) return false;
        const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
        const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
        if (!fromFld || !toFld) return false;
        fromFld.value = String(s);
        toFld.value = String(e);
        return true;
      }, { s: startNr, e: endNr });
      if (!formOk) {
        log('WARN: could not set StNr range fields — aborting range queries');
        break;
      }

      await Promise.all([
        frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
        frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
      ]);

      const rangeRows = await parseRowsFromTable(frame);
      const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
      newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
      allRows.push(...newRows);
      // Separator between the bounds is required: without it the range 1..15 was logged as "StNr 115".
      log(`StNr ${startNr}-${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
      for (const row of newRows) {
        log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
      }

      // A run of empty ranges ends the scan early.
      consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
      startNr = endNr + 1;
    }
    log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
  }

  log(`Parsed ${allRows.length} raw rows total`);

  // Convert raw rows into FdiskMember objects; profile fields start out as null.
  const members: FdiskMember[] = [];
  for (const row of allRows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
/**
 * Read every row of `table.FdcLayList` in the frame into a plain object.
 *
 * Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
 * 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum,
 * 9=Abmeldedatum, 10=icon. A cell's value is the `title` of its <a> when
 * that is non-empty, otherwise the cell's text content.
 *
 * `href` is the row's detail URL, taken from the first usable <a href>, or,
 * failing that, from the first `.aspx` URL quoted in an `onclick` attribute
 * on the row, its anchors or its cells; null when neither exists.
 *
 * Side effect in the page: for the first row (when it has a StNr) a
 * diagnostic string is stored on `window.__fdiskFirstRowDebug`.
 */
async function parseRowsFromTable(frame: Frame) {
  return frame.$$eval('table.FdcLayList tbody tr', (tableRows) =>
    tableRows.map((tableRow, index) => {
      const tds = Array.from(tableRow.querySelectorAll('td'));
      const read = (col: number) => {
        const cell = tds[col];
        const anchorTitle = cell?.querySelector('a')?.getAttribute('title')?.trim();
        // An empty title falls through to the cell's text content.
        return (anchorTitle || cell?.textContent || '').trim();
      };

      let detailHref: string | null = null;
      let trace = '';

      // Strategy 1: a regular <a href> that is neither empty, '#', nor a javascript: URL.
      for (const anchor of Array.from(tableRow.querySelectorAll('a'))) {
        const attr = (anchor as Element).getAttribute('href') ?? '';
        trace += `a.href="${attr}" `;
        const usable = attr !== '' && attr !== '#' && !attr.startsWith('javascript:');
        if (usable) {
          detailHref = (anchor as HTMLAnchorElement).href; // property form is already absolute
          break;
        }
      }

      // Strategy 2: an .aspx URL quoted inside an onclick handler on the row, its anchors or its cells.
      if (!detailHref) {
        const clickables: Element[] = [tableRow, ...Array.from(tableRow.querySelectorAll('a, td'))];
        for (const node of clickables) {
          const handler = node.getAttribute('onclick') ?? '';
          if (handler) trace += `onclick="${handler}" `;
          const quoted = handler.match(/['"]([^'"]*\.aspx[^'"]*)['"]/);
          if (!quoted) continue;
          try {
            detailHref = new URL(quoted[1], (window as Window).location.href).href;
          } catch {
            detailHref = quoted[1];
          }
          break;
        }
      }

      // Keep diagnostics for the first data row so href extraction problems can be inspected.
      if (index === 0 && read(2)) {
        (window as any).__fdiskFirstRowDebug = `StNr=${read(2)} href=${detailHref} debug=${trace}`;
      }

      return {
        status: read(1),
        standesbuchNr: read(2),
        dienstgrad: read(3),
        vorname: read(4),
        zuname: read(5),
        geburtsdatum: read(6),
        svnr: read(7),
        eintrittsdatum: read(8),
        abmeldedatum: read(9),
        href: detailHref,
      };
    }),
  );
}
/**
 * Read the extra profile fields from the member detail form the frame is
 * currently showing. Each field tries a list of selectors in order and takes
 * the first non-empty value; a field with no match is null.
 */
async function scrapeDetailProfileFields(frame: Frame): Promise<{
  geburtsort: string | null;
  geschlecht: string | null;
  beruf: string | null;
  wohnort: string | null;
  plz: string | null;
}> {
  return frame.evaluate(() => {
    // Value of the element matching `selector`: the selected option's text
    // (or value) for a <select>, the trimmed `value` otherwise.
    const read = (selector: string): string | null => {
      const node = document.querySelector(selector) as HTMLInputElement | HTMLSelectElement | null;
      if (!node) return null;
      if (node.tagName !== 'SELECT') {
        return (node as HTMLInputElement).value?.trim() || null;
      }
      const dropdown = node as HTMLSelectElement;
      const chosen = dropdown.options[dropdown.selectedIndex];
      if (!chosen) return null;
      return (chosen.text || chosen.value || '').trim() || null;
    };

    // First non-null result among the selectors, evaluated left to right.
    const firstOf = (...selectors: string[]): string | null => {
      for (const selector of selectors) {
        const found = read(selector);
        if (found !== null) return found;
      }
      return null;
    };

    return {
      geburtsort: firstOf('input[name="geburtsort"]', 'input[id*="geburtsort"]'),
      geschlecht: firstOf('select[name*="geschlecht"]', 'select[id*="geschlecht"]'),
      beruf: firstOf('input[name="beruf"]', 'input[id*="beruf"]'),
      wohnort: firstOf('input[name="ort"]', 'input[id*="_ort"]', 'input[name="wohnort"]'),
      plz: firstOf('input[name="plz"]', 'input[id*="plz"]'),
    };
  });
}
/**
 * Scrape a member's Ausbildungen by navigating the frame to
 * AusbildungenListEdit.aspx and reading the largest table on that page.
 *
 * Cell values are taken from text inputs, selects, anchor titles or plain
 * text, in that order. Columns are located via header text; when no
 * course-name header is found, a data-driven fallback guesses the columns.
 *
 * @param frame            Frame to navigate; it is left on the Ausbildungen page.
 * @param member           Member whose `standesbuchNr` is stamped on every result.
 * @param idMitgliedschaft `id_mitgliedschaften` URL parameter; required.
 * @param idPersonen       `id_personen` URL parameter; required.
 * @returns The parsed Ausbildungen with ISO dates. Empty when an ID is
 *          missing, when an error page is detected, or when the in-page
 *          extraction throws.
 */
async function scrapeAusbildungenFromDetailPage(
frame: Frame,
member: FdiskMember,
idMitgliedschaft?: string | null,
idPersonen?: string | null,
): Promise<FdiskAusbildung[]> {
// If we don't have the IDs, we cannot navigate to the Ausbildungen page
if (!idMitgliedschaft || !idPersonen) {
log(` Ausbildungen for StNr ${member.standesbuchNr}: missing mitgliedschaft/personen IDs, skipping`);
return [];
}
// Same query-string pattern as the member detail page, pointed at the Ausbildungen module.
const url = `${BASE_URL}/fdisk/module/mgvw/ausbildungen/AusbildungenListEdit.aspx`
+ `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
+ `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`;
await frame_goto(frame, url);
const landed = frame.url();
const title = await frame.title().catch(() => '');
// Error pages are recognised by URL markers or by "fehler" in the page title.
if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
log(` → Ausbildungen ERROR page: ${landed}`);
return [];
}
// Dump HTML for debugging
await dumpHtml(frame, `ausbildungen_StNr${member.standesbuchNr}`);
// Everything inside this callback runs in the page, so it cannot use helpers from this file.
const ausbildungen = await frame.evaluate((stNr: string) => {
// Value of one cell: non-empty text input, else selected option, else anchor title, else text content.
const extractCellValue = (cell: Element): string => {
const input = cell.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
if (input && input.value?.trim()) return input.value.trim();
const sel = cell.querySelector('select') as HTMLSelectElement | null;
if (sel) {
// Try selectedIndex first
const idx = sel.selectedIndex;
if (idx >= 0 && sel.options[idx]) {
const t = (sel.options[idx].text || sel.options[idx].value || '').trim();
if (t) return t;
}
// Fallback: read the selected attribute directly from HTML
const selectedOpt = sel.querySelector('option[selected]') as HTMLOptionElement | null;
if (selectedOpt) {
const t = (selectedOpt.text || selectedOpt.value || '').trim();
if (t) return t;
}
}
const anchor = cell.querySelector('a');
const atitle = anchor?.getAttribute('title')?.trim();
if (atitle) return atitle;
// Non-breaking spaces are turned into regular spaces before trimming.
return cell.textContent?.replace(/\u00A0/g, ' ').trim() ?? '';
};
const results: Array<{
standesbuchNr: string;
kursname: string | null;
kursDatum: string | null;
ablaufdatum: string | null;
ort: string | null;
bemerkung: string | null;
syncKey: string;
}> = [];
// Scan every table and keep the one with the most data rows as the data table.
const tables = Array.from(document.querySelectorAll('table'));
let bestRows: Array<{ cells: string[] }> = [];
let bestHeaders: string[] = [];
for (const table of tables) {
const rows: Array<{ cells: string[] }> = [];
const headerCells: string[] = [];
// Get headers
for (const th of Array.from(table.querySelectorAll('thead th, tr:first-child th'))) {
headerCells.push(extractCellValue(th));
}
// Get data rows
for (const tr of Array.from(table.querySelectorAll('tr'))) {
// Ignore rows that belong to a nested table.
if (tr.closest('table') !== table) continue;
const tds = Array.from(tr.querySelectorAll('td'));
// Rows with fewer than two cells, or containing any <th>, are not data rows.
if (tds.length < 2) continue;
if (tr.querySelectorAll('th').length > 0) continue;
rows.push({ cells: tds.map(td => extractCellValue(td)) });
}
if (rows.length > bestRows.length) {
bestRows = rows;
bestHeaders = headerCells;
}
}
if (bestRows.length === 0) return results;
// Try to find column indices from headers
// NOTE(review): these are substring matches on the first matching header — 'datum' also
// matches a header containing "ablaufdatum", and 'ort' matches any header containing
// those letters. Confirm against the real column headers.
const hdr = bestHeaders.map(h => h.toLowerCase());
let kursnameIdx = hdr.findIndex(h => h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung'));
let datumIdx = hdr.findIndex(h => h.includes('datum') || h.includes('abschluss'));
let ablaufIdx = hdr.findIndex(h => h.includes('ablauf') || h.includes('gültig'));
let ortIdx = hdr.findIndex(h => h.includes('ort'));
let bemIdx = hdr.findIndex(h => h.includes('bem') || h.includes('info'));
// If headers didn't help, scan data for date-like columns and text columns
if (kursnameIdx === -1 && bestRows.length > 0) {
const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
// Classify columns using the first three rows only.
const dateCols = new Set<number>();
const textCols: number[] = [];
for (const row of bestRows.slice(0, 3)) {
for (let ci = 0; ci < row.cells.length; ci++) {
const v = row.cells[ci]?.trim();
if (!v) continue;
if (datePattern.test(v)) dateCols.add(ci);
else if (v.length > 2 && !/^[\d.,]+$/.test(v)) textCols.push(ci);
}
}
// The text column with the longest value in the FIRST row is taken as the Kursname
if (textCols.length > 0) {
let maxLen = 0;
for (const ci of textCols) {
const len = (bestRows[0]?.cells[ci] ?? '').length;
if (len > maxLen) { maxLen = len; kursnameIdx = ci; }
}
}
// First date column is Datum, second is Ablaufdatum
const sortedDates = Array.from(dateCols).sort((a, b) => a - b);
if (sortedDates.length > 0 && datumIdx === -1) datumIdx = sortedDates[0];
if (sortedDates.length > 1 && ablaufIdx === -1) ablaufIdx = sortedDates[1];
}
for (const row of bestRows) {
// Without a Kursname column index, column 0 is used as the course name.
const kursname = ((kursnameIdx >= 0 ? row.cells[kursnameIdx] : row.cells[0])?.trim()) || '';
if (!kursname) continue;
// Skip header-like rows
if (/kurs|ausbildung|bezeichnung|datensätze|tiefennavigation/i.test(kursname)) continue;
const rawDatum = datumIdx >= 0 ? row.cells[datumIdx]?.trim() : null;
const rawAblauf = ablaufIdx >= 0 ? row.cells[ablaufIdx]?.trim() : null;
const rawOrt = ortIdx >= 0 ? row.cells[ortIdx]?.trim() || null : null;
const rawBem = bemIdx >= 0 ? row.cells[bemIdx]?.trim() || null : null;
// parseDate is not available inside evaluate; return raw values
results.push({
standesbuchNr: stNr,
kursname,
kursDatum: rawDatum || null,
ablaufdatum: rawAblauf || null,
ort: rawOrt,
bemerkung: rawBem,
syncKey: `${stNr}::${kursname}::${rawDatum ?? ''}`,
});
}
return results;
// A failing evaluate is swallowed here and treated as "no Ausbildungen".
}, member.standesbuchNr).catch(() => [] as FdiskAusbildung[]);
// Post-process: parse dates and rebuild syncKeys from the ISO date
// (the syncKey computed in the page used the raw DD.MM.YYYY value and is discarded).
const results: FdiskAusbildung[] = ausbildungen.filter((a): a is typeof a & { kursname: string } => !!a.kursname).map(a => {
const kursDatum = parseDate(a.kursDatum);
return {
standesbuchNr: a.standesbuchNr,
kursname: a.kursname,
kursDatum,
ablaufdatum: parseDate(a.ablaufdatum),
ort: a.ort,
bemerkung: a.bemerkung,
syncKey: `${a.standesbuchNr}::${a.kursname}::${kursDatum ?? ''}`,
};
});
// Debug: dump HTML when no Ausbildungen found
if (results.length === 0) {
await dumpHtml(frame, `ausbildungen_empty_StNr${member.standesbuchNr}`);
}
return results;
}
/**
 * Navigate the frame to a sub-section list page and extract its dated data rows.
 *
 * Cell values are collected from every table on the page: text inputs and
 * selects are read by value (inline-edit pages), otherwise the `title` of the
 * first anchor in the cell is preferred over the cell's text content. Rows
 * from tables whose class contains `FdcLayList` are preferred; when there are
 * none, rows from all tables are used. The date column is the first column,
 * in the first row that has one, whose value is exactly `DD.MM.YYYY`; only
 * rows that carry a date in that same column are returned.
 *
 * The table inventory and the row counts are logged on every call so that
 * wrong-page issues are visible.
 *
 * @param frame Frame to navigate and read from.
 * @param url   URL of the list page to load.
 * @returns The dated rows together with the detected date column index
 *          (`rows` is empty and `dateColIdx` is `-1` when no date column was
 *          found), or `null` when FDISK answered with an error page.
 */
async function navigateAndGetTableRows(
  frame: Frame,
  url: string,
): Promise<{ rows: Array<{ cells: string[] }>; dateColIdx: number } | null> {
  await frame_goto(frame, url);
  const landed = frame.url();
  const title = await frame.title().catch(() => '');

  // Check for FDISK error pages
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → ERROR page: ${landed}`);
    return null;
  }

  // Log class, id and row count of every table on the page for diagnostics
  const tableInfo = await frame.evaluate(() => {
    return Array.from(document.querySelectorAll('table')).map((t, i) => {
      const cls = t.className || '(no class)';
      const id = t.id || '';
      const rowCount = t.querySelectorAll('tr').length;
      return `${i}:cls="${cls}"${id ? ` id="${id}"` : ''} rows=${rowCount}`;
    }).join(' | ');
  }).catch(() => 'N/A');
  log(` → tables: ${tableInfo}`);

  // Collect rows from ALL tables, reading input/select values for inline-edit pages
  const allRows = await frame.evaluate(() => {
    const results: Array<{ cells: string[]; tableClass: string }> = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      for (const tr of Array.from(table.querySelectorAll('tr'))) {
        // Skip rows that are nested inside a child table
        if (tr.closest('table') !== table) continue;
        // Only the row's own cells: a descendant query would also pull in the
        // cells of any table nested inside this row and shift the column indices.
        const tds = Array.from(tr.querySelectorAll(':scope > td'));
        if (tds.length < 2) continue; // skip single-cell nav/header rows
        results.push({
          tableClass: cls,
          cells: tds.map(td => {
            const input = td.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
            if (input) return input.value?.trim() ?? '';
            const sel = td.querySelector('select') as HTMLSelectElement | null;
            if (sel) {
              const opt = sel.options[sel.selectedIndex];
              return (opt?.text || opt?.value || '').trim();
            }
            // For FDISK list tables, the value is in <a title="..."> inside each cell
            const anchor = td.querySelector('a');
            const atitle = anchor?.getAttribute('title')?.trim();
            if (atitle) return atitle;
            return td.textContent?.trim() ?? '';
          }),
        });
      }
    }
    return results;
  }).catch((err: unknown) => {
    // Keep the best-effort behaviour (treat as "no rows") but make the failure visible.
    log(` → failed to read table rows: ${err instanceof Error ? err.message : String(err)}`);
    return [] as Array<{ cells: string[]; tableClass: string }>;
  });

  // Prefer rows from FdcLayList-class tables
  const fdcRows = allRows.filter(r => r.tableClass.includes('FdcLayList'));
  const resultRows = fdcRows.length > 0 ? fdcRows : allRows;

  // Replace \u00A0 (non-breaking space) with a regular space in all cell values and trim
  const mapped = resultRows.map(r => ({
    cells: r.cells.map(c => c.replace(/\u00A0/g, ' ').trim()),
  }));

  // Find date column dynamically: look for a DD.MM.YYYY pattern in any column
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  let dateColIdx = -1;
  for (const r of mapped) {
    dateColIdx = r.cells.findIndex(c => datePattern.test(c ?? ''));
    if (dateColIdx >= 0) break;
  }

  // Only rows with a date in the detected column count as data rows
  const dataRows = dateColIdx >= 0
    ? mapped.filter(r => datePattern.test(r.cells[dateColIdx] ?? ''))
    : [];
  log(` → ${allRows.length} total rows, ${fdcRows.length} FdcLayList rows, ${dataRows.length} data rows (date in col ${dateColIdx})`);

  // Debug: dump HTML when no data rows found
  if (dataRows.length === 0) {
    const urlSlug = url.split('/').pop()?.split('?')[0] ?? 'unknown';
    await dumpHtml(frame, `navigateAndGetTableRows_${urlSlug}`);
  }
  return { rows: dataRows, dateColIdx };
}
/**
 * Scrape the promotion history (Beförderungen) of a single member.
 *
 * Loads the member's Beförderungen list page via navigateAndGetTableRows and
 * converts every returned row into a record. The Dienstgrad is the first cell
 * to the right of the date column for which cellText yields a non-empty value.
 *
 * NOTE(review): a row with no such cell is still emitted with an empty
 * `dienstgrad` (and a sync key of the form `StNr::::date`) — confirm that this
 * is intended.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number, copied into each record and its sync key.
 * @param idMitgliedschaft Membership id inserted into the list URL.
 * @param idPersonen       Person id inserted into the list URL.
 * @returns One record per row; empty when navigateAndGetTableRows returned `null`.
 */
async function scrapeMemberBefoerderungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskBefoerderung[]> {
  const query = [
    'search=1',
    `searchid_mitgliedschaften=${idMitgliedschaft}`,
    `id_personen=${idPersonen}`,
    `id_mitgliedschaften=${idMitgliedschaft}`,
    `searchid_personen=${idPersonen}`,
    'searchid_maskmode=',
  ].join('&');
  const listUrl = `${BASE_URL}/fdisk/module/mgvw/befoerderungen/befoerderungenList.aspx?${query}`;

  const table = await navigateAndGetTableRows(frame, listUrl);
  if (!table) return [];

  const promotions = table.rows.map((row): FdiskBefoerderung => {
    const datum = parseDate(row.cells[table.dateColIdx]);

    // Walk the cells right of the date and stop at the first one with content.
    let dienstgrad = '';
    for (const raw of row.cells.slice(table.dateColIdx + 1)) {
      const text = cellText(raw);
      if (text) {
        dienstgrad = text;
        break;
      }
    }

    return {
      standesbuchNr,
      datum,
      dienstgrad,
      syncKey: `${standesbuchNr}::${dienstgrad}::${datum ?? ''}`,
    };
  });

  log(` Beförderungen for StNr ${standesbuchNr}: ${promotions.length} rows`);
  promotions.forEach(p => {
    log(` ${p.datum ?? '—'} ${p.dienstgrad}`);
  });
  return promotions;
}
/**
 * Scrape the medical examinations (Untersuchungen) of a single member.
 *
 * Loads the member's Untersuchungen list page via navigateAndGetTableRows.
 * For each row, the cells right of the date column are passed through
 * cellText and the non-null results are read positionally as
 * Anmerkungen, Untersuchungsart, Tauglichkeitsstufe. Rows without an
 * Untersuchungsart are skipped.
 *
 * NOTE(review): if cellText returns null for empty cells (not visible from
 * here), a row with an empty Anmerkungen cell would shift Untersuchungsart
 * into `anmerkungen` and Tauglichkeitsstufe into `art` — verify against the
 * real table layout.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number, copied into each record and its sync key.
 * @param idMitgliedschaft Membership id inserted into the list URL.
 * @param idPersonen       Person id inserted into the list URL.
 * @returns The parsed examinations; empty when navigateAndGetTableRows returned `null`.
 */
async function scrapeMemberUntersuchungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskUntersuchung[]> {
  const query = [
    'search=1',
    `searchid_mitgliedschaften=${idMitgliedschaft}`,
    `id_personen=${idPersonen}`,
    `id_mitgliedschaften=${idMitgliedschaft}`,
    `searchid_personen=${idPersonen}`,
    'searchid_maskmode=',
  ].join('&');
  const listUrl = `${BASE_URL}/fdisk/module/mgvw/untersuchungen/UntersuchungenList.aspx?${query}`;

  const table = await navigateAndGetTableRows(frame, listUrl);
  if (!table) return [];

  const exams: FdiskUntersuchung[] = [];
  for (const { cells } of table.rows) {
    // Keep only the cells after the date for which cellText produced a value.
    const filled: string[] = [];
    for (const raw of cells.slice(table.dateColIdx + 1)) {
      const text = cellText(raw);
      if (text !== null) filled.push(text);
    }

    // Positional mapping relative to the date column (spacer columns are
    // expected to have been dropped above): remark, exam type, fitness level.
    const [anmerkungen = null, art = null, ergebnis = null] = filled;
    if (!art) continue;

    const datum = parseDate(cells[table.dateColIdx]);
    exams.push({
      standesbuchNr,
      datum,
      anmerkungen,
      art,
      ergebnis,
      syncKey: `${standesbuchNr}::${art}::${datum ?? ''}`,
    });
  }

  log(` Untersuchungen for StNr ${standesbuchNr}: ${exams.length} rows`);
  exams.forEach(u => {
    log(` ${u.datum ?? '—'} [${u.art}] ${u.ergebnis ?? '—'} | ${u.anmerkungen ?? ''}`);
  });
  return exams;
}
/**
 * Navigate to the Gesetzliche Fahrgenehmigungen sub-page and scrape all entries.
 *
 * This page is a ListEdit page with form fields named by row index pattern:
 *   ausstellungsdatum_{i}, gueltig_bis_{i}, behoerde_{i}, nummer_{i}, id_fahrgenehmigungsklassen_{i}
 * Those fields are read for i = 0..99, stopping at the first index for which
 * none of the five fields exists. If that yields no rows at all, the function
 * falls back to table-based parsing: columns are located by header text and,
 * for Klasse and Ausstellungsdatum, by inspecting the first three data rows.
 *
 * In both paths, rows without a Klasse are skipped.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number, copied into each record and its sync key.
 * @param idMitgliedschaft Membership id inserted into the page URL.
 * @param idPersonen       Person id inserted into the page URL.
 * @param idInstanzen      Instance id inserted into the page URL.
 * @returns The parsed driving licences; empty when FDISK answered with an
 *          error page, no usable table was found, or the Klasse column could
 *          not be determined.
 */
async function scrapeMemberFahrgenehmigungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
  idInstanzen: string,
): Promise<FdiskFahrgenehmigung[]> {
  const url = `${BASE_URL}/fdisk/module/mgvw/ges_fahrgenehmigungen/Ges_fahrgenehmigungenListEdit.aspx`
    + `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`
    + `&searchid_instanzen=${idInstanzen}`;
  await frame_goto(frame, url);
  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → Fahrgenehmigungen ERROR page: ${landed}`);
    return [];
  }

  // Dump HTML for diagnostics
  await dumpHtml(frame, `fahrgenehmigungen_StNr${standesbuchNr}`);

  // Read form fields by ID pattern: {fieldname}_{rowIndex}
  const rawRows = await frame.evaluate(() => {
    const rows: Array<{
      ausstellungsdatum: string;
      gueltigBis: string;
      behoerde: string;
      nummer: string;
      klasse: string;
    }> = [];
    for (let i = 0; i < 100; i++) {
      // Try to find any field for this row index — if none exist, we've passed all rows
      const ausstellungEl = document.querySelector(`input[name="ausstellungsdatum_${i}"], input[id="ausstellungsdatum_${i}"]`) as HTMLInputElement | null;
      const gueltigEl = document.querySelector(`input[name="gueltig_bis_${i}"], input[id="gueltig_bis_${i}"]`) as HTMLInputElement | null;
      const behoerdeEl = document.querySelector(`input[name="behoerde_${i}"], input[id="behoerde_${i}"]`) as HTMLInputElement | null;
      const nummerEl = document.querySelector(`input[name="nummer_${i}"], input[id="nummer_${i}"]`) as HTMLInputElement | null;
      const klasseEl = document.querySelector(`select[name="id_fahrgenehmigungsklassen_${i}"], select[id="id_fahrgenehmigungsklassen_${i}"]`) as HTMLSelectElement | null;

      // If no field found at all, stop
      if (!ausstellungEl && !gueltigEl && !behoerdeEl && !nummerEl && !klasseEl) break;

      // Read klasse from select: try selectedIndex, then fallback to [selected] attribute,
      // then the select's raw value
      let klasse = '';
      if (klasseEl) {
        const idx = klasseEl.selectedIndex;
        if (idx >= 0 && klasseEl.options[idx]) {
          klasse = (klasseEl.options[idx].text || klasseEl.options[idx].value || '').trim();
        }
        if (!klasse) {
          const selectedOpt = klasseEl.querySelector('option[selected]') as HTMLOptionElement | null;
          if (selectedOpt) {
            klasse = (selectedOpt.text || selectedOpt.value || '').trim();
          }
        }
        if (!klasse && klasseEl.value?.trim()) {
          klasse = klasseEl.value.trim();
        }
      }

      rows.push({
        ausstellungsdatum: ausstellungEl?.value?.trim() ?? '',
        gueltigBis: gueltigEl?.value?.trim() ?? '',
        behoerde: behoerdeEl?.value?.trim() ?? '',
        nummer: nummerEl?.value?.trim() ?? '',
        klasse,
      });
    }
    return rows;
  }).catch(() => [] as Array<{ ausstellungsdatum: string; gueltigBis: string; behoerde: string; nummer: string; klasse: string }>);
  log(` → Fahrgenehmigungen form-field extraction: ${rawRows.length} rows found`);

  // If form-field approach found rows, use them
  if (rawRows.length > 0) {
    const results: FdiskFahrgenehmigung[] = [];
    for (const row of rawRows) {
      const klasse = cellText(row.klasse);
      if (!klasse) continue;
      const ausstellungsdatum = parseDate(row.ausstellungsdatum);
      const syncKey = `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`;
      results.push({
        standesbuchNr,
        ausstellungsdatum,
        gueltigBis: parseDate(row.gueltigBis),
        behoerde: cellText(row.behoerde),
        nummer: cellText(row.nummer),
        klasse,
        syncKey,
      });
    }
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${results.length} rows`);
    for (const f of results) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
    return results;
  }

  // Fallback: table-based parsing (original approach with extractCellValue)
  log(` → Fahrgenehmigungen: no form fields found, falling back to table parsing`);
  const pageData = await frame.evaluate(() => {
    // Value of a cell: non-empty text input, else selected option of a select,
    // else the title of the first anchor, else the cell's text content.
    const extractCellValue = (cell: Element): string => {
      const input = cell.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
      if (input && input.value?.trim()) return input.value.trim();
      const sel = cell.querySelector('select') as HTMLSelectElement | null;
      if (sel) {
        const idx = sel.selectedIndex;
        if (idx >= 0 && sel.options[idx]) {
          const t = (sel.options[idx].text || sel.options[idx].value || '').trim();
          if (t) return t;
        }
        // Fallback: read the selected attribute directly from HTML
        const selectedOpt = sel.querySelector('option[selected]') as HTMLOptionElement | null;
        if (selectedOpt) {
          const t = (selectedOpt.text || selectedOpt.value || '').trim();
          if (t) return t;
        }
        if (sel.value?.trim()) return sel.value.trim();
      }
      const anchor = cell.querySelector('a');
      const atitle = anchor?.getAttribute('title')?.trim();
      if (atitle) return atitle;
      return cell.textContent?.trim() ?? '';
    };

    const tables: Array<{
      tableClass: string;
      headers: string[];
      rows: Array<{ cells: string[] }>;
    }> = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      const thElements = Array.from(table.querySelectorAll('thead th, tr th'));
      const headers = thElements.map(th => extractCellValue(th));
      const dataRows: Array<{ cells: string[] }> = [];
      for (const tr of Array.from(table.querySelectorAll('tr'))) {
        // Skip rows of nested tables, rows with fewer than two cells, and header rows
        if (tr.closest('table') !== table) continue;
        const tds = Array.from(tr.querySelectorAll('td'));
        if (tds.length < 2) continue;
        if (tr.querySelectorAll('th').length > 0) continue;
        dataRows.push({ cells: tds.map(td => extractCellValue(td)) });
      }
      tables.push({ tableClass: cls, headers, rows: dataRows });
    }
    return tables;
  }).catch(() => [] as Array<{ tableClass: string; headers: string[]; rows: Array<{ cells: string[] }> }>);

  // Diagnostic: log all tables found
  for (let ti = 0; ti < pageData.length; ti++) {
    const t = pageData[ti];
    log(` → table ${ti}: cls="${t.tableClass}" headers=[${t.headers.join(', ')}] dataRows=${t.rows.length}`);
    for (let ri = 0; ri < t.rows.length; ri++) {
      const preview = t.rows[ri].cells.slice(0, 8).map((c, j) => `[${j}]="${c}"`).join(' ');
      log(` row ${ri}: ${preview}`);
    }
  }

  // Prefer the first non-empty FdcLayList table, otherwise the table with the most data rows
  const bestTable = pageData.find(t => t.tableClass.includes('FdcLayList') && t.rows.length > 0)
    || pageData.filter(t => t.rows.length > 0).sort((a, b) => b.rows.length - a.rows.length)[0];
  if (!bestTable || bestTable.rows.length === 0) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: no data table found`);
    return [];
  }

  // Locate columns by (lower-cased) header text
  const headers = bestTable.headers.map(h => h.toLowerCase());
  log(` Fahrgenehmigungen headers: [${headers.join(', ')}]`);
  let klasseIdx = headers.findIndex(h => h.includes('klasse') || h.includes('fahrgenehmigung'));
  let ausstellungIdx = headers.findIndex(h => h.includes('ausstellung'));
  const gueltigIdx = headers.findIndex(h => h.includes('gültig') || h.includes('gultig') || h.includes('ablauf'));
  const behoerdeIdx = headers.findIndex(h => h.includes('behörde') || h.includes('behorde'));
  const nummerIdx = headers.findIndex(h => h.includes('nummer') || h.includes('nr'));

  const KNOWN_KLASSEN = new Set([
    'AM', 'A1', 'A2', 'A', 'B', 'BE', 'C1', 'C1E', 'C', 'CE',
    'D1', 'D1E', 'D', 'DE', 'F', 'L', 'L17', 'B+E', 'C+E', 'D+E',
  ]);
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;

  // No Klasse header: inspect the first three rows for a value that looks like a licence class
  if (klasseIdx === -1) {
    for (const row of bestTable.rows.slice(0, 3)) {
      for (let ci = 0; ci < row.cells.length; ci++) {
        const val = (row.cells[ci] ?? '').trim();
        // Match known klassen or values containing "Führerschein" etc.
        if (KNOWN_KLASSEN.has(val.toUpperCase()) || /führerschein|lenkberechtigung/i.test(val)) {
          klasseIdx = ci;
          log(` Fahrgenehmigungen: found Klasse in column ${ci} by data inspection`);
          break;
        }
      }
      if (klasseIdx >= 0) break;
    }
  }

  // No Ausstellung header: take the first column in the first three rows that holds a date
  if (ausstellungIdx === -1) {
    for (const row of bestTable.rows.slice(0, 3)) {
      for (let ci = 0; ci < row.cells.length; ci++) {
        // Never reuse a column already assigned to Klasse or "gültig bis";
        // otherwise issue date and expiry date would be read from the same cell.
        if (ci === klasseIdx || ci === gueltigIdx) continue;
        if (datePattern.test((row.cells[ci] ?? '').trim())) {
          ausstellungIdx = ci;
          break;
        }
      }
      if (ausstellungIdx >= 0) break;
    }
  }
  log(` Fahrgenehmigungen column map: klasse=${klasseIdx} ausstellung=${ausstellungIdx} gueltig=${gueltigIdx} behoerde=${behoerdeIdx} nummer=${nummerIdx}`);

  if (klasseIdx === -1) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: could not determine Klasse column. Returning empty.`);
    await dumpHtml(frame, `fahrgenehmigungen_fallback_StNr${standesbuchNr}`);
    return [];
  }

  const results: FdiskFahrgenehmigung[] = [];
  for (const row of bestTable.rows) {
    const klasse = cellText(row.cells[klasseIdx]);
    if (!klasse) continue;
    // Skip header-like / navigation rows and rows whose "Klasse" cell is actually a date
    if (/klasse|fahrgenehmigung|ausstellung|datensätze|information|tiefennavigation/i.test(klasse)) continue;
    if (datePattern.test(klasse)) continue;
    const ausstellungsdatum = parseDate(ausstellungIdx >= 0 ? row.cells[ausstellungIdx] : undefined);
    const syncKey = `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`;
    results.push({
      standesbuchNr,
      ausstellungsdatum,
      gueltigBis: parseDate(gueltigIdx >= 0 ? row.cells[gueltigIdx] : undefined),
      behoerde: cellText(behoerdeIdx >= 0 ? row.cells[behoerdeIdx] : undefined),
      nummer: cellText(nummerIdx >= 0 ? row.cells[nummerIdx] : undefined),
      klasse,
      syncKey,
    });
  }
  log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const f of results) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
  return results;
}
/**
 * Legacy export kept for compatibility — delegates to the new unified flow.
 *
 * Opens the member's detail page, reads `id_mitgliedschaften` and
 * `id_personen` from the detail URL's query string (either may be `null`
 * when absent) and hands over to scrapeAusbildungenFromDetailPage.
 *
 * @param frame  Frame used for navigation.
 * @param member Member whose `detailUrl` is visited.
 * @returns The result of scrapeAusbildungenFromDetailPage, or an empty array
 *          when the member has no `detailUrl`.
 */
export async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  const { detailUrl } = member;
  if (!detailUrl) return [];

  await frame_goto(frame, detailUrl);

  // Resolve against the frame's URL *after* navigation so relative detail URLs work.
  const { searchParams } = new URL(detailUrl, frame.url());
  return scrapeAusbildungenFromDetailPage(
    frame,
    member,
    searchParams.get('id_mitgliedschaften'),
    searchParams.get('id_personen'),
  );
}