Files
dashboard/sync/src/scraper.ts
2026-04-18 18:15:40 +02:00

1385 lines
55 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chromium, Page, Frame } from '@playwright/test';
import {
FdiskMember,
FdiskAusbildung,
FdiskBefoerderung,
FdiskUntersuchung,
FdiskFahrgenehmigung,
} from './types';
// Root of the FDISK web application; set FDISK_BASE_URL to point the scraper at another host.
const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';
// Presumably the FDISK ID of the fire brigade ("Feuerwehr"); overridable via FDISK_ID_FEUERWEHREN.
// NOTE(review): no use of this constant is visible in this part of the file — confirm it is still needed.
const ID_FEUERWEHREN = process.env.FDISK_ID_FEUERWEHREN ?? '164';
// Fallback for the `id_instanzen` query parameter when a member detail URL does not carry one;
// overridable via FDISK_ID_INSTANZEN.
const ID_INSTANZEN = process.env.FDISK_ID_INSTANZEN ?? '2853';
// Page the scraper opens first in order to sign in.
const LOGIN_URL = `${BASE_URL}/fdisk/module/vws/logins/logins.aspx`;
// Member list page that the main frame is navigated to before scraping.
const MEMBERS_URL = `${BASE_URL}/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx`;
/**
 * Translate the status text shown by FDISK into the dashboard's status code.
 *
 * Surrounding whitespace is ignored; the comparison itself is case-sensitive.
 * Any status other than the four synced ones yields null, which callers use
 * as the signal to skip that member.
 */
function mapFdiskStatus(raw: string): 'aktiv' | 'kind' | 'jugend' | 'reserve' | null {
  const label = raw.trim();
  if (label === 'Aktiv') return 'aktiv';
  if (label === 'Kind') return 'kind';
  if (label === 'Jugend') return 'jugend';
  if (label === 'Reserve') return 'reserve';
  return null;
}
/** Print one line to stdout in the form `[scraper] <ISO timestamp> <message>`. */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log(['[scraper]', timestamp, msg].join(' '));
}
/**
 * Convert a FDISK date string to an ISO calendar date (`YYYY-MM-DD`).
 *
 * Accepted input is `D.M.YYYY` / `DD.MM.YYYY`, optionally followed by more text
 * such as a time of day (e.g. `"10.9.2011 00:00:00"`); only the date part is used.
 *
 * @param raw - Text as delivered by FDISK; may be null, undefined or blank.
 * @returns The ISO date, or null when the input is empty, does not start with a
 *          dotted date, or names a day that does not exist (e.g. `31.02.2020`).
 */
function parseDate(raw: string | null | undefined): string | null {
  if (!raw) return null;
  const trimmed = raw.trim();
  if (!trimmed) return null;
  // 1–2 digit day and month, 4-digit year. The lookahead rejects a longer digit
  // run so that "1.1.20111" is not silently read as the year 2011.
  const match = trimmed.match(/^(\d{1,2})\.(\d{1,2})\.(\d{4})(?!\d)/);
  if (!match) return null;
  const [, dayText, monthText, yearText] = match;
  const day = Number(dayText);
  const month = Number(monthText);
  const year = Number(yearText);
  // Round-trip through a UTC date: out-of-range parts (day 32, month 13,
  // 29 Feb in a non-leap year) roll over and no longer match what was parsed.
  // setUTCFullYear is used instead of Date.UTC because the latter remaps years 0-99.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  const isRealDay =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day;
  if (!isRealDay) return null;
  return `${yearText}-${monthText.padStart(2, '0')}-${dayText.padStart(2, '0')}`;
}
/**
 * Normalise raw cell text: strip surrounding whitespace and report
 * missing or blank input as null.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) return null;
  const stripped = text.trim();
  return stripped.length > 0 ? stripped : null;
}
/**
 * Fetch only the members we care about, rather than scraping the full member list.
 *
 * Phase 1 submits the list's `frmsearch` form once per entry of `knownStNrs`,
 * with the "from" and "to" Standesbuchnummer fields both set to that number.
 * Phase 2 runs only when `knownNames` is non-empty: it clears the number filter,
 * submits once and keeps those rows of the resulting page whose lower-cased
 * `vorname::zuname` key is contained in `knownNames` (first-time linking by name).
 *
 * Rows are de-duplicated by Standesbuchnummer. Rows without a number or name, and
 * rows whose status `mapFdiskStatus` maps to null, are dropped from the result.
 *
 * @param frame - Frame that currently shows the member list including its search form.
 * @param knownStNrs - Standesbuchnummern to look up one by one.
 * @param knownNames - Keys of the form `vorname::zuname`, all lower case.
 * @returns The matching members; profile fields that only exist on the detail
 *          page (geburtsort, geschlecht, beruf, wohnort, plz) are set to null.
 */
async function scrapeKnownMembers(
  frame: Frame,
  knownStNrs: Set<string>,
  knownNames: Set<string>,
): Promise<FdiskMember[]> {
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
  const seenStNrs = new Set<string>();
  const allRows: ParsedRow[] = [];

  // --- Phase 1: fetch by exact StNr ---
  log(`scrapeKnownMembers: fetching ${knownStNrs.size} known StNrs`);
  for (const stNr of knownStNrs) {
    const formOk = await frame.evaluate((sn) => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (!fromFld || !toFld) return false;
      fromFld.value = sn;
      toFld.value = sn;
      return true;
    }, stNr);
    if (!formOk) {
      log(`  WARN: search form not usable for StNr ${stNr}`);
      continue;
    }
    // Start waiting for the navigation before triggering the submit that causes it.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
    const rows = await parseRowsFromTable(frame);
    for (const r of rows) {
      if (r.standesbuchNr && !seenStNrs.has(r.standesbuchNr)) {
        seenStNrs.add(r.standesbuchNr);
        allRows.push(r);
      }
    }
    log(`  StNr ${stNr}: ${rows.length} row(s)`);
    // Be gentle on the server
    await frame.page().waitForTimeout(300);
  }

  // --- Phase 2: single unfiltered fetch for name-matching ---
  if (knownNames.size > 0) {
    log(`scrapeKnownMembers: unfiltered fetch for ${knownNames.size} name-based matches`);
    // Clear the StNr filter; report whether there is a form to submit at all.
    const formOk = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (fromFld) fromFld.value = '';
      if (toFld) toFld.value = '';
      return true;
    });
    if (!formOk) {
      // Same best-effort handling as Phase 1: without a form, submitting would
      // only fail with an in-page TypeError.
      log('  WARN: search form not found, skipping name-based fetch');
    } else {
      await Promise.all([
        frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
        frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
      ]);
      const rows = await parseRowsFromTable(frame);
      let matched = 0;
      for (const r of rows) {
        if (!r.standesbuchNr || seenStNrs.has(r.standesbuchNr)) continue;
        const nameKey = `${(r.vorname || '').toLowerCase()}::${(r.zuname || '').toLowerCase()}`;
        if (knownNames.has(nameKey)) {
          seenStNrs.add(r.standesbuchNr);
          allRows.push(r);
          matched++;
        }
      }
      log(`  Unfiltered page: ${rows.length} total rows, ${matched} name-matched`);
    }
  }
  log(`scrapeKnownMembers: ${allRows.length} members collected`);

  // Build FdiskMember objects
  const members: FdiskMember[] = [];
  for (const row of allRows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const status = mapFdiskStatus(row.status);
    if (!status) continue; // skip members with non-synced statuses
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      status,
      detailUrl: row.href,
      // Filled in later from the member detail page, if at all.
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
/**
 * Log in to FDISK, scrape the full member list and, for every member, the
 * detail-page profile fields plus the Ausbildungen, Beförderungen,
 * Untersuchungen and Fahrgenehmigungen sub-pages.
 *
 * Scraping is best-effort per member and per sub-section: a failure is logged
 * as a WARN line and the run continues with the next section or member.
 * Failures in login or in loading the member list are not caught here and reject
 * the returned promise. The browser is always closed.
 *
 * @param username - FDISK login name.
 * @param password - FDISK password.
 * @returns All scraped members (mutated in place with their profile fields)
 *          and the flat lists of sub-section records across all members.
 */
export async function scrapeAll(username: string, password: string): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
  befoerderungen: FdiskBefoerderung[];
  untersuchungen: FdiskUntersuchung[];
  fahrgenehmigungen: FdiskFahrgenehmigung[];
}> {
  // Render any thrown value as a short message for log lines.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });
  const context = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  });
  const page = await context.newPage();
  try {
    await login(page, username, password);
    // After login, page is on Start.aspx (frameset).
    // Direct navigation to MitgliedschaftenList.aspx causes a server BLError because
    // the server reads the org context from session variables set by the menu.
    // Navigate via the menu frame (left.aspx) to set session state properly.
    const mainFrame = await navigateToMemberList(page);
    const members = await scrapeMembers(mainFrame);
    log(`Found ${members.length} members (full scrape)`);

    const ausbildungen: FdiskAusbildung[] = [];
    const befoerderungen: FdiskBefoerderung[] = [];
    const untersuchungen: FdiskUntersuchung[] = [];
    const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];

    for (const member of members) {
      const who = `${member.vorname} ${member.zuname}`;

      // Run one sub-section scrape in isolation: a failure is logged and reported
      // as null so that the remaining sections of this member are still scraped.
      const scrapeSection = async <T>(label: string, run: () => Promise<T[]>): Promise<T[] | null> => {
        try {
          return await run();
        } catch (err: unknown) {
          log(`  WARN: ${label} scrape failed for ${who} (StNr ${member.standesbuchNr}): ${describeError(err)}`);
          return null;
        }
      };

      try {
        // Navigate to member detail page — use direct URL if available, else search+click fallback
        const onDetail = member.detailUrl
          ? (await frame_goto(mainFrame, member.detailUrl), true)
          : await navigateToMemberDetailBySearch(mainFrame, member.standesbuchNr);
        if (!onDetail) {
          log(`  SKIP ${who} (${member.standesbuchNr}): could not reach detail page`);
          continue;
        }

        // Scrape extra profile fields from the detail form
        const profileFields = await scrapeDetailProfileFields(mainFrame);
        member.geburtsort = profileFields.geburtsort;
        member.geschlecht = profileFields.geschlecht;
        member.beruf = profileFields.beruf;
        member.wohnort = profileFields.wohnort;
        member.plz = profileFields.plz;

        // Extract mitgliedschaft + person params from the current URL for constructing sub-section URLs.
        // PersonenForm.aspx is in the personen module; sub-sections are each in their own module.
        // URL pattern: ?search=1&searchid_mitgliedschaften=X&id_personen=Y&id_mitgliedschaften=X&searchid_personen=Y&searchid_maskmode=
        const urlObj = new URL(mainFrame.url());
        const idMitgliedschaft = urlObj.searchParams.get('id_mitgliedschaften');
        const idPersonen = urlObj.searchParams.get('id_personen');
        const idInstanzen = urlObj.searchParams.get('id_instanzen') ?? ID_INSTANZEN;

        let befos: FdiskBefoerderung[] = [];
        let unters: FdiskUntersuchung[] = [];
        let fahrg: FdiskFahrgenehmigung[] = [];

        if (idMitgliedschaft && idPersonen) {
          // Ausbildungen
          const quals = await scrapeSection('Ausbildungen', () =>
            scrapeAusbildungenFromDetailPage(mainFrame, member, idMitgliedschaft, idPersonen));
          if (quals) {
            ausbildungen.push(...quals);
            log(`  ${who}: ${quals.length} Ausbildungen`);
          }
          // Beförderungen
          befos = (await scrapeSection('Beförderungen', () =>
            scrapeMemberBefoerderungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen))) ?? [];
          // Untersuchungen
          unters = (await scrapeSection('Untersuchungen', () =>
            scrapeMemberUntersuchungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen))) ?? [];
          // Fahrgenehmigungen
          fahrg = (await scrapeSection('Fahrgenehmigungen', () =>
            scrapeMemberFahrgenehmigungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen, idInstanzen))) ?? [];
        } else {
          // Without both IDs none of the sub-section URLs can be built.
          log(`  WARN: detail URL for ${who} (StNr ${member.standesbuchNr}) lacks id_mitgliedschaften/id_personen, sub-sections skipped`);
        }

        befoerderungen.push(...befos);
        untersuchungen.push(...unters);
        fahrgenehmigungen.push(...fahrg);
        log(`  ${who}: ${befos.length} Beförderungen, ${unters.length} Untersuchungen, ${fahrg.length} Fahrgenehmigungen`);
        // Short pause between members to go easy on the server.
        await page.waitForTimeout(500);
      } catch (err: unknown) {
        log(`  WARN: could not scrape detail for ${who}: ${err}`);
      }
    }
    return { members, ausbildungen, befoerderungen, untersuchungen, fahrgenehmigungen };
  } finally {
    await browser.close();
  }
}
/**
 * Navigate `frame` to `url`, waiting with Playwright's `networkidle` option.
 * Thin wrapper so call sites do not repeat the wait option; whatever
 * `frame.goto` resolves to is discarded, and its rejections propagate.
 */
async function frame_goto(frame: Frame, url: string): Promise<void> {
await frame.goto(url, { waitUntil: 'networkidle' });
}
/**
 * Switch the `anzeige_count` dropdown to "ALLE" so the page lists every row,
 * then wait for the page to settle. Does nothing when the dropdown is absent or
 * already set; any error along the way is ignored (best effort).
 */
async function selectAlleAnzeige(frame: Frame): Promise<void> {
  const showAll = 'ALLE';
  try {
    const dropdown = frame.locator('select[name="anzeige_count"], select#anzeige_count');
    const found = await dropdown.count();
    if (found === 0) return;

    let selected: string;
    try {
      selected = await dropdown.inputValue();
    } catch {
      selected = '';
    }

    if (selected !== showAll) {
      await dropdown.selectOption(showAll);
      try {
        await frame.waitForLoadState('networkidle');
      } catch {
        // a failed wait is not fatal
      }
    }
  } catch {
    // Not every page has the dropdown; that is fine.
  }
}
/**
 * Sign in to FDISK through the login form.
 *
 * Opens LOGIN_URL; if the browser ends up on a URL without "login" in it, the
 * session is treated as already authenticated and nothing else happens.
 * Otherwise the credentials are entered and submitted.
 *
 * @throws Error when the page URL still contains "login" after submitting.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const isLoginAddress = (address: string): boolean =>
    address.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // An existing session redirects away from the login page.
  const startAddress = page.url();
  if (!isLoginAddress(startAddress)) {
    log(`Already logged in, on: ${startAddress}`);
    return;
  }

  // Element ids taken from the known login form markup.
  const userInput = page.locator('#login');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await page.locator('#password').fill(password);
  await page.locator('#Submit2').click();

  // Give the redirect up to 15s; a timeout is judged by the URL check below.
  try {
    await page.waitForURL(
      (url) => !isLoginAddress(url.toString()),
      { waitUntil: 'networkidle', timeout: 15000 },
    );
  } catch {
    // timed out, handled below
  }

  const finalAddress = page.url();
  if (isLoginAddress(finalAddress)) {
    throw new Error(`Login failed — still on login page: ${finalAddress}`);
  }
  log(`Logged in successfully, redirected to: ${finalAddress}`);
}
/**
 * Fallback navigation to a member's detail page when no direct URL is available.
 *
 * Loads the member list, sets both Standesbuchnummer search fields to
 * `standesbuchNr`, submits the search form and clicks the first result row.
 *
 * @param frame - Frame to navigate; it ends up on whatever page the click leads to.
 * @param standesbuchNr - Number to search for (used as both range bounds).
 * @returns true when, after the click, the frame URL contains neither
 *          "MitgliedschaftenList" nor "meine_Mitglieder"; false when the search
 *          form is unusable, no result row exists, or the frame stayed on the list.
 *          Navigation errors from loading the list or submitting the search are not caught.
 */
async function navigateToMemberDetailBySearch(frame: Frame, standesbuchNr: string): Promise<boolean> {
// Navigate to the member list
await frame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
await frame.waitForLoadState('networkidle');
// Set exact standesbuchNr filter in the search form
const formOk = await frame.evaluate((stNr) => {
const form = (document as any).forms['frmsearch'];
if (!form) return false;
const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
if (!fromFld || !toFld) return false;
// Identical lower and upper bound restricts the result to this one number.
fromFld.value = stNr;
toFld.value = stNr;
return true;
}, standesbuchNr);
if (!formOk) {
log(` WARN navigateToMemberDetailBySearch: search form not usable for StNr ${standesbuchNr}`);
return false;
}
// Begin waiting for the navigation before the submit that triggers it.
await Promise.all([
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
]);
// Click on the first data row — FDISK rows navigate to the detail page on click.
// The selector takes the first anchor or cell found in the table's first body row.
const firstRowLink = await frame.$('table.FdcLayList tbody tr:first-child a, table.FdcLayList tbody tr:first-child td');
if (!firstRowLink) {
log(` WARN navigateToMemberDetailBySearch: no result row for StNr ${standesbuchNr}`);
return false;
}
try {
await Promise.all([
frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }),
firstRowLink.click(),
]);
} catch {
// waitForNavigation may time out if click didn't navigate (e.g. onclick vs href).
// Deliberately ignored: the URL check below decides whether the click worked.
}
const url = frame.url();
const onDetailPage = !url.includes('MitgliedschaftenList') && !url.includes('meine_Mitglieder');
if (onDetailPage) {
log(` Navigated to detail via search+click: ${url}`);
} else {
log(` WARN navigateToMemberDetailBySearch: still on list page after click for StNr ${standesbuchNr}`);
}
return onDetailPage;
}
/**
 * Load the member list inside the frameset's `mainFrame` and hand that frame back.
 *
 * @throws Error when the page has no frame named `mainFrame`, or when the frame
 *         lands on a URL that looks like an FDISK error page.
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const contentFrame = page.frame({ name: 'mainFrame' });
  if (!contentFrame) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await contentFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await contentFrame.waitForLoadState('networkidle');

  const landedOn = contentFrame.url();
  const pageTitle = await contentFrame.title();
  log(`mainFrame loaded: ${landedOn} — title: "${pageTitle}"`);

  // URL fragments that indicate an error page instead of the list.
  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  const isErrorPage = errorMarkers.some((marker) => landedOn.includes(marker));
  if (isErrorPage) {
    throw new Error(`Member list returned error page: ${landedOn}`);
  }
  return contentFrame;
}
/**
 * Scrape the complete member list shown in `frame`.
 *
 * Steps:
 *  1. If the `frmsearch` form exists, clear every Standesbuchnummer field, try to
 *     maximise the page size, and submit.
 *  2. Parse the resulting table and read the "X-Y von Z" pagination text.
 *  3. If fewer rows are shown than the total, query Standesbuchnummer ranges of
 *     15 (1-15, 16-30, ...) until all members are collected, the upper bound
 *     9999 is passed, or five consecutive ranges add nothing new.
 *
 * Rows without number or name, and rows whose status `mapFdiskStatus` maps to
 * null, are dropped from the result.
 *
 * @param frame - Frame currently showing the member list.
 * @returns Members with list-level data; detail-only profile fields are null.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);
  // Clear the Standesbuchnummer filter if the search form is present.
  // FDISK pre-fills the logged-in user's own Standesbuchnummer, which limits results to 1 member.
  // We clear it before submitting so all members of the fire station are returned.
  const hasForm = await frame.$('form[name="frmsearch"]') !== null;
  if (hasForm) {
    const fieldDump = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return { cleared: [], pageSizeSet: null as string | null, allFields: [] };
      const cleared: string[] = [];
      const allFields: string[] = [];
      let pageSizeSet: string | null = null;
      for (const el of Array.from(form.elements) as HTMLInputElement[]) {
        if (el.type === 'hidden') continue;
        const name = (el.name ?? '').toLowerCase();
        const id = (el.id ?? '').toLowerCase();
        // Record pre-filled fields (before clearing) for the diagnostic log.
        if (el.value) allFields.push(`${el.name || el.id}=${el.value}`);
        if (name.includes('standesbuch') || id.includes('standesbuch')) {
          el.value = '';
          cleared.push(el.name || el.id);
        }
        // Maximize page size: look for a select AND its paired hidden input
        // FDISK uses a custom Dd widget where <select name="xDd_dd"> is the visible dropdown
        // but the actual POST value comes from <input type="hidden" name="xDd_id"> or similar.
        if ((name.includes('anzahl') || id.includes('anzahl') ||
             name.includes('pagesize') || id.includes('pagesize') ||
             name.includes('rows') || id.includes('rows')) &&
            el.tagName === 'SELECT') {
          const select = el as unknown as HTMLSelectElement;
          // Pick the largest numeric option value, or the last option as fallback
          let bestOption: HTMLOptionElement | null = null;
          let bestVal = -1;
          for (const opt of Array.from(select.options)) {
            const n = parseInt(opt.value, 10);
            if (!isNaN(n) && n > bestVal) { bestVal = n; bestOption = opt; }
          }
          if (!bestOption && select.options.length > 0) {
            bestOption = select.options[select.options.length - 1];
          }
          if (bestOption) {
            select.value = bestOption.value;
            pageSizeSet = `${el.name || el.id}=${bestOption.value}`;
            // Also update the paired hidden field used by the Dd custom widget.
            // Common patterns: xDd_dd → xDd_id or xDd_hd
            const baseName = (el.name || el.id).replace(/_dd$/i, '');
            for (const suffix of ['_id', '_hd', '_val']) {
              const hidden = form.elements[baseName + suffix] as HTMLInputElement | undefined;
              if (hidden && hidden.type === 'hidden') {
                hidden.value = bestOption.value;
                pageSizeSet += ` (also set ${baseName + suffix})`;
              }
            }
          }
        }
      }
      return { cleared, pageSizeSet, allFields };
    });
    if (fieldDump.allFields.length > 0) {
      log(`Search form active filters before clear: ${fieldDump.allFields.join(', ')}`);
    }
    if (fieldDump.cleared.length > 0) {
      log(`Cleared Standesbuchnummer filter fields: ${fieldDump.cleared.join(', ')}`);
    } else {
      log('Search form found — no Standesbuchnummer field detected, submitting as-is');
    }
    if (fieldDump.pageSizeSet) {
      log(`Set page size: ${fieldDump.pageSizeSet}`);
    } else {
      log('No page size field found — will paginate through all results');
    }
    // Use Promise.all to start waiting for navigation BEFORE triggering the submit,
    // otherwise waitForLoadState resolves against the already-idle current page.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
    log(`After form submit: ${frame.url()}`);
  }

  // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
  const firstRows = await parseRowsFromTable(frame);
  log(`Initial fetch: ${firstRows.length} rows`);
  // Log href debug info for the first row to diagnose URL extraction
  // (parseRowsFromTable leaves it on the page's window object).
  const rowDebug = await frame.evaluate(() => (window as any).__fdiskFirstRowDebug ?? 'no debug info');
  log(`Row href debug: ${rowDebug}`);
  for (const row of firstRows) {
    log(`  Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
  }
  const pagination = await frame.evaluate(() =>
    document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
  );
  log(`Pagination: "${pagination}"`);
  // Expected shape: "<first>-<last> von <total>"
  const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
  const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
  const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;
  const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
  const allRows: ParsedRow[] = [...firstRows];

  // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
  if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
    log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
    const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
    const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
    let startNr = 1;
    let consecutiveEmpty = 0;
    while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
      const endNr = startNr + BATCH - 1;
      // Set StNr range in the search form and submit
      const formOk = await frame.evaluate(({ s, e }: { s: number; e: number }) => {
        const form = (document as any).forms['frmsearch'];
        if (!form) return false;
        const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
        const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
        if (!fromFld || !toFld) return false;
        fromFld.value = String(s);
        toFld.value = String(e);
        return true;
      }, { s: startNr, e: endNr });
      if (!formOk) {
        log('WARN: could not set StNr range fields — aborting range queries');
        break;
      }
      await Promise.all([
        frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
        frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
      ]);
      const rangeRows = await parseRowsFromTable(frame);
      const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
      newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
      allRows.push(...newRows);
      // Separator between the bounds so the range reads "1-15", not "115".
      log(`StNr ${startNr}-${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
      for (const row of newRows) {
        log(`  Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
      }
      consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
      startNr = endNr + 1;
    }
    log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
  }
  log(`Parsed ${allRows.length} raw rows total`);

  const members: FdiskMember[] = [];
  for (const row of allRows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const status = mapFdiskStatus(row.status);
    if (!status) continue; // skip members with non-synced statuses
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      status,
      detailUrl: row.href,
      // Filled in later from the member detail page.
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
/**
 * Read every `table.FdcLayList tbody tr` row of the frame's current document
 * into a plain object of cell texts plus the row's detail-page URL.
 *
 * All cell values are strings (possibly empty); `href` is the detail URL or null
 * when none could be found. As a side effect the page's
 * `window.__fdiskFirstRowDebug` is set to a diagnostic string when the first row
 * has a non-empty St.-Nr. cell.
 */
async function parseRowsFromTable(frame: Frame) {
// Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
// 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum, 9=Abmeldedatum, 10=icon
// Each <td> contains an <a title="value"> — the title is the clean cell text.
// Navigation may be via href or onclick handlers (FDISK uses both depending on version).
return frame.$$eval('table.FdcLayList tbody tr', (trs) =>
trs.map((tr, rowIdx) => {
const cells = Array.from(tr.querySelectorAll('td'));
// Text of cell i: the anchor's title attribute if non-empty, else the cell's text content.
const val = (i: number) => {
const a = cells[i]?.querySelector('a');
const title = a?.getAttribute('title')?.trim();
// Use title only if non-empty; otherwise fall back to textContent
return (title || cells[i]?.textContent || '').trim();
};
// Extract detail URL — try multiple strategies:
// 1. Standard <a href="..."> with a real target (not empty, "#" or "javascript:")
// 2. onclick attribute on <a>, <td>, or <tr> containing an .aspx URL
let href: string | null = null;
let debugInfo = '';
for (const a of Array.from(tr.querySelectorAll('a'))) {
const rawHref = (a as Element).getAttribute('href') ?? '';
debugInfo += `a.href="${rawHref}" `;
if (rawHref && rawHref !== '#' && rawHref !== '' && !rawHref.startsWith('javascript:')) {
href = (a as HTMLAnchorElement).href; // resolves relative → absolute
break;
}
}
if (!href) {
// Scan onclick on the row itself and on its anchors and cells for a quoted .aspx URL
const candidates: Element[] = [tr, ...Array.from(tr.querySelectorAll('a, td'))];
for (const el of candidates) {
const onclick = el.getAttribute('onclick') ?? '';
if (onclick) debugInfo += `onclick="${onclick}" `;
const match = onclick.match(/['"]([^'"]*\.aspx[^'"]*)['"]/);
if (match) {
try {
// Resolve against the current page so relative URLs become absolute.
href = new URL(match[1], (window as Window).location.href).href;
} catch {
// Not resolvable as a URL: keep the raw string from the handler.
href = match[1];
}
break;
}
}
}
// Log debug info for first data row to help diagnose href extraction issues
if (rowIdx === 0 && val(2)) {
(window as any).__fdiskFirstRowDebug = `StNr=${val(2)} href=${href} debug=${debugInfo}`;
}
return {
status: val(1),
standesbuchNr: val(2),
dienstgrad: val(3),
vorname: val(4),
zuname: val(5),
geburtsdatum: val(6),
svnr: val(7),
eintrittsdatum: val(8),
abmeldedatum: val(9),
href,
};
}),
);
}
/**
 * Read the extra profile fields from the member detail form.
 * Expects the frame to already show the member's detail page.
 *
 * Each field is looked up through a list of candidate selectors; the first
 * candidate that yields a non-blank value wins, otherwise the field is null.
 */
async function scrapeDetailProfileFields(frame: Frame): Promise<{
  geburtsort: string | null;
  geschlecht: string | null;
  beruf: string | null;
  wohnort: string | null;
  plz: string | null;
}> {
  return frame.evaluate(() => {
    // Value of the first element matching `selector`: the selected option's
    // text (or value) for a <select>, the `value` for anything else.
    const readControl = (selector: string): string | null => {
      const node = document.querySelector(selector) as HTMLInputElement | HTMLSelectElement | null;
      if (!node) return null;
      if (node.tagName !== 'SELECT') {
        return (node as HTMLInputElement).value?.trim() || null;
      }
      const dropdown = node as HTMLSelectElement;
      const chosen = dropdown.options[dropdown.selectedIndex];
      if (!chosen) return null;
      return (chosen.text || chosen.value || '').trim() || null;
    };

    // Try the selectors in order and stop at the first one that has a value.
    const firstOf = (...selectors: string[]): string | null => {
      for (const selector of selectors) {
        const found = readControl(selector);
        if (found !== null) return found;
      }
      return null;
    };

    return {
      geburtsort: firstOf('input[name="geburtsort"]', 'input[id*="geburtsort"]'),
      geschlecht: firstOf('select[name*="geschlecht"]', 'select[id*="geschlecht"]'),
      beruf: firstOf('input[name="beruf"]', 'input[id*="beruf"]'),
      wohnort: firstOf('input[name="ort"]', 'input[id*="_ort"]', 'input[name="wohnort"]'),
      plz: firstOf('input[name="plz"]', 'input[id*="plz"]'),
    };
  });
}
/**
 * Scrape Kurse (courses) of one member from KursteilnehmerListEdit.aspx.
 *
 * The page exposes each course as indexed form inputs (`kursart_bez_N`,
 * `kursnummer_N`, `datum_von_N`, `leistungsart_N`), which are read directly
 * instead of guessing table columns.
 *
 * @param frame - Frame to navigate; it is left on the Kurse page.
 * @param member - Member whose Standesbuchnummer is stamped on every record.
 * @param idMitgliedschaft - `id_mitgliedschaften` of the member.
 * @param idPersonen - `id_personen` of the member.
 * @returns One record per course with a non-empty name. Returns [] when an ID is
 *          missing, when FDISK shows an error page, or when reading the form
 *          fields fails (the failure is logged). Navigation errors propagate.
 */
async function scrapeAusbildungenFromDetailPage(
  frame: Frame,
  member: FdiskMember,
  idMitgliedschaft?: string | null,
  idPersonen?: string | null,
): Promise<FdiskAusbildung[]> {
  type RawKursRow = {
    kursname: string;
    kursnummer: string | null;
    kurzbezeichnung: string | null;
    erfolgscode: string | null;
    kursDatum: string | null;
  };

  if (!idMitgliedschaft || !idPersonen) {
    log(`  Kurse for StNr ${member.standesbuchNr}: missing mitgliedschaft/personen IDs, skipping`);
    return [];
  }
  const url = `${BASE_URL}/fdisk/module/mgvw/kursteilnehmer/KursteilnehmerListEdit.aspx`
    + `?search=1&searchid_personen=${idPersonen}&searchid_mitgliedschaften=${idMitgliedschaft}`
    + `&id_personen=${idPersonen}&id_mitgliedschaften=${idMitgliedschaft}`
    + `&anzeige_count=ALLE`;
  await frame_goto(frame, url);
  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(`  → Kurse ERROR page: ${landed}`);
    return [];
  }
  // Ensure all rows are visible (the URL param should already set this, but belt-and-suspenders)
  await selectAlleAnzeige(frame);

  // Read the indexed form fields inside the page.
  let rawRows: RawKursRow[] = [];
  try {
    rawRows = await frame.evaluate(() => {
      const rows: Array<{
        kursname: string;
        kursnummer: string | null;
        kurzbezeichnung: string | null;
        erfolgscode: string | null;
        kursDatum: string | null;
      }> = [];
      for (let i = 0; i < 500; i++) {
        // kursart_bez is the sentinel — if it doesn't exist, we've passed all rows
        const kursartBezEl = document.querySelector(`input[name="kursart_bez_${i}"]`) as HTMLInputElement | null;
        if (!kursartBezEl) break;
        const kursname = kursartBezEl.value?.trim() || '';
        if (!kursname) continue;
        const kursnummerEl = document.querySelector(`input[name="kursnummer_${i}"]`) as HTMLInputElement | null;
        const datumVonEl = document.querySelector(`input[name="datum_von_${i}"]`) as HTMLInputElement | null;
        const leistungsartEl = document.querySelector(`input[name="leistungsart_${i}"]`) as HTMLInputElement | null;
        const kursnummer = kursnummerEl?.value?.trim() || null;
        // datum_von format: "D.M.YYYY HH:MM:SS" — passed on raw and parsed after the evaluate call
        const kursDatum = datumVonEl?.value?.trim() || null;
        const erfolgscode = leistungsartEl?.value?.trim() || null;
        // Kurzbezeichnung: extract from <nobr> in the same table row as the kursnummer input
        let kurzbezeichnung: string | null = null;
        const row = kursnummerEl?.closest('tr');
        if (row) {
          const nobrs = row.querySelectorAll('nobr');
          // First <nobr> is kurzbezeichnung, second is kursname
          if (nobrs.length >= 1) {
            kurzbezeichnung = nobrs[0].textContent?.replace(/\u00A0/g, ' ').trim() || null;
          }
        }
        rows.push({ kursname, kursnummer, kurzbezeichnung, erfolgscode, kursDatum });
      }
      return rows;
    });
  } catch (err: unknown) {
    // Still best-effort, but say why: otherwise a failure looks like "0 rows".
    const reason = err instanceof Error ? err.message : String(err);
    log(`  → WARN: Kurse form-field extraction failed for StNr ${member.standesbuchNr}: ${reason}`);
  }
  log(`  → Kurse form-field extraction: ${rawRows.length} rows found`);

  // Post-process: parse dates and build syncKeys with ISO dates
  return rawRows.map((a): FdiskAusbildung => {
    const kursDatum = parseDate(a.kursDatum);
    return {
      standesbuchNr: member.standesbuchNr,
      kursname: a.kursname,
      kursnummer: a.kursnummer,
      kurzbezeichnung: a.kurzbezeichnung,
      erfolgscode: a.erfolgscode,
      kursDatum,
      ablaufdatum: null,
      ort: null,
      bemerkung: null,
      syncKey: `${member.standesbuchNr}::${a.kursname}::${kursDatum ?? ''}`,
    };
  });
}
/**
 * Navigate to a sub-section URL and extract its data rows.
 *
 * Every table on the page is read cell by cell (text inputs and selects are read
 * by value, list cells by their anchor title). Rows of tables whose class
 * contains `FdcLayList` are preferred; if there are none, all rows are used.
 * The column holding the most `DD.MM.YYYY` values is taken as the date column,
 * and only rows with a date in that column are returned.
 *
 * The landing URL's table inventory is logged so wrong-page issues are visible.
 *
 * @param frame - Frame to navigate.
 * @param url - Absolute URL of the sub-section page.
 * @returns The data rows and the date column index (-1 with no rows when no
 *          date column exists), or null when FDISK shows an error page.
 */
async function navigateAndGetTableRows(
  frame: Frame,
  url: string,
): Promise<{ rows: Array<{ cells: string[] }>; dateColIdx: number } | null> {
  type RawRow = { cells: string[]; tableClass: string };

  await frame_goto(frame, url);
  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  // Check for FDISK error pages
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(`  → ERROR page: ${landed}`);
    return null;
  }
  // Show all rows (default is 10)
  await selectAlleAnzeige(frame);

  // Log class, id and row count of every table on the page for diagnostics
  const tableInfo = await frame.evaluate(() => {
    return Array.from(document.querySelectorAll('table')).map((t, i) => {
      const cls = t.className || '(no class)';
      const id = t.id || '';
      const rowCount = t.querySelectorAll('tr').length;
      return `${i}:cls="${cls}"${id ? ` id="${id}"` : ''} rows=${rowCount}`;
    }).join(' | ');
  }).catch(() => 'N/A');
  log(`  → tables: ${tableInfo}`);

  // Collect rows from ALL tables, reading input/select values for inline-edit pages
  let allRows: RawRow[] = [];
  try {
    allRows = await frame.evaluate(() => {
      const results: Array<{ cells: string[]; tableClass: string }> = [];
      for (const table of Array.from(document.querySelectorAll('table'))) {
        const cls = table.className || '';
        for (const tr of Array.from(table.querySelectorAll('tbody tr, tr'))) {
          // Skip rows that are nested inside a child table
          if (tr.closest('table') !== table) continue;
          const tds = Array.from(tr.querySelectorAll('td'));
          if (tds.length < 2) continue; // skip single-cell nav/header rows
          results.push({
            tableClass: cls,
            cells: tds.map(td => {
              const input = td.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
              if (input && input.value?.trim()) return input.value.trim();
              const sel = td.querySelector('select') as HTMLSelectElement | null;
              if (sel) {
                const opt = sel.options[sel.selectedIndex];
                return (opt?.text || opt?.value || '').trim();
              }
              // For FDISK list tables, the value is in <a title="..."> inside each cell
              const anchor = td.querySelector('a');
              const atitle = anchor?.getAttribute('title')?.trim();
              if (atitle) return atitle;
              return td.textContent?.trim() ?? '';
            }),
          });
        }
      }
      return results;
    });
  } catch (err: unknown) {
    // Still best-effort, but say why: otherwise a failure looks like an empty page.
    const reason = err instanceof Error ? err.message : String(err);
    log(`  → WARN: table row extraction failed on ${landed}: ${reason}`);
  }

  // Prefer rows from FdcLayList-class tables
  const fdcRows = allRows.filter(r => r.tableClass.includes('FdcLayList'));
  const resultRows = fdcRows.length > 0 ? fdcRows : allRows;
  // Strip \u00A0 (non-breaking space) from all cell values and trim
  const mapped = resultRows.map(r => ({
    cells: r.cells.map(c => c.replace(/\u00A0/g, ' ').trim()),
  }));

  // Find date column dynamically: count date matches per column across ALL rows
  // and pick the column with the MOST matches (avoids picking stray date in nav tables).
  // NOTE(review): this pattern needs two-digit day/month and no time suffix, while
  // parseDate also accepts "D.M.YYYY HH:MM:SS" — confirm these pages never render that form.
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  const dateCountByCol: Record<number, number> = {};
  for (const r of mapped) {
    for (let ci = 0; ci < r.cells.length; ci++) {
      if (datePattern.test(r.cells[ci] ?? '')) {
        dateCountByCol[ci] = (dateCountByCol[ci] || 0) + 1;
      }
    }
  }
  let dateColIdx = -1;
  let maxCount = 0;
  for (const [col, count] of Object.entries(dateCountByCol)) {
    const colNum = Number(col);
    // Highest count wins; on a tie the lower column index wins.
    if (count > maxCount || (count === maxCount && (dateColIdx === -1 || colNum < dateColIdx))) {
      dateColIdx = colNum;
      maxCount = count;
    }
  }
  const dataRows = dateColIdx >= 0
    ? mapped.filter(r => datePattern.test(r.cells[dateColIdx] ?? ''))
    : [];
  log(`${allRows.length} total rows, ${fdcRows.length} FdcLayList rows, ${dataRows.length} data rows (date in col ${dateColIdx})`);
  return { rows: dataRows, dateColIdx };
}
/**
 * Loads the Beförderungen (promotions) list for one member and converts every
 * dated table row into an FdiskBefoerderung record.
 *
 * The rank (Dienstgrad) is taken from the first non-empty cell to the right of
 * the date column; when no such cell exists the rank is stored as ''.
 *
 * @param frame            Frame handed to navigateAndGetTableRows.
 * @param standesbuchNr    Member number; copied into each record and its syncKey.
 * @param idMitgliedschaft FDISK membership id placed in the query string.
 * @param idPersonen       FDISK person id placed in the query string.
 * @returns One record per row returned by navigateAndGetTableRows, or [] when
 *          that helper yields a falsy result.
 */
async function scrapeMemberBefoerderungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskBefoerderung[]> {
  const query = [
    'search=1',
    `searchid_mitgliedschaften=${idMitgliedschaft}`,
    `id_personen=${idPersonen}`,
    `id_mitgliedschaften=${idMitgliedschaft}`,
    `searchid_personen=${idPersonen}`,
    'searchid_maskmode=',
  ].join('&');
  const listUrl = `${BASE_URL}/fdisk/module/mgvw/befoerderungen/befoerderungenList.aspx?${query}`;

  const table = await navigateAndGetTableRows(frame, listUrl);
  if (!table) {
    return [];
  }

  const promotions: FdiskBefoerderung[] = table.rows.map(entry => {
    const promotedOn = parseDate(entry.cells[table.dateColIdx]);

    // Walk the cells right of the date and stop at the first one with content.
    let rank = '';
    for (const cell of entry.cells.slice(table.dateColIdx + 1)) {
      const text = cellText(cell);
      if (text) {
        rank = text;
        break;
      }
    }

    return {
      standesbuchNr,
      datum: promotedOn,
      dienstgrad: rank,
      syncKey: `${standesbuchNr}::${rank}::${promotedOn ?? ''}`,
    };
  });

  log(` Beförderungen for StNr ${standesbuchNr}: ${promotions.length} rows`);
  promotions.forEach(p => {
    log(` ${p.datum ?? '—'} ${p.dienstgrad}`);
  });
  return promotions;
}
/**
 * Navigate to the Untersuchungen sub-page and scrape all medical exams.
 *
 * Steps:
 *  1. Load the list page for the given membership/person; return [] if the
 *     landing URL or page title indicates an FDISK error page.
 *  2. Call selectAlleAnzeige ("show all rows") and, if the page exposes a
 *     history control ("Verlauf" / "Historie" / "Alle anzeigen"), follow it.
 *  3. Read the rows of every table on the page, keep those with a DD.MM.YYYY
 *     value in the detected date column and map them to records.
 *
 * @param frame            Frame used for navigation and DOM evaluation.
 * @param standesbuchNr    Member number; copied into each record and its syncKey.
 * @param idMitgliedschaft FDISK membership id placed in the query string.
 * @param idPersonen       FDISK person id placed in the query string.
 * @returns One record per dated row that has an exam type (`art`).
 */
async function scrapeMemberUntersuchungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskUntersuchung[]> {
  const url = `${BASE_URL}/fdisk/module/mgvw/untersuchungen/UntersuchungenList.aspx`
    + `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`;
  await frame_goto(frame, url);

  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → Untersuchungen ERROR page: ${landed}`);
    return [];
  }

  // Show all rows
  await selectAlleAnzeige(frame);

  // FDISK may show only the most recent exam per type on the list page, so look
  // for a "Verlauf" / "Historie" / "Alle anzeigen" control that leads to more data.
  const hasHistoryLink = await frame.evaluate(() => {
    const candidates = Array.from(document.querySelectorAll('a, input[type="button"], button'));
    for (const el of candidates) {
      const text = (el.textContent || '').toLowerCase();
      const tooltip = (el.getAttribute('title') || '').toLowerCase();
      if (text.includes('verlauf') || text.includes('historie') || text.includes('alle anzeigen')
        || tooltip.includes('verlauf') || tooltip.includes('historie')) {
        // Returned only as a label for the log line below.
        return (el as HTMLElement).id || (el as HTMLAnchorElement).href || text;
      }
    }
    return null;
  }).catch(() => null);

  if (hasHistoryLink) {
    log(` → Found history link: ${hasHistoryLink}, navigating...`);
    try {
      // Arm the navigation wait BEFORE the click is dispatched inside the page.
      // Waiting only afterwards can miss a navigation that has already happened
      // and then sits out the whole timeout. Rejections are swallowed because a
      // control that does not navigate is not an error here.
      const clickNavigation = frame.waitForNavigation({ timeout: 5000 }).catch(() => {});

      const navigated = await frame.evaluate(() => {
        const candidates = Array.from(document.querySelectorAll('a, input[type="button"], button'));
        for (const el of candidates) {
          const text = (el.textContent || '').toLowerCase();
          const tooltip = (el.getAttribute('title') || '').toLowerCase();
          if (text.includes('verlauf') || text.includes('historie') || text.includes('alle anzeigen')
            || tooltip.includes('verlauf') || tooltip.includes('historie')) {
            const href = (el as HTMLAnchorElement).href;
            // Only real http(s) targets can be loaded directly; script-style
            // hrefs such as "javascript:..." have to be triggered by a click.
            if (typeof href === 'string' && /^https?:/i.test(href)) {
              return href;
            }
            (el as HTMLElement).click();
            return 'clicked';
          }
        }
        return null;
      }).catch(() => null);

      if (navigated && navigated !== 'clicked') {
        await frame_goto(frame, navigated);
      } else if (navigated === 'clicked') {
        await clickNavigation;
      }
      await selectAlleAnzeige(frame);
    } catch (e) {
      log(` → Failed to follow history link: ${e}`);
    }
  }

  // Collect rows from the page that is loaded now (list or history view).
  const allRows = await frame.evaluate(() => {
    const results: Array<{ cells: string[]; tableClass: string }> = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      for (const tr of Array.from(table.querySelectorAll('tbody tr, tr'))) {
        // Skip rows that belong to a nested child table.
        if (tr.closest('table') !== table) continue;
        const tds = Array.from(tr.querySelectorAll('td'));
        if (tds.length < 2) continue;
        results.push({
          tableClass: cls,
          cells: tds.map(td => {
            // Cell value priority: text input, then <select>, then anchor title, then text.
            const input = td.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
            if (input) return input.value?.trim() ?? '';
            const sel = td.querySelector('select') as HTMLSelectElement | null;
            if (sel) {
              const opt = sel.options[sel.selectedIndex];
              return (opt?.text || opt?.value || '').trim();
            }
            const anchor = td.querySelector('a');
            const atitle = anchor?.getAttribute('title')?.trim();
            if (atitle) return atitle;
            return td.textContent?.trim() ?? '';
          }),
        });
      }
    }
    return results;
  }).catch(() => [] as Array<{ cells: string[]; tableClass: string }>);

  // Prefer rows from FdcLayList-class tables; otherwise use every row found.
  const fdcRows = allRows.filter(r => r.tableClass.includes('FdcLayList'));
  const resultRows = fdcRows.length > 0 ? fdcRows : allRows;
  // Replace non-breaking spaces and trim every cell.
  const mapped = resultRows.map(r => ({
    cells: r.cells.map(c => c.replace(/\u00A0/g, ' ').trim()),
  }));

  // Date column = column of the first DD.MM.YYYY cell encountered (row-major order).
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  let dateColIdx = -1;
  for (const r of mapped) {
    dateColIdx = r.cells.findIndex(c => datePattern.test(c ?? ''));
    if (dateColIdx >= 0) break;
  }
  const dataRows = dateColIdx >= 0
    ? mapped.filter(r => datePattern.test(r.cells[dateColIdx] ?? ''))
    : [];
  log(` → Untersuchungen: ${allRows.length} total rows, ${dataRows.length} data rows (date in col ${dateColIdx})`);

  const results: FdiskUntersuchung[] = [];
  for (const row of dataRows) {
    // Non-empty cells right of the date, in order: Anmerkungen, Art, Ergebnis.
    // NOTE(review): empty cells are dropped, so a row with an empty Anmerkungen
    // cell would shift Art/Ergebnis one position left — confirm against real pages.
    const valueCols: string[] = [];
    for (let ci = dateColIdx + 1; ci < row.cells.length; ci++) {
      const v = cellText(row.cells[ci]);
      if (v !== null) valueCols.push(v);
    }
    const anmerkungen = valueCols[0] ?? null;
    const art = valueCols[1] ?? null;
    const ergebnis = valueCols[2] ?? null;
    // Rows without an exam type are skipped.
    if (!art) continue;
    const datum = parseDate(row.cells[dateColIdx]);
    const syncKey = `${standesbuchNr}::${art}::${datum ?? ''}`;
    results.push({
      standesbuchNr,
      datum,
      anmerkungen,
      art,
      ergebnis,
      syncKey,
    });
  }
  log(` Untersuchungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const u of results) log(` ${u.datum ?? '—'} [${u.art}] ${u.ergebnis ?? '—'} | ${u.anmerkungen ?? ''}`);
  return results;
}
/**
 * Navigate to the Gesetzliche Fahrgenehmigungen sub-page and scrape all entries.
 *
 * This page is a ListEdit page with form fields named by row index pattern:
 *   ausstellungsdatum_{i}, gueltig_bis_{i}, behoerde_{i}, nummer_{i}, id_fahrgenehmigungsklassen_{i}
 * Those fields are read first. If none are found, the function falls back to
 * table-based parsing with header / data inspection to locate the columns.
 *
 * In both paths a class label of the form "<prefix> / <code>" is reduced to
 * "<code>", so the same licence yields the same `klasse` and `syncKey` no matter
 * which extraction path produced it.
 *
 * @param frame            Frame used for navigation and DOM evaluation.
 * @param standesbuchNr    Member number; copied into each record and its syncKey.
 * @param idMitgliedschaft FDISK membership id placed in the query string.
 * @param idPersonen       FDISK person id placed in the query string.
 * @param idInstanzen      FDISK instance id placed in the query string.
 * @returns The parsed licence records; [] on an error page or when no usable
 *          table / Klasse column is found.
 */
async function scrapeMemberFahrgenehmigungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
  idInstanzen: string,
): Promise<FdiskFahrgenehmigung[]> {
  // FDISK option text includes a prefix such as "KFZ-Führerschein / B";
  // keep only the part after the last " / ". Labels without it pass through.
  const toClassCode = (label: string): string => {
    if (!label.includes(' / ')) return label;
    const segments = label.split(' / ');
    return (segments.pop() ?? label).trim();
  };

  const url = `${BASE_URL}/fdisk/module/mgvw/ges_fahrgenehmigungen/Ges_fahrgenehmigungenListEdit.aspx`
    + `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`
    + `&searchid_instanzen=${idInstanzen}`;
  await frame_goto(frame, url);

  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → Fahrgenehmigungen ERROR page: ${landed}`);
    return [];
  }

  // Show all rows (default is 10)
  await selectAlleAnzeige(frame);

  // Read form fields by ID pattern: {fieldname}_{rowIndex}
  const rawRows = await frame.evaluate(() => {
    const rows: Array<{
      ausstellungsdatum: string;
      gueltigBis: string;
      behoerde: string;
      nummer: string;
      klasse: string;
    }> = [];
    for (let i = 0; i < 100; i++) {
      // Try to find any field for this row index — if none exist, we've passed all rows
      const ausstellungEl = document.querySelector(`input[name="ausstellungsdatum_${i}"], input[id="ausstellungsdatum_${i}"]`) as HTMLInputElement | null;
      const gueltigEl = document.querySelector(`input[name="gueltig_bis_${i}"], input[id="gueltig_bis_${i}"]`) as HTMLInputElement | null;
      const behoerdeEl = document.querySelector(`input[name="behoerde_${i}"], input[id="behoerde_${i}"]`) as HTMLInputElement | null;
      const nummerEl = document.querySelector(`input[name="nummer_${i}"], input[id="nummer_${i}"]`) as HTMLInputElement | null;
      const klasseEl = document.querySelector(`select[name="id_fahrgenehmigungsklassen_${i}"], select[id="id_fahrgenehmigungsklassen_${i}"]`) as HTMLSelectElement | null;
      // If no field found at all, stop
      if (!ausstellungEl && !gueltigEl && !behoerdeEl && !nummerEl && !klasseEl) break;

      // Read klasse from select: selectedIndex first, then the [selected]
      // attribute, then the select's raw value.
      let klasse = '';
      if (klasseEl) {
        const idx = klasseEl.selectedIndex;
        if (idx >= 0 && klasseEl.options[idx]) {
          klasse = (klasseEl.options[idx].text || klasseEl.options[idx].value || '').trim();
        }
        if (!klasse) {
          const selectedOpt = klasseEl.querySelector('option[selected]') as HTMLOptionElement | null;
          if (selectedOpt) {
            klasse = (selectedOpt.text || selectedOpt.value || '').trim();
          }
        }
        if (!klasse && klasseEl.value?.trim()) {
          klasse = klasseEl.value.trim();
        }
      }

      rows.push({
        ausstellungsdatum: ausstellungEl?.value?.trim() ?? '',
        gueltigBis: gueltigEl?.value?.trim() ?? '',
        behoerde: behoerdeEl?.value?.trim() ?? '',
        nummer: nummerEl?.value?.trim() ?? '',
        klasse,
      });
    }
    return rows;
  }).catch(() => [] as Array<{ ausstellungsdatum: string; gueltigBis: string; behoerde: string; nummer: string; klasse: string }>);
  log(` → Fahrgenehmigungen form-field extraction: ${rawRows.length} rows found`);

  // If form-field approach found rows, use them
  if (rawRows.length > 0) {
    const VALID_LICENSE_CLASSES = new Set([
      'A', 'A1', 'A2', 'AM', 'B', 'B1', 'BE', 'C', 'C1', 'CE', 'C1E',
      'D', 'D1', 'DE', 'D1E', 'F', 'G', 'L', 'T',
    ]);
    const results: FdiskFahrgenehmigung[] = [];
    for (const row of rawRows) {
      const rawKlasse = cellText(row.klasse);
      if (!rawKlasse) continue;
      const klasse = toClassCode(rawKlasse);
      // Validate klasse against whitelist — skip non-class data
      if (!VALID_LICENSE_CLASSES.has(klasse.toUpperCase())) {
        log(` → Skipping invalid klasse: "${klasse}"`);
        continue;
      }
      const ausstellungsdatum = parseDate(row.ausstellungsdatum);
      const syncKey = `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`;
      results.push({
        standesbuchNr,
        ausstellungsdatum,
        gueltigBis: parseDate(row.gueltigBis),
        behoerde: cellText(row.behoerde),
        nummer: cellText(row.nummer),
        klasse,
        syncKey,
      });
    }
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${results.length} rows`);
    for (const f of results) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
    return results;
  }

  // Fallback: table-based parsing (original approach with extractCellValue)
  log(` → Fahrgenehmigungen: no form fields found, falling back to table parsing`);
  const pageData = await frame.evaluate(() => {
    // Cell value priority: non-empty text input, <select>, anchor title, plain text.
    const extractCellValue = (cell: Element): string => {
      const input = cell.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
      if (input && input.value?.trim()) return input.value.trim();
      const sel = cell.querySelector('select') as HTMLSelectElement | null;
      if (sel) {
        const idx = sel.selectedIndex;
        if (idx >= 0 && sel.options[idx]) {
          const t = (sel.options[idx].text || sel.options[idx].value || '').trim();
          if (t) return t;
        }
        // Fallback: read the selected attribute directly from HTML
        const selectedOpt = sel.querySelector('option[selected]') as HTMLOptionElement | null;
        if (selectedOpt) {
          const t = (selectedOpt.text || selectedOpt.value || '').trim();
          if (t) return t;
        }
        if (sel.value?.trim()) return sel.value.trim();
      }
      const anchor = cell.querySelector('a');
      const atitle = anchor?.getAttribute('title')?.trim();
      if (atitle) return atitle;
      return cell.textContent?.trim() ?? '';
    };

    const tables: Array<{
      tableClass: string;
      headers: string[];
      rows: Array<{ cells: string[] }>;
    }> = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      const thElements = Array.from(table.querySelectorAll('thead th, tr th'));
      const headers = thElements.map(th => extractCellValue(th));
      const dataRows: Array<{ cells: string[] }> = [];
      for (const tr of Array.from(table.querySelectorAll('tr'))) {
        // Ignore rows of nested tables, rows with fewer than two cells, and header rows.
        if (tr.closest('table') !== table) continue;
        const tds = Array.from(tr.querySelectorAll('td'));
        if (tds.length < 2) continue;
        if (tr.querySelectorAll('th').length > 0) continue;
        dataRows.push({ cells: tds.map(td => extractCellValue(td)) });
      }
      tables.push({ tableClass: cls, headers, rows: dataRows });
    }
    return tables;
  }).catch(() => [] as Array<{ tableClass: string; headers: string[]; rows: Array<{ cells: string[] }> }>);

  // Diagnostic: log all tables found
  for (let ti = 0; ti < pageData.length; ti++) {
    const t = pageData[ti];
    log(` → table ${ti}: cls="${t.tableClass}" headers=[${t.headers.join(', ')}] dataRows=${t.rows.length}`);
    for (let ri = 0; ri < t.rows.length; ri++) {
      const preview = t.rows[ri].cells.slice(0, 8).map((c, j) => `[${j}]="${c}"`).join(' ');
      log(` row ${ri}: ${preview}`);
    }
  }

  // Prefer a non-empty FdcLayList table; otherwise take the table with the most data rows.
  const bestTable = pageData.find(t => t.tableClass.includes('FdcLayList') && t.rows.length > 0)
    || pageData.filter(t => t.rows.length > 0).sort((a, b) => b.rows.length - a.rows.length)[0];
  if (!bestTable || bestTable.rows.length === 0) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: no data table found`);
    return [];
  }

  // First attempt: locate the columns through header captions.
  const headers = bestTable.headers.map(h => h.toLowerCase());
  log(` Fahrgenehmigungen headers: [${headers.join(', ')}]`);
  let klasseIdx = headers.findIndex(h => h.includes('klasse') || h.includes('fahrgenehmigung'));
  let ausstellungIdx = headers.findIndex(h => h.includes('ausstellung'));
  const gueltigIdx = headers.findIndex(h => h.includes('gültig') || h.includes('gultig') || h.includes('ablauf'));
  const behoerdeIdx = headers.findIndex(h => h.includes('behörde') || h.includes('behorde'));
  const nummerIdx = headers.findIndex(h => h.includes('nummer') || h.includes('nr'));

  // Second attempt for Klasse: inspect the first three data rows.
  const KNOWN_KLASSEN = new Set([
    'AM', 'A1', 'A2', 'A', 'B', 'BE', 'C1', 'C1E', 'C', 'CE',
    'D1', 'D1E', 'D', 'DE', 'F', 'L', 'L17', 'B+E', 'C+E', 'D+E',
  ]);
  if (klasseIdx === -1) {
    for (const row of bestTable.rows.slice(0, 3)) {
      for (let ci = 0; ci < row.cells.length; ci++) {
        const val = row.cells[ci]?.trim();
        // Match known klassen or values containing "Führerschein" etc.
        if (KNOWN_KLASSEN.has(val.toUpperCase()) || /führerschein|lenkberechtigung/i.test(val)) {
          klasseIdx = ci;
          log(` Fahrgenehmigungen: found Klasse in column ${ci} by data inspection`);
          break;
        }
      }
      if (klasseIdx >= 0) break;
    }
  }

  // Second attempt for the issue date: first DD.MM.YYYY cell in the first three rows.
  if (ausstellungIdx === -1) {
    const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
    for (const row of bestTable.rows.slice(0, 3)) {
      for (let ci = 0; ci < row.cells.length; ci++) {
        if (ci === klasseIdx) continue;
        if (datePattern.test(row.cells[ci]?.trim())) {
          ausstellungIdx = ci;
          break;
        }
      }
      if (ausstellungIdx >= 0) break;
    }
  }
  log(` Fahrgenehmigungen column map: klasse=${klasseIdx} ausstellung=${ausstellungIdx} gueltig=${gueltigIdx} behoerde=${behoerdeIdx} nummer=${nummerIdx}`);
  if (klasseIdx === -1) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: could not determine Klasse column. Returning empty.`);
    return [];
  }

  const results: FdiskFahrgenehmigung[] = [];
  for (const row of bestTable.rows) {
    const rawKlasse = cellText(row.cells[klasseIdx]);
    if (!rawKlasse) continue;
    // Skip caption / navigation rows and rows whose "klasse" cell is really a date.
    // These checks run on the raw cell text so the set of skipped rows is unchanged.
    if (/klasse|fahrgenehmigung|ausstellung|datensätze|information|tiefennavigation/i.test(rawKlasse)) continue;
    if (/^\d{2}\.\d{2}\.\d{4}$/.test(rawKlasse)) continue;
    // Same label normalisation as the form-field path.
    const klasse = toClassCode(rawKlasse);
    if (!klasse) continue;
    const ausstellungsdatum = parseDate(ausstellungIdx >= 0 ? row.cells[ausstellungIdx] : undefined);
    const syncKey = `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`;
    results.push({
      standesbuchNr,
      ausstellungsdatum,
      gueltigBis: parseDate(gueltigIdx >= 0 ? row.cells[gueltigIdx] : undefined),
      behoerde: cellText(behoerdeIdx >= 0 ? row.cells[behoerdeIdx] : undefined),
      nummer: cellText(nummerIdx >= 0 ? row.cells[nummerIdx] : undefined),
      klasse,
      syncKey,
    });
  }
  log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const f of results) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
  return results;
}
/**
 * Legacy export kept for compatibility — delegates to the new unified flow.
 *
 * Opens the member's detail page, reads `id_mitgliedschaften` and `id_personen`
 * from the detail URL's query string (either may be null when absent) and hands
 * everything to scrapeAusbildungenFromDetailPage.
 *
 * @param frame  Frame to navigate.
 * @param member Member whose `detailUrl` is followed.
 * @returns The result of scrapeAusbildungenFromDetailPage, or [] when the
 *          member has no detail URL.
 */
export async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  const { detailUrl } = member;
  if (!detailUrl) {
    return [];
  }

  await frame_goto(frame, detailUrl);

  // Resolve against the frame's current URL so a relative detail URL still parses.
  const { searchParams } = new URL(detailUrl, frame.url());
  const membershipId = searchParams.get('id_mitgliedschaften');
  const personId = searchParams.get('id_personen');

  return scrapeAusbildungenFromDetailPage(frame, member, membershipId, personId);
}