Files
dashboard/sync/src/scraper.ts
Matthias Hochmeister e666ff434e update
2026-03-13 21:44:54 +01:00

814 lines
33 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { chromium, Page, Frame } from '@playwright/test';
import {
FdiskMember,
FdiskAusbildung,
FdiskBefoerderung,
FdiskUntersuchung,
FdiskFahrgenehmigung,
} from './types';
// Root URL of the FDISK web application; override with the FDISK_BASE_URL environment variable.
const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';
// Overridable via FDISK_ID_FEUERWEHREN. Presumably the fire brigade's FDISK id — TODO confirm;
// it is not referenced in the portion of the file reviewed here.
const ID_FEUERWEHREN = process.env.FDISK_ID_FEUERWEHREN ?? '164';
// Fallback for the `id_instanzen` parameter when the member detail URL does not carry one.
const ID_INSTANZEN = process.env.FDISK_ID_INSTANZEN ?? '2853';
// Login page; login() starts here.
const LOGIN_URL = `${BASE_URL}/fdisk/module/vws/logins/logins.aspx`;
// Member list page loaded into the main frame after login.
const MEMBERS_URL = `${BASE_URL}/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx`;
/** Print `msg` to stdout, prefixed with `[scraper]` and the current ISO-8601 timestamp. */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log('[scraper] ' + timestamp + ' ' + msg);
}
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored. Besides checking the textual shape, the
 * day and month are validated against the calendar (including leap years), so
 * placeholders such as `00.00.0000` or impossible dates such as `31.02.2020`
 * are rejected instead of being turned into an invalid ISO date.
 *
 * @param raw - Cell text as scraped; may be null/undefined/empty.
 * @returns The ISO date string, or null if empty, unparseable, or not a real calendar date.
 */
function parseDate(raw: string | null | undefined): string | null {
  if (!raw) return null;
  const match = /^(\d{2})\.(\d{2})\.(\d{4})$/.exec(raw.trim());
  if (!match) return null;

  const [, dd, mm, yyyy] = match;
  const day = Number(dd);
  const month = Number(mm);
  const year = Number(yyyy);
  if (month < 1 || month > 12 || day < 1) return null;

  // Gregorian leap-year rule; computed by hand because Date would silently
  // roll over out-of-range days (31.02. -> 03.03.) instead of rejecting them.
  const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
  const daysPerMonth = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
  if (day > daysPerMonth[month - 1]) return null;

  return `${yyyy}-${mm}-${dd}`;
}
/**
 * Normalise scraped cell text: strip surrounding whitespace and map
 * missing or blank input to null.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) return null;
  const stripped = text.trim();
  return stripped === '' ? null : stripped;
}
/**
 * Run one complete scrape: log in, read the member list, then visit the detail
 * pages of every member that has a dashboard account.
 *
 * A member counts as "known" when its standesbuchNr is in `knownStNrs` or its
 * lower-cased `vorname::zuname` key is in `knownNames`; all other members are
 * returned with list data only. A failure while scraping one member's detail
 * pages is logged and does not abort the run. The browser is always closed.
 *
 * @param username - FDISK login name.
 * @param password - FDISK password.
 * @param knownStNrs - Standesbuch numbers of members that already have an account.
 * @param knownNames - `vorname::zuname` keys (lower case) used for first-time linking.
 * @returns All members plus the detail records collected for known members.
 */
export async function scrapeAll(username: string, password: string, knownStNrs: Set<string>, knownNames: Set<string>): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
  befoerderungen: FdiskBefoerderung[];
  untersuchungen: FdiskUntersuchung[];
  fahrgenehmigungen: FdiskFahrgenehmigung[];
}> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });
  const context = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  });
  const page = await context.newPage();

  try {
    await login(page, username, password);

    // After login the page is the Start.aspx frameset. The member list is
    // loaded inside its main frame rather than by navigating the top-level
    // page, because the server takes the organisation context from session state.
    const mainFrame = await navigateToMemberList(page);
    const members = await scrapeMembers(mainFrame);
    log(`Found ${members.length} members`);

    const ausbildungen: FdiskAusbildung[] = [];
    const befoerderungen: FdiskBefoerderung[] = [];
    const untersuchungen: FdiskUntersuchung[] = [];
    const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];

    for (const member of members) {
      // Detail pages are only visited for members with a dashboard account
      // (matched by standesbuchNr, or by name for first-time linking).
      const nameKey = `${member.vorname.toLowerCase()}::${member.zuname.toLowerCase()}`;
      const hasAccount = knownStNrs.has(member.standesbuchNr) || knownNames.has(nameKey);
      if (!hasAccount) continue;

      try {
        // Prefer the direct detail URL; fall back to search + click.
        let onDetail: boolean;
        if (member.detailUrl) {
          await frame_goto(mainFrame, member.detailUrl);
          onDetail = true;
        } else {
          onDetail = await navigateToMemberDetailBySearch(mainFrame, member.standesbuchNr);
        }
        if (!onDetail) {
          log(` SKIP ${member.vorname} ${member.zuname} (${member.standesbuchNr}): could not reach detail page`);
          continue;
        }

        // Extra profile fields from the detail form.
        const { geburtsort, geschlecht, beruf, wohnort, plz } = await scrapeDetailProfileFields(mainFrame);
        member.geburtsort = geburtsort;
        member.geschlecht = geschlecht;
        member.beruf = beruf;
        member.wohnort = wohnort;
        member.plz = plz;

        // The ids needed to build the sub-section URLs are read from the
        // detail page URL *before* any further navigation happens.
        const query = new URL(mainFrame.url()).searchParams;
        const idMitgliedschaft = query.get('id_mitgliedschaften');
        const idPersonen = query.get('id_personen');
        const idInstanzen = query.get('id_instanzen') ?? ID_INSTANZEN;
        const ids = idMitgliedschaft && idPersonen ? { idMitgliedschaft, idPersonen } : null;

        // Ausbildungen
        const quals = await scrapeAusbildungenFromDetailPage(mainFrame, member);
        ausbildungen.push(...quals);

        // Beförderungen
        const befos = ids
          ? await scrapeMemberBefoerderungen(mainFrame, member.standesbuchNr, ids.idMitgliedschaft, ids.idPersonen)
          : [];
        befoerderungen.push(...befos);

        // Untersuchungen
        const unters = ids
          ? await scrapeMemberUntersuchungen(mainFrame, member.standesbuchNr, ids.idMitgliedschaft, ids.idPersonen)
          : [];
        untersuchungen.push(...unters);

        // Fahrgenehmigungen
        const fahrg = ids
          ? await scrapeMemberFahrgenehmigungen(mainFrame, member.standesbuchNr, ids.idMitgliedschaft, ids.idPersonen, idInstanzen)
          : [];
        fahrgenehmigungen.push(...fahrg);

        log(` ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen, ${befos.length} Beförderungen, ${unters.length} Untersuchungen, ${fahrg.length} Fahrgenehmigungen`);
        // Short pause between members.
        await page.waitForTimeout(500);
      } catch (err) {
        log(` WARN: could not scrape detail for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen, befoerderungen, untersuchungen, fahrgenehmigungen };
  } finally {
    await browser.close();
  }
}
/** Point `frame` at `url` and resolve once the network has gone idle. */
async function frame_goto(frame: Frame, url: string): Promise<void> {
  const navigationOptions = { waitUntil: 'networkidle' } as const;
  await frame.goto(url, navigationOptions);
}
/**
 * Sign in to FDISK through the login form.
 *
 * If opening the login URL lands on a page whose URL no longer contains
 * "login", the session is treated as already authenticated and nothing is
 * submitted.
 *
 * @throws Error when the browser is still on a login URL after submitting.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const isLoginUrl = (candidate: string): boolean => candidate.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // Already logged in?
  const urlBeforeLogin = page.url();
  if (!isLoginUrl(urlBeforeLogin)) {
    log(`Already logged in, on: ${urlBeforeLogin}`);
    return;
  }

  // Element ids taken from the known login form HTML.
  const userInput = page.locator('#login');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await page.locator('#password').fill(password);
  await page.locator('#Submit2').click();

  // Give the site up to 15s to leave the login page.
  try {
    await page.waitForURL(
      (url) => !isLoginUrl(url.toString()),
      { waitUntil: 'networkidle', timeout: 15000 },
    );
  } catch {
    // Deliberately ignored: the URL check below decides success or failure.
  }

  const urlAfterLogin = page.url();
  if (isLoginUrl(urlAfterLogin)) {
    throw new Error(`Login failed — still on login page: ${urlAfterLogin}`);
  }
  log(`Logged in successfully, redirected to: ${urlAfterLogin}`);
}
/**
 * Reach a member's detail page without a direct URL: open the member list,
 * restrict the search form to exactly `standesbuchNr`, submit it and click the
 * first result row.
 *
 * @returns true when the frame URL afterwards is no longer a member-list URL;
 *   false when the search form or a result row is missing, or the click did
 *   not leave the list.
 */
async function navigateToMemberDetailBySearch(frame: Frame, standesbuchNr: string): Promise<boolean> {
  // Open the member list.
  await frame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await frame.waitForLoadState('networkidle');

  // Fill both ends of the StNr range with the same value (runs in the page).
  const filterApplied = await frame.evaluate((wanted) => {
    const searchForm = (document as any).forms['frmsearch'];
    if (!searchForm) return false;
    const lowerBound = searchForm.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
    const upperBound = searchForm.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
    if (!lowerBound || !upperBound) return false;
    lowerBound.value = wanted;
    upperBound.value = wanted;
    return true;
  }, standesbuchNr);
  if (!filterApplied) {
    log(` WARN navigateToMemberDetailBySearch: search form not usable for StNr ${standesbuchNr}`);
    return false;
  }

  // Start waiting for the navigation before triggering the submit.
  await Promise.all([
    frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
    frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
  ]);

  // Clicking a data row is expected to open the detail page.
  const rowTarget = await frame.$('table.FdcLayList tbody tr:first-child a, table.FdcLayList tbody tr:first-child td');
  if (!rowTarget) {
    log(` WARN navigateToMemberDetailBySearch: no result row for StNr ${standesbuchNr}`);
    return false;
  }

  try {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }),
      rowTarget.click(),
    ]);
  } catch {
    // A timeout here is tolerated (the click may not have navigated);
    // the URL inspection below is what decides the outcome.
  }

  const landedUrl = frame.url();
  const listPageMarkers = ['MitgliedschaftenList', 'meine_Mitglieder'];
  const onDetailPage = !listPageMarkers.some((marker) => landedUrl.includes(marker));
  if (!onDetailPage) {
    log(` WARN navigateToMemberDetailBySearch: still on list page after click for StNr ${standesbuchNr}`);
  } else {
    log(` Navigated to detail via search+click: ${landedUrl}`);
  }
  return onDetailPage;
}
/**
 * Load the member list into the frame named `mainFrame` and return that frame.
 *
 * @throws Error when the frame does not exist or the resulting URL looks like
 *   an error page (contains "BLError", "support.aspx" or "Error").
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const mainFrame = page.frame({ name: 'mainFrame' });
  if (!mainFrame) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await mainFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await mainFrame.waitForLoadState('networkidle');

  const url = mainFrame.url();
  const title = await mainFrame.title();
  log(`mainFrame loaded: ${url} — title: "${title}"`);

  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  const isErrorPage = errorMarkers.some((marker) => url.includes(marker));
  if (isErrorPage) {
    throw new Error(`Member list returned error page: ${url}`);
  }
  return mainFrame;
}
/**
 * Scrape the member list shown in `frame` and return one FdiskMember per row.
 *
 * Steps:
 *  1. If the search form `frmsearch` exists, clear every non-hidden field whose
 *     name or id contains "standesbuch", switch any page-size select to its
 *     largest numeric option, and submit the form.
 *  2. Parse the first result page and read the pagination text ("a-b von N").
 *  3. If fewer rows are shown than N, fetch the rest with Standesbuchnummer
 *     range queries (15 numbers per query), de-duplicating by standesbuchNr.
 *  4. Convert rows to FdiskMember; rows lacking standesbuchNr, vorname or
 *     zuname are dropped. `status` is derived solely from the Abmeldedatum.
 *
 * Profile fields (geburtsort, geschlecht, beruf, wohnort, plz) are initialised
 * to null here.
 *
 * Rejects if `table.FdcLayList` does not appear within 20s or a form
 * submission does not navigate within 30s.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // Clear the Standesbuchnummer filter if the search form is present.
  // FDISK pre-fills the logged-in user's own Standesbuchnummer, which limits results to 1 member.
  // We clear it before submitting so all members of the fire station are returned.
  const hasForm = await frame.$('form[name="frmsearch"]') !== null;
  if (hasForm) {
    // Runs inside the page: must not reference anything from the Node scope.
    const fieldDump = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return { cleared: [], pageSizeSet: null as string | null, allFields: [] };
      const cleared: string[] = [];
      const allFields: string[] = [];
      let pageSizeSet: string | null = null;
      for (const el of Array.from(form.elements) as HTMLInputElement[]) {
        if (el.type === 'hidden') continue;
        const name = (el.name ?? '').toLowerCase();
        const id = (el.id ?? '').toLowerCase();
        // Record the pre-existing value for diagnostics before touching it.
        if (el.value) allFields.push(`${el.name || el.id}=${el.value}`);
        if (name.includes('standesbuch') || id.includes('standesbuch')) {
          el.value = '';
          cleared.push(el.name || el.id);
        }
        // Maximize page size: look for a select AND its paired hidden input
        // FDISK uses a custom Dd widget where <select name="xDd_dd"> is the visible dropdown
        // but the actual POST value comes from <input type="hidden" name="xDd_id"> or similar.
        if ((name.includes('anzahl') || id.includes('anzahl') ||
             name.includes('pagesize') || id.includes('pagesize') ||
             name.includes('rows') || id.includes('rows')) &&
            el.tagName === 'SELECT') {
          const select = el as unknown as HTMLSelectElement;
          // Pick the largest numeric option value, or the last option as fallback
          let bestOption: HTMLOptionElement | null = null;
          let bestVal = -1;
          for (const opt of Array.from(select.options)) {
            const n = parseInt(opt.value, 10);
            if (!isNaN(n) && n > bestVal) { bestVal = n; bestOption = opt; }
          }
          if (!bestOption && select.options.length > 0) {
            bestOption = select.options[select.options.length - 1];
          }
          if (bestOption) {
            select.value = bestOption.value;
            pageSizeSet = `${el.name || el.id}=${bestOption.value}`;
            // Also update the paired hidden field used by the Dd custom widget.
            // Common patterns: xDd_dd → xDd_id or xDd_hd
            const baseName = (el.name || el.id).replace(/_dd$/i, '');
            for (const suffix of ['_id', '_hd', '_val']) {
              const hidden = form.elements[baseName + suffix] as HTMLInputElement | undefined;
              if (hidden && hidden.type === 'hidden') {
                hidden.value = bestOption.value;
                pageSizeSet += ` (also set ${baseName + suffix})`;
              }
            }
          }
        }
      }
      return { cleared, pageSizeSet, allFields };
    });

    if (fieldDump.allFields.length > 0) {
      log(`Search form active filters before clear: ${fieldDump.allFields.join(', ')}`);
    }
    if (fieldDump.cleared.length > 0) {
      log(`Cleared Standesbuchnummer filter fields: ${fieldDump.cleared.join(', ')}`);
    } else {
      log('Search form found — no Standesbuchnummer field detected, submitting as-is');
    }
    if (fieldDump.pageSizeSet) {
      log(`Set page size: ${fieldDump.pageSizeSet}`);
    } else {
      log('No page size field found — will paginate through all results');
    }

    // Use Promise.all to start waiting for navigation BEFORE triggering the submit,
    // otherwise waitForLoadState resolves against the already-idle current page.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
    log(`After form submit: ${frame.url()}`);
  }

  // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];
  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
  const firstRows = await parseRowsFromTable(frame);
  log(`Initial fetch: ${firstRows.length} rows`);

  // Log href debug info for the first row to diagnose URL extraction
  // (parseRowsFromTable stores it on the page's window object).
  const rowDebug = await frame.evaluate(() => (window as any).__fdiskFirstRowDebug ?? 'no debug info');
  log(`Row href debug: ${rowDebug}`);
  for (const row of firstRows) {
    log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
  }

  // Pagination text has the form "<first>-<last> von <total>".
  const pagination = await frame.evaluate(() =>
    document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
  );
  log(`Pagination: "${pagination}"`);
  const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
  const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
  const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;

  const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
  const allRows: ParsedRow[] = [...firstRows];

  // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
  if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
    log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);
    const BATCH = 15; // fetch 15 StNr slots at a time — safely under the 20-row page limit
    const MAX_STNR = 9999; // upper bound; we stop earlier if we have all members
    let startNr = 1;
    // NOTE(review): a batch counts as "empty" when it yields no NEW members, so
    // ranges containing only already-seen members also count. If the first page
    // already covers 5 consecutive ranges, the loop stops early — confirm this
    // cannot happen with real data before relying on completeness.
    let consecutiveEmpty = 0;

    while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
      const endNr = startNr + BATCH - 1;

      // Set StNr range in the search form and submit
      const formOk = await frame.evaluate(({ s, e }: { s: number; e: number }) => {
        const form = (document as any).forms['frmsearch'];
        if (!form) return false;
        const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
        const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
        if (!fromFld || !toFld) return false;
        fromFld.value = String(s);
        toFld.value = String(e);
        return true;
      }, { s: startNr, e: endNr });
      if (!formOk) {
        log('WARN: could not set StNr range fields — aborting range queries');
        break;
      }

      await Promise.all([
        frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
        frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
      ]);

      const rangeRows = await parseRowsFromTable(frame);
      const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
      newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
      allRows.push(...newRows);

      // Fixed: the range bounds were previously printed without a separator
      // (range 1..15 was logged as "StNr 115").
      log(`StNr ${startNr}-${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
      for (const row of newRows) {
        log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
      }

      consecutiveEmpty = newRows.length === 0 ? consecutiveEmpty + 1 : 0;
      startNr = endNr + 1;
    }
    log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
  }

  log(`Parsed ${allRows.length} raw rows total`);

  const members: FdiskMember[] = [];
  for (const row of allRows) {
    // Header/filler rows have no StNr or name and are skipped.
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      // A parseable Abmeldedatum marks the member as having left.
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
/**
 * Read every `<tr>` of `table.FdcLayList tbody` in `frame` into a plain object.
 *
 * Cell layout used (0-based `<td>` index): 1=Status, 2=St.-Nr., 3=Dienstgrad,
 * 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum, 9=Abmeldedatum.
 * A cell's value is the `title` of its `<a>` when non-empty, otherwise the
 * cell's text content.
 *
 * `href` is the row's detail URL: the first anchor with a real href, or
 * failing that the first quoted `.aspx` URL found in an `onclick` attribute of
 * the row, its anchors or its cells; null when neither exists.
 *
 * Side effect: for the first row (when it has a St.-Nr.) a diagnostic string is
 * stored on the page's `window.__fdiskFirstRowDebug`.
 */
async function parseRowsFromTable(frame: Frame) {
  // The callback executes inside the page, so it has to be self-contained.
  return frame.$$eval('table.FdcLayList tbody tr', (trs) =>
    trs.map((tr, rowIdx) => {
      const tds = Array.from(tr.querySelectorAll('td'));

      const cellValue = (col: number) => {
        const cell = tds[col];
        const titleAttr = cell?.querySelector('a')?.getAttribute('title')?.trim();
        // An empty title falls through to the visible text.
        return (titleAttr || cell?.textContent || '').trim();
      };

      let href: string | null = null;
      let debugInfo = '';

      // Strategy 1: a regular <a href="..."> (not "#", not javascript:).
      for (const anchor of Array.from(tr.querySelectorAll('a'))) {
        const rawHref = (anchor as Element).getAttribute('href') ?? '';
        debugInfo += `a.href="${rawHref}" `;
        const isRealLink = rawHref !== '' && rawHref !== '#' && !rawHref.startsWith('javascript:');
        if (isRealLink) {
          // The property (unlike the attribute) is already absolute.
          href = (anchor as HTMLAnchorElement).href;
          break;
        }
      }

      // Strategy 2: an .aspx URL quoted inside an onclick handler.
      if (!href) {
        const onclickHolders: Element[] = [tr, ...Array.from(tr.querySelectorAll('a, td'))];
        for (const holder of onclickHolders) {
          const handler = holder.getAttribute('onclick') ?? '';
          if (handler) debugInfo += `onclick="${handler}" `;
          const found = handler.match(/['"]([^'"]*\.aspx[^'"]*)['"]/);
          if (!found) continue;
          try {
            href = new URL(found[1], (window as Window).location.href).href;
          } catch {
            // Not resolvable against the page URL: keep the raw value.
            href = found[1];
          }
          break;
        }
      }

      // Diagnostics for the first data row only.
      const stNr = cellValue(2);
      if (rowIdx === 0 && stNr) {
        (window as any).__fdiskFirstRowDebug = `StNr=${stNr} href=${href} debug=${debugInfo}`;
      }

      return {
        status: cellValue(1),
        standesbuchNr: stNr,
        dienstgrad: cellValue(3),
        vorname: cellValue(4),
        zuname: cellValue(5),
        geburtsdatum: cellValue(6),
        svnr: cellValue(7),
        eintrittsdatum: cellValue(8),
        abmeldedatum: cellValue(9),
        href,
      };
    }),
  );
}
/**
 * Read the extra profile fields from the member detail form that is currently
 * loaded in `frame`.
 *
 * Each field is looked up through a list of selectors tried in order; the first
 * one that yields a non-empty value wins. For `<select>` elements the selected
 * option's text (or value) is used. Fields that cannot be found are null.
 */
async function scrapeDetailProfileFields(frame: Frame): Promise<{
  geburtsort: string | null;
  geschlecht: string | null;
  beruf: string | null;
  wohnort: string | null;
  plz: string | null;
}> {
  // Everything inside the callback runs in the page context.
  return frame.evaluate(() => {
    const readField = (selector: string): string | null => {
      const el = document.querySelector(selector) as HTMLInputElement | HTMLSelectElement | null;
      if (!el) return null;
      if (el.tagName !== 'SELECT') {
        return (el as HTMLInputElement).value?.trim() || null;
      }
      const dropdown = el as HTMLSelectElement;
      const chosen = dropdown.options[dropdown.selectedIndex];
      if (!chosen) return null;
      return (chosen.text || chosen.value || '').trim() || null;
    };

    const firstMatch = (selectors: string[]): string | null => {
      for (const selector of selectors) {
        const value = readField(selector);
        if (value !== null) return value;
      }
      return null;
    };

    return {
      geburtsort: firstMatch(['input[name="geburtsort"]', 'input[id*="geburtsort"]']),
      geschlecht: firstMatch(['select[name*="geschlecht"]', 'select[id*="geschlecht"]']),
      beruf: firstMatch(['input[name="beruf"]', 'input[id*="beruf"]']),
      wohnort: firstMatch(['input[name="ort"]', 'input[id*="_ort"]', 'input[name="wohnort"]']),
      plz: firstMatch(['input[name="plz"]', 'input[id*="plz"]']),
    };
  });
}
/**
 * Collect a member's Ausbildungen (qualifications) starting from the detail
 * page already loaded in `frame`.
 *
 * If no Ausbildung section is detected as visible, a visible link containing
 * "Ausbildung" is clicked first. Afterwards the first table whose header row
 * has a column containing "kurs", "ausbildung" or "bezeichnung" is parsed;
 * the other columns are located by header keywords. Rows without a course
 * name are skipped. Returns an empty array when no such table exists.
 */
async function scrapeAusbildungenFromDetailPage(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  // NOTE(review): a comma inside a `text=` selector is probably not a selector
  // list — this likely looks for the literal text "Ausbildung, text=Ausbildungsliste"
  // and is therefore never visible, so the link below is always tried. Confirm
  // against the Playwright selector docs before changing it.
  const sectionVisible = await frame
    .locator('text=Ausbildung, text=Ausbildungsliste')
    .first()
    .isVisible()
    .catch(() => false);

  if (!sectionVisible) {
    // Try an Ausbildung tab/link if there is one.
    const tabLink = frame.locator('a:has-text("Ausbildung")').first();
    const linkVisible = await tabLink.isVisible().catch(() => false);
    if (linkVisible) {
      await tabLink.click();
      await frame.waitForLoadState('networkidle').catch(() => {});
    }
  }

  // Index of the first header cell containing any of the keywords, or -1.
  const columnOf = (header: string[], keywords: string[]): number =>
    header.findIndex((h) => keywords.some((k) => h.includes(k)));

  for (const table of await frame.$$('table')) {
    const rows = await table.$$eval('tr', (trs) =>
      trs.map((tr) => ({
        cells: Array.from(tr.querySelectorAll('td, th')).map((c) => (c as Element).textContent?.trim() ?? ''),
      })),
    );
    // Need a header row plus at least one data row.
    if (rows.length < 2) continue;

    const header = rows[0].cells.map((c) => c.toLowerCase());
    const kursnameIdx = columnOf(header, ['kurs', 'ausbildung', 'bezeichnung']);
    if (kursnameIdx < 0) continue; // not an Ausbildung table

    const datumIdx = columnOf(header, ['datum', 'abschluss']);
    const ablaufIdx = columnOf(header, ['ablauf', 'gültig']);
    const ortIdx = columnOf(header, ['ort']);
    const bemIdx = columnOf(header, ['bem', 'info']);

    const cellAt = (cells: string[], idx: number): string | null => (idx >= 0 ? cells[idx] : null);

    const ausbildungen: FdiskAusbildung[] = [];
    for (const { cells } of rows.slice(1)) {
      const kursname = cellText(cells[kursnameIdx]);
      if (!kursname) continue;
      const kursDatum = parseDate(cellAt(cells, datumIdx));
      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(cellAt(cells, ablaufIdx)),
        ort: cellText(cellAt(cells, ortIdx)),
        bemerkung: cellText(cellAt(cells, bemIdx)),
        syncKey: `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`,
      });
    }
    // Only the first matching table is used.
    return ausbildungen;
  }

  return [];
}
/**
 * Navigate `frame` to a sub-section list page and return its data rows.
 *
 * Rows are gathered from every table on the page (rows of nested tables are
 * attributed to the inner table only; rows with fewer than two `<td>` are
 * ignored). If any row belongs to a table whose class contains "FdcLayList",
 * only those rows are considered. Finally only rows whose first cell is a
 * DD.MM.YYYY date are kept.
 *
 * A cell's value is, in order of preference: the value of a text input, the
 * selected option of a select, the `title` of an anchor, the text content.
 *
 * @returns The data rows (possibly empty), or null when the navigation ended
 *   on an FDISK error page.
 */
async function navigateAndGetTableRows(
  frame: Frame,
  url: string,
): Promise<Array<{ cells: string[] }> | null> {
  await frame_goto(frame, url);

  const landed = frame.url();
  const title = await frame.title().catch(() => '');

  // FDISK error pages are recognisable by URL or by a title containing "fehler".
  const isErrorPage =
    landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler');
  if (isErrorPage) {
    log(` → ERROR page: ${landed}`);
    return null;
  }

  // Diagnostic overview of all tables on the page (logged on every call).
  const tableInfo = await frame
    .evaluate(() =>
      Array.from(document.querySelectorAll('table'))
        .map((t, i) => {
          const cls = t.className || '(no class)';
          const id = t.id || '';
          const rowCount = t.querySelectorAll('tr').length;
          return `${i}:cls="${cls}"${id ? ` id="${id}"` : ''} rows=${rowCount}`;
        })
        .join(' | '),
    )
    .catch(() => 'N/A');
  log(` → tables: ${tableInfo}`);

  // Collect rows from ALL tables; runs in the page context.
  const allRows = await frame
    .evaluate(() => {
      const readCell = (td: Element): string => {
        // Inline-edit pages keep the value in a form control.
        const input = td.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
        if (input) return input.value?.trim() ?? '';
        const sel = td.querySelector('select') as HTMLSelectElement | null;
        if (sel) {
          const opt = sel.options[sel.selectedIndex];
          return (opt?.text || opt?.value || '').trim();
        }
        // List tables keep the value in <a title="...">.
        const linkTitle = td.querySelector('a')?.getAttribute('title')?.trim();
        if (linkTitle) return linkTitle;
        return td.textContent?.trim() ?? '';
      };

      const collected: Array<{ cells: string[]; tableClass: string }> = [];
      for (const table of Array.from(document.querySelectorAll('table'))) {
        const tableClass = table.className || '';
        for (const tr of Array.from(table.querySelectorAll('tbody tr, tr'))) {
          // Rows of a nested table are handled when that table is visited.
          if (tr.closest('table') !== table) continue;
          const tds = Array.from(tr.querySelectorAll('td'));
          // Single-cell rows are navigation or header rows.
          if (tds.length < 2) continue;
          collected.push({ tableClass, cells: tds.map(readCell) });
        }
      }
      return collected;
    })
    .catch(() => [] as Array<{ cells: string[]; tableClass: string }>);

  // Rows from FdcLayList tables take precedence when there are any.
  const fdcRows = allRows.filter((r) => r.tableClass.includes('FdcLayList'));
  const candidates = fdcRows.length > 0 ? fdcRows : allRows;

  // A data row starts with a DD.MM.YYYY date.
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  const dataRows = candidates
    .filter((r) => datePattern.test(r.cells[0]?.trim() ?? ''))
    .map((r) => ({ cells: r.cells }));

  log(`${allRows.length} total rows, ${fdcRows.length} FdcLayList rows, ${dataRows.length} data rows (with date in cells[0])`);
  return dataRows;
}
/**
 * Navigate to the Beförderungen sub-page of a member and scrape all promotions.
 *
 * The query string is assembled with URLSearchParams so that the ids (which
 * the caller obtained in decoded form from another URL) are percent-encoded
 * again; for plain alphanumeric ids the resulting URL is unchanged.
 *
 * @param frame - Frame to navigate; it is left on the Beförderungen list page.
 * @param standesbuchNr - Member number written into every result and its syncKey.
 * @param idMitgliedschaft - Value for `id_mitgliedschaften` / `searchid_mitgliedschaften`.
 * @param idPersonen - Value for `id_personen` / `searchid_personen`.
 * @returns One entry per data row (columns used: 0=Datum, 1=Dienstgrad), or an
 *   empty array when the page could not be loaded.
 */
async function scrapeMemberBefoerderungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskBefoerderung[]> {
  // Pairs (not an object) to keep the parameter order the site is known to accept.
  const query = new URLSearchParams([
    ['search', '1'],
    ['searchid_mitgliedschaften', idMitgliedschaft],
    ['id_personen', idPersonen],
    ['id_mitgliedschaften', idMitgliedschaft],
    ['searchid_personen', idPersonen],
    ['searchid_maskmode', ''],
  ]);
  const url = `${BASE_URL}/fdisk/module/mgvw/befoerderungen/befoerderungenList.aspx?${query.toString()}`;

  const rows = await navigateAndGetTableRows(frame, url);
  if (!rows) return [];

  const results: FdiskBefoerderung[] = [];
  for (const row of rows) {
    const datum = parseDate(row.cells[0]);
    // A missing rank is stored as an empty string rather than skipping the row.
    const dienstgrad = cellText(row.cells[1]) ?? '';
    const syncKey = `${standesbuchNr}::${dienstgrad}::${datum ?? ''}`;
    results.push({ standesbuchNr, datum, dienstgrad, syncKey });
  }

  log(` Beförderungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const b of results) log(` ${b.datum ?? '—'} ${b.dienstgrad}`);
  return results;
}
/**
 * Navigate to the Untersuchungen sub-page of a member and scrape all medical exams.
 *
 * The query string is assembled with URLSearchParams so that the ids (which
 * the caller obtained in decoded form from another URL) are percent-encoded
 * again; for plain alphanumeric ids the resulting URL is unchanged.
 *
 * @param frame - Frame to navigate; it is left on the Untersuchungen list page.
 * @param standesbuchNr - Member number written into every result and its syncKey.
 * @param idMitgliedschaft - Value for `id_mitgliedschaften` / `searchid_mitgliedschaften`.
 * @param idPersonen - Value for `id_personen` / `searchid_personen`.
 * @returns One entry per data row that has an Untersuchungsart, or an empty
 *   array when the page could not be loaded.
 */
async function scrapeMemberUntersuchungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskUntersuchung[]> {
  // Pairs (not an object) to keep the parameter order the site is known to accept.
  const query = new URLSearchParams([
    ['search', '1'],
    ['searchid_mitgliedschaften', idMitgliedschaft],
    ['id_personen', idPersonen],
    ['id_mitgliedschaften', idMitgliedschaft],
    ['searchid_personen', idPersonen],
    ['searchid_maskmode', ''],
  ]);
  const url = `${BASE_URL}/fdisk/module/mgvw/untersuchungen/UntersuchungenList.aspx?${query.toString()}`;

  const rows = await navigateAndGetTableRows(frame, url);
  if (!rows) return [];

  const results: FdiskUntersuchung[] = [];
  for (const row of rows) {
    // Columns: 0=Datum, 1=Anmerkungen, 2=Untersuchungsart, 3=Tauglichkeitsstufe
    const art = cellText(row.cells[2]);
    if (!art) continue; // rows without an exam type are skipped
    const datum = parseDate(row.cells[0]);
    const syncKey = `${standesbuchNr}::${art}::${datum ?? ''}`;
    results.push({
      standesbuchNr,
      datum,
      anmerkungen: cellText(row.cells[1]),
      art,
      ergebnis: cellText(row.cells[3]),
      syncKey,
    });
  }

  log(` Untersuchungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const u of results) log(` ${u.datum ?? '—'} [${u.art}] ${u.ergebnis ?? '—'} | ${u.anmerkungen ?? ''}`);
  return results;
}
/**
 * Navigate to the Gesetzliche Fahrgenehmigungen sub-page of a member and scrape all entries.
 *
 * The query string is assembled with URLSearchParams so that the ids (which
 * the caller obtained in decoded form from another URL) are percent-encoded
 * again; for plain alphanumeric ids the resulting URL is unchanged.
 *
 * @param frame - Frame to navigate; it is left on the Fahrgenehmigungen page.
 * @param standesbuchNr - Member number written into every result and its syncKey.
 * @param idMitgliedschaft - Value for `id_mitgliedschaften` / `searchid_mitgliedschaften`.
 * @param idPersonen - Value for `id_personen` / `searchid_personen`.
 * @param idInstanzen - Value for `searchid_instanzen`.
 * @returns One entry per data row that has a Fahrgenehmigungsklasse, or an
 *   empty array when the page could not be loaded.
 */
async function scrapeMemberFahrgenehmigungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
  idInstanzen: string,
): Promise<FdiskFahrgenehmigung[]> {
  // Pairs (not an object) to keep the parameter order the site is known to accept.
  const query = new URLSearchParams([
    ['search', '1'],
    ['searchid_mitgliedschaften', idMitgliedschaft],
    ['id_personen', idPersonen],
    ['id_mitgliedschaften', idMitgliedschaft],
    ['searchid_personen', idPersonen],
    ['searchid_maskmode', ''],
    ['searchid_instanzen', idInstanzen],
  ]);
  const url = `${BASE_URL}/fdisk/module/mgvw/ges_fahrgenehmigungen/Ges_fahrgenehmigungenListEdit.aspx?${query.toString()}`;

  const rows = await navigateAndGetTableRows(frame, url);
  if (!rows) return [];

  const results: FdiskFahrgenehmigung[] = [];
  for (const row of rows) {
    // Columns: 0=Ausstellungsdatum, 1=Gültig bis, 2=Behörde, 3=Nummer, 4=Fahrgenehmigungsklasse
    const klasse = cellText(row.cells[4]);
    if (!klasse) continue; // rows without a licence class are skipped
    const ausstellungsdatum = parseDate(row.cells[0]);
    const syncKey = `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`;
    results.push({
      standesbuchNr,
      ausstellungsdatum,
      gueltigBis: parseDate(row.cells[1]),
      behoerde: cellText(row.cells[2]),
      nummer: cellText(row.cells[3]),
      klasse,
      syncKey,
    });
  }

  log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const f of results) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
  return results;
}
/**
 * Legacy entry point kept for compatibility.
 *
 * Opens the member's detail URL in `frame` and hands over to
 * scrapeAusbildungenFromDetailPage. Members without a detail URL yield an
 * empty list without any navigation.
 */
export async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  const { detailUrl } = member;
  if (detailUrl) {
    await frame_goto(frame, detailUrl);
    return scrapeAusbildungenFromDetailPage(frame, member);
  }
  return [];
}