// Listing metadata (file-viewer residue): 1314 lines, 53 KiB, TypeScript.
import { chromium, Page, Frame } from '@playwright/test';
|
||
import * as fs from 'fs';
|
||
import * as path from 'path';
|
||
import {
|
||
FdiskMember,
|
||
FdiskAusbildung,
|
||
FdiskBefoerderung,
|
||
FdiskUntersuchung,
|
||
FdiskFahrgenehmigung,
|
||
} from './types';
|
||
|
||
// ---------------------------------------------------------------------------
// Runtime configuration. Each value can be overridden through the matching
// FDISK_* environment variable; the literals below are the fallbacks.
// ---------------------------------------------------------------------------

/** Origin of the FDISK web application (no trailing slash). */
const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';

/** Overridable via FDISK_ID_FEUERWEHREN. */
const ID_FEUERWEHREN = process.env.FDISK_ID_FEUERWEHREN ?? '164';

/** Fallback used when a detail URL carries no `id_instanzen` query parameter. */
const ID_INSTANZEN = process.env.FDISK_ID_INSTANZEN ?? '2853';

/** True when FDISK_DEBUG_HTML is exactly "1" or "true"; enables HTML dumps. */
const DEBUG_HTML = ['1', 'true'].includes(process.env.FDISK_DEBUG_HTML ?? '');

/** Login page of the application. */
const LOGIN_URL = BASE_URL + '/fdisk/module/vws/logins/logins.aspx';

/** Member list page of the application. */
const MEMBERS_URL = BASE_URL + '/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx';
|
||
|
||
/**
 * Save the current HTML of `frame` to `<cwd>/debug/<label>.html`.
 *
 * Does nothing unless DEBUG_HTML is enabled. The dump is best-effort: any
 * failure (directory creation, reading the frame, writing the file) is logged
 * and swallowed so that debugging output can never break a scrape.
 *
 * @param frame Frame whose serialized content is written.
 * @param label Basis for the file name; every character outside
 *              `[a-zA-Z0-9_-]` is replaced with `_`.
 */
async function dumpHtml(frame: Frame, label: string): Promise<void> {
  if (!DEBUG_HTML) return;
  try {
    const debugDir = path.resolve(process.cwd(), 'debug');
    fs.mkdirSync(debugDir, { recursive: true });
    const html = await frame.content();
    const safeName = label.replace(/[^a-zA-Z0-9_-]/g, '_');
    const filePath = path.join(debugDir, `${safeName}.html`);
    fs.writeFileSync(filePath, html, 'utf-8');
    // Report the size actually written (UTF-8 bytes), not the UTF-16 length of the string.
    const sizeKb = (Buffer.byteLength(html, 'utf-8') / 1024).toFixed(1);
    log(` [debug] saved HTML → ${filePath} (${sizeKb} KB)`);
  } catch (err: unknown) {
    // Anything can be thrown; only Error instances are guaranteed to have a message.
    const reason = err instanceof Error ? err.message : String(err);
    log(` [debug] failed to save HTML for "${label}": ${reason}`);
  }
}
|
||
|
||
/** Write one line to stdout: the "[scraper]" tag, the current ISO-8601 timestamp, then `msg`. */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log(['[scraper]', timestamp, msg].join(' '));
}
|
||
|
||
/**
 * Convert an FDISK date string (`DD.MM.YYYY`) to ISO format (`YYYY-MM-DD`).
 *
 * Surrounding whitespace is ignored and day/month may be written with one or
 * two digits (`1.2.1999` → `1999-02-01`).
 *
 * @param raw Text as read from the page; may be null/undefined.
 * @returns The ISO date, or null when the input is empty, does not match the
 *          pattern, or names a day that does not exist in the calendar
 *          (e.g. `31.02.2020`, `00.00.0000`).
 */
function parseDate(raw: string | null | undefined): string | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(trimmed);
  if (!match) return null;
  const [, dd = '', mm = '', yyyy = ''] = match;

  const day = Number(dd);
  const month = Number(mm);
  const year = Number(yyyy);

  // Round-trip through a UTC date: out-of-range components roll over into a
  // different day/month/year, which exposes impossible dates.
  // setUTCFullYear is used instead of Date.UTC because the latter remaps years 0–99 to 19xx.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  const isRealDate =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day;
  if (!isRealDate) return null;

  return `${yyyy}-${mm.padStart(2, '0')}-${dd.padStart(2, '0')}`;
}
|
||
|
||
/**
 * Normalize raw cell text: strip surrounding whitespace and map
 * "nothing there" (null, undefined, empty or whitespace-only) to null.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) return null;
  const stripped = text.trim();
  return stripped === '' ? null : stripped;
}
|
||
|
||
/**
 * Fetch only the members we care about instead of scraping the full list.
 *
 * Phase 1: one search per known StNr, with the StNr written into both the
 *          "from" and "to" filter fields (exact match).
 * Phase 2: if `knownNames` is non-empty, one search with the StNr filter
 *          cleared; only the rows of that single result page are inspected,
 *          to pick up members matched by name (first-time linking).
 *
 * Both phases are best-effort with respect to the search form: if
 * `frmsearch` (or, in Phase 1, its StNr fields) is not present, the step is
 * logged and skipped rather than aborting the scrape.
 *
 * @param frame      Frame currently showing the member list with its search form.
 * @param knownStNrs Standesbuch numbers to look up individually.
 * @param knownNames Keys of the form `vorname::zuname`, both lower-cased.
 * @returns Members de-duplicated by StNr; rows lacking StNr, Vorname or Zuname are dropped.
 */
async function scrapeKnownMembers(
  frame: Frame,
  knownStNrs: Set<string>,
  knownNames: Set<string>,
): Promise<FdiskMember[]> {
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];

  const seenStNrs = new Set<string>();
  const allRows: ParsedRow[] = [];

  /** Submit the search form and wait for the resulting navigation to settle. */
  const submitSearch = async (): Promise<void> => {
    // The navigation wait must be registered before submit() fires, hence Promise.all.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
  };

  // --- Phase 1: fetch by exact StNr ---
  log(`scrapeKnownMembers: fetching ${knownStNrs.size} known StNrs`);
  for (const stNr of knownStNrs) {
    const formOk = await frame.evaluate((sn) => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (!fromFld || !toFld) return false;
      fromFld.value = sn;
      toFld.value = sn;
      return true;
    }, stNr);

    if (!formOk) {
      log(` WARN: search form not usable for StNr ${stNr}`);
      continue;
    }

    await submitSearch();

    const rows = await parseRowsFromTable(frame);
    for (const r of rows) {
      if (r.standesbuchNr && !seenStNrs.has(r.standesbuchNr)) {
        seenStNrs.add(r.standesbuchNr);
        allRows.push(r);
      }
    }
    log(` StNr ${stNr}: ${rows.length} row(s)`);

    // Be gentle on the server
    await frame.page().waitForTimeout(300);
  }

  // --- Phase 2: single unfiltered fetch for name-matching ---
  if (knownNames.size > 0) {
    log(`scrapeKnownMembers: unfiltered fetch for ${knownNames.size} name-based matches`);

    // Clear whichever StNr filter fields exist; report whether the form itself is there.
    const formPresent = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (fromFld) fromFld.value = '';
      if (toFld) toFld.value = '';
      return true;
    });

    if (!formPresent) {
      // Submitting a missing form would throw inside the page and discard the
      // Phase-1 results; skip name matching instead, mirroring Phase 1.
      log(` WARN: search form not available — skipping name-based matching`);
    } else {
      await submitSearch();

      const rows = await parseRowsFromTable(frame);
      let matched = 0;
      for (const r of rows) {
        if (!r.standesbuchNr || seenStNrs.has(r.standesbuchNr)) continue;
        const nameKey = `${(r.vorname || '').toLowerCase()}::${(r.zuname || '').toLowerCase()}`;
        if (knownNames.has(nameKey)) {
          seenStNrs.add(r.standesbuchNr);
          allRows.push(r);
          matched++;
        }
      }
      log(` Unfiltered page: ${rows.length} total rows, ${matched} name-matched`);
    }
  }

  log(`scrapeKnownMembers: ${allRows.length} members collected`);

  // Build FdiskMember objects; detail-only fields start as null and are filled in later.
  const members: FdiskMember[] = [];
  for (const row of allRows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      // A parseable Abmeldedatum is the only signal used for "has left".
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
|
||
|
||
/**
 * Log in to FDISK, collect the targeted members and, for each of them, the
 * profile fields plus Ausbildungen, Beförderungen, Untersuchungen and
 * Fahrgenehmigungen.
 *
 * A failure while scraping one member is logged and that member's
 * sub-sections are skipped; the member itself stays in the result. Failures
 * during browser setup, login or the initial member-list navigation reject
 * the returned promise. The browser is always closed before returning.
 *
 * @param username   FDISK login name.
 * @param password   FDISK password.
 * @param knownStNrs Standesbuch numbers to fetch individually.
 * @param knownNames `vorname::zuname` keys (lower-cased) for name-based matching.
 */
export async function scrapeAll(username: string, password: string, knownStNrs: Set<string>, knownNames: Set<string>): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
  befoerderungen: FdiskBefoerderung[];
  untersuchungen: FdiskUntersuchung[];
  fahrgenehmigungen: FdiskFahrgenehmigung[];
}> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });

  try {
    // Context and page are created inside the try so that a failure here
    // still reaches browser.close() and does not leak the browser process.
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    await login(page, username, password);

    // After login, page is on Start.aspx (frameset).
    // Original author's note: direct navigation to MitgliedschaftenList.aspx causes a
    // server BLError because the server reads the org context from session variables.
    // navigateToMemberList therefore loads the member list inside the frameset's
    // existing "mainFrame" frame rather than navigating the top-level page.
    const mainFrame = await navigateToMemberList(page);

    const members = await scrapeKnownMembers(mainFrame, knownStNrs, knownNames);
    log(`Found ${members.length} members (targeted query)`);
    if (DEBUG_HTML) log(`[debug] HTML dump mode ON — saving pages to debug/`);

    const ausbildungen: FdiskAusbildung[] = [];
    const befoerderungen: FdiskBefoerderung[] = [];
    const untersuchungen: FdiskUntersuchung[] = [];
    const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];

    for (const member of members) {
      try {
        // Navigate to member detail page — use direct URL if available, else search+click fallback
        let onDetail: boolean;
        if (member.detailUrl) {
          await frame_goto(mainFrame, member.detailUrl);
          onDetail = true;
        } else {
          onDetail = await navigateToMemberDetailBySearch(mainFrame, member.standesbuchNr);
        }

        if (!onDetail) {
          log(` SKIP ${member.vorname} ${member.zuname} (${member.standesbuchNr}): could not reach detail page`);
          continue;
        }

        // Scrape extra profile fields from the detail form
        const profileFields = await scrapeDetailProfileFields(mainFrame);
        member.geburtsort = profileFields.geburtsort;
        member.geschlecht = profileFields.geschlecht;
        member.beruf = profileFields.beruf;
        member.wohnort = profileFields.wohnort;
        member.plz = profileFields.plz;

        // Debug: dump the member detail page
        await dumpHtml(mainFrame, `detail_StNr${member.standesbuchNr}`);

        // Read the ids from the detail URL before any sub-section navigation changes it.
        // URL pattern: ?search=1&searchid_mitgliedschaften=X&id_personen=Y&id_mitgliedschaften=X&searchid_personen=Y&searchid_maskmode=
        const urlObj = new URL(mainFrame.url());
        const idMitgliedschaft = urlObj.searchParams.get('id_mitgliedschaften');
        const idPersonen = urlObj.searchParams.get('id_personen');
        const idInstanzen = urlObj.searchParams.get('id_instanzen') ?? ID_INSTANZEN;

        // Ausbildungen (handles missing ids itself)
        const quals = await scrapeAusbildungenFromDetailPage(mainFrame, member, idMitgliedschaft, idPersonen);
        ausbildungen.push(...quals);

        // Beförderungen
        const befos = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberBefoerderungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen)
          : [];
        befoerderungen.push(...befos);

        // Untersuchungen
        const unters = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberUntersuchungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen)
          : [];
        untersuchungen.push(...unters);

        // Fahrgenehmigungen
        const fahrg = (idMitgliedschaft && idPersonen)
          ? await scrapeMemberFahrgenehmigungen(mainFrame, member.standesbuchNr, idMitgliedschaft, idPersonen, idInstanzen)
          : [];
        fahrgenehmigungen.push(...fahrg);

        log(` ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen, ${befos.length} Beförderungen, ${unters.length} Untersuchungen, ${fahrg.length} Fahrgenehmigungen`);
        // Short pause between members to limit load on the server.
        await page.waitForTimeout(500);
      } catch (err) {
        // One member failing must not abort the whole run.
        log(` WARN: could not scrape detail for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen, befoerderungen, untersuchungen, fahrgenehmigungen };
  } finally {
    await browser.close();
  }
}
|
||
|
||
/**
 * Point `frame` at `url` and resolve once the network has gone idle.
 * Exists so callers do not have to repeat the wait option.
 */
async function frame_goto(frame: Frame, url: string): Promise<void> {
  const navigationOptions = { waitUntil: 'networkidle' as const };
  await frame.goto(url, navigationOptions);
}
|
||
|
||
/**
 * Open the FDISK login page and sign in with the given credentials.
 *
 * If loading LOGIN_URL ends on a URL that does not contain "login"
 * (case-insensitive), the session is treated as already authenticated and
 * nothing is submitted.
 *
 * @throws Error when the page URL still contains "login" after submitting.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const looksLikeLoginPage = (address: string): boolean => address.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // A redirect away from the login page means an existing session is active.
  const landedOn = page.url();
  if (!looksLikeLoginPage(landedOn)) {
    log(`Already logged in, on: ${landedOn}`);
    return;
  }

  // Element ids taken from the known login form markup.
  const userInput = page.locator('#login');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await page.locator('#password').fill(password);
  await page.locator('#Submit2').click();

  // Give the post-login redirect up to 15s. A failure here is deliberately
  // ignored: the URL check below is the single source of truth.
  try {
    await page.waitForURL(
      (target) => !looksLikeLoginPage(target.toString()),
      { waitUntil: 'networkidle', timeout: 15000 },
    );
  } catch {
    // fall through to the verification below
  }

  const finalUrl = page.url();
  if (looksLikeLoginPage(finalUrl)) {
    throw new Error(`Login failed — still on login page: ${finalUrl}`);
  }
  log(`Logged in successfully, redirected to: ${finalUrl}`);
}
|
||
|
||
/**
 * Reach a member's detail page without a direct URL.
 *
 * Steps: load the member list, put `standesbuchNr` into both StNr filter
 * fields, submit the search, then click the first element matched in the
 * first result row.
 *
 * @returns true when, after the click, the frame URL contains neither
 *          "MitgliedschaftenList" nor "meine_Mitglieder"; false when the
 *          search form is unusable, no row is found, or the frame is still
 *          on a list page.
 */
async function navigateToMemberDetailBySearch(frame: Frame, standesbuchNr: string): Promise<boolean> {
  await frame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await frame.waitForLoadState('networkidle');

  // Exact match: the same number goes into the "from" and the "to" field.
  const filterApplied = await frame.evaluate((wanted) => {
    const searchForm = (document as any).forms['frmsearch'];
    if (!searchForm) return false;
    const lowerBound = searchForm.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
    const upperBound = searchForm.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
    if (lowerBound && upperBound) {
      lowerBound.value = wanted;
      upperBound.value = wanted;
      return true;
    }
    return false;
  }, standesbuchNr);

  if (!filterApplied) {
    log(` WARN navigateToMemberDetailBySearch: search form not usable for StNr ${standesbuchNr}`);
    return false;
  }

  // Start waiting for the navigation before the submit triggers it.
  await Promise.all([
    frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
    frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
  ]);

  // Per the original author, FDISK rows open the detail page when clicked.
  const firstRowSelector = 'table.FdcLayList tbody tr:first-child a, table.FdcLayList tbody tr:first-child td';
  const clickTarget = await frame.$(firstRowSelector);
  if (!clickTarget) {
    log(` WARN navigateToMemberDetailBySearch: no result row for StNr ${standesbuchNr}`);
    return false;
  }

  try {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }),
      clickTarget.click(),
    ]);
  } catch {
    // The wait may time out when the click does not navigate (e.g. onclick vs href);
    // the URL inspection below decides the outcome either way.
  }

  const landedUrl = frame.url();
  const stillOnList = landedUrl.includes('MitgliedschaftenList') || landedUrl.includes('meine_Mitglieder');
  if (stillOnList) {
    log(` WARN navigateToMemberDetailBySearch: still on list page after click for StNr ${standesbuchNr}`);
    return false;
  }
  log(` Navigated to detail via search+click: ${landedUrl}`);
  return true;
}
|
||
|
||
/**
 * Load the member list inside the frameset's "mainFrame" frame.
 *
 * @returns The frame, now showing MEMBERS_URL.
 * @throws Error when no frame named "mainFrame" exists, or when the frame
 *         ends up on a URL containing "BLError", "support.aspx" or "Error".
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const contentFrame = page.frame({ name: 'mainFrame' });
  if (contentFrame === null) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await contentFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await contentFrame.waitForLoadState('networkidle');

  const landedUrl = contentFrame.url();
  const pageTitle = await contentFrame.title();
  log(`mainFrame loaded: ${landedUrl} — title: "${pageTitle}"`);

  // URL fragments that indicate the server answered with an error page.
  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  if (errorMarkers.some((marker) => landedUrl.includes(marker))) {
    throw new Error(`Member list returned error page: ${landedUrl}`);
  }

  return contentFrame;
}
|
||
|
||
/**
 * Scrape the full member list shown in `frame`.
 *
 * 1. If the search form exists: clear every visible field whose name/id
 *    contains "standesbuch", push any page-size dropdown to its largest
 *    option (including the paired hidden input of the custom widget), and
 *    submit.
 * 2. Parse the first result page and read "a-b von total" from the pager.
 * 3. If the pager reports more members than shown, walk the StNr space in
 *    fixed-size ranges until all members are collected, MAX_STNR is reached,
 *    or five consecutive ranges turn out to be genuinely empty.
 *
 * A range counts as empty only when it yields neither a new member nor any
 * row whose StNr lies inside the queried range. Ranges populated solely by
 * members already seen in the initial fetch therefore do not end the scan
 * early.
 *
 * @returns One FdiskMember per row that has StNr, Vorname and Zuname;
 *          detail-only fields are null.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // Original author's note: FDISK pre-fills the logged-in user's own Standesbuchnummer,
  // which limits results to 1 member. The filter is cleared before submitting.
  const hasForm = await frame.$('form[name="frmsearch"]') !== null;
  if (hasForm) {
    const fieldDump = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return { cleared: [], pageSizeSet: null as string | null, allFields: [] };
      const cleared: string[] = [];
      const allFields: string[] = [];
      let pageSizeSet: string | null = null;
      for (const el of Array.from(form.elements) as HTMLInputElement[]) {
        if (el.type === 'hidden') continue;
        const name = (el.name ?? '').toLowerCase();
        const id = (el.id ?? '').toLowerCase();
        // Record non-empty fields (before clearing) for the diagnostic log line.
        if (el.value) allFields.push(`${el.name || el.id}=${el.value}`);
        if (name.includes('standesbuch') || id.includes('standesbuch')) {
          el.value = '';
          cleared.push(el.name || el.id);
        }
        // Maximize page size: look for a select AND its paired hidden input.
        // Original author's note: FDISK uses a custom Dd widget where <select name="xDd_dd">
        // is the visible dropdown but the POST value comes from a hidden input such as "xDd_id".
        if ((name.includes('anzahl') || id.includes('anzahl') ||
             name.includes('pagesize') || id.includes('pagesize') ||
             name.includes('rows') || id.includes('rows')) &&
            el.tagName === 'SELECT') {
          const select = el as unknown as HTMLSelectElement;
          // Pick the largest numeric option value, or the last option as fallback
          let bestOption: HTMLOptionElement | null = null;
          let bestVal = -1;
          for (const opt of Array.from(select.options)) {
            const n = parseInt(opt.value, 10);
            if (!isNaN(n) && n > bestVal) { bestVal = n; bestOption = opt; }
          }
          if (!bestOption && select.options.length > 0) {
            bestOption = select.options[select.options.length - 1];
          }
          if (bestOption) {
            select.value = bestOption.value;
            pageSizeSet = `${el.name || el.id}=${bestOption.value}`;
            // Also update the paired hidden field used by the Dd custom widget.
            // Patterns tried: xDd_dd → xDd_id, xDd_hd, xDd_val
            const baseName = (el.name || el.id).replace(/_dd$/i, '');
            for (const suffix of ['_id', '_hd', '_val']) {
              const hidden = form.elements[baseName + suffix] as HTMLInputElement | undefined;
              if (hidden && hidden.type === 'hidden') {
                hidden.value = bestOption.value;
                pageSizeSet += ` (also set ${baseName + suffix})`;
              }
            }
          }
        }
      }
      return { cleared, pageSizeSet, allFields };
    });

    if (fieldDump.allFields.length > 0) {
      log(`Search form active filters before clear: ${fieldDump.allFields.join(', ')}`);
    }
    if (fieldDump.cleared.length > 0) {
      log(`Cleared Standesbuchnummer filter fields: ${fieldDump.cleared.join(', ')}`);
    } else {
      log('Search form found — no Standesbuchnummer field detected, submitting as-is');
    }
    if (fieldDump.pageSizeSet) {
      log(`Set page size: ${fieldDump.pageSizeSet}`);
    } else {
      log('No page size field found — will paginate through all results');
    }

    // Start waiting for the navigation BEFORE triggering the submit; otherwise the
    // wait could resolve against the already-idle current page.
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
    log(`After form submit: ${frame.url()}`);
  }

  // --- Phase 1: initial fetch (no StNr filter) to get the first batch and total count ---
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];

  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
  const firstRows = await parseRowsFromTable(frame);
  log(`Initial fetch: ${firstRows.length} rows`);

  // parseRowsFromTable leaves href diagnostics for the first row on `window`.
  const rowDebug = await frame.evaluate(() => (window as any).__fdiskFirstRowDebug ?? 'no debug info');
  log(`Row href debug: ${rowDebug}`);

  for (const row of firstRows) {
    log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
  }

  const pagination = await frame.evaluate(() =>
    document.querySelector('table.FdcLayListNav')?.textContent?.trim() ?? ''
  );
  log(`Pagination: "${pagination}"`);

  // Pager text looks like "1-20 von 137": group 2 = last row shown, group 3 = total.
  const pagMatch = pagination.match(/(\d+)-(\d+)\s+von\s+(\d+)/i);
  const totalExpected = pagMatch ? parseInt(pagMatch[3], 10) : null;
  const shownSoFar = pagMatch ? parseInt(pagMatch[2], 10) : null;

  const seenStNrs = new Set<string>(firstRows.map(r => r.standesbuchNr).filter(Boolean));
  const allRows: ParsedRow[] = [...firstRows];

  // --- Phase 2: if more members exist and pagination is disabled, use StNr range queries ---
  if (totalExpected && shownSoFar && shownSoFar < totalExpected) {
    log(`Pagination disabled (FDISK limitation). Switching to StNr range queries to fetch remaining ${totalExpected - seenStNrs.size} members.`);

    const BATCH = 15;       // fetch 15 StNr slots at a time — safely under the 20-row page limit
    const MAX_STNR = 9999;  // upper bound; we stop earlier if we have all members
    let startNr = 1;
    let consecutiveEmpty = 0;

    while (seenStNrs.size < totalExpected && startNr <= MAX_STNR && consecutiveEmpty < 5) {
      const rangeStart = startNr;
      const endNr = rangeStart + BATCH - 1;

      // Set StNr range in the search form and submit
      const formOk = await frame.evaluate(({ s, e }: { s: number; e: number }) => {
        const form = (document as any).forms['frmsearch'];
        if (!form) return false;
        const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement;
        const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement;
        if (!fromFld || !toFld) return false;
        fromFld.value = String(s);
        toFld.value = String(e);
        return true;
      }, { s: rangeStart, e: endNr });

      if (!formOk) {
        log('WARN: could not set StNr range fields — aborting range queries');
        break;
      }

      await Promise.all([
        frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
        frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
      ]);

      const rangeRows = await parseRowsFromTable(frame);
      const newRows = rangeRows.filter(r => r.standesbuchNr && !seenStNrs.has(r.standesbuchNr));
      newRows.forEach(r => { if (r.standesbuchNr) seenStNrs.add(r.standesbuchNr); });
      allRows.push(...newRows);

      log(`StNr ${rangeStart}–${endNr}: ${newRows.length} new members (collected ${seenStNrs.size}/${totalExpected})`);
      for (const row of newRows) {
        log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
      }

      // A range that only returned already-known members is populated, not empty.
      // Requiring the StNr to lie inside the queried range keeps the stop condition
      // effective even if the server were to ignore the filter.
      const rangeIsPopulated = rangeRows.some(r => {
        const n = parseInt(r.standesbuchNr, 10);
        return !isNaN(n) && n >= rangeStart && n <= endNr;
      });
      consecutiveEmpty = (newRows.length > 0 || rangeIsPopulated) ? 0 : consecutiveEmpty + 1;
      startNr = endNr + 1;
    }

    log(`Range queries complete: ${seenStNrs.size} unique members collected (expected ${totalExpected})`);
  }

  log(`Parsed ${allRows.length} raw rows total`);

  const members: FdiskMember[] = [];
  for (const row of allRows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      // A parseable Abmeldedatum is the only signal used for "has left".
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
|
||
|
||
/**
 * Read every `<tr>` under `table.FdcLayList tbody` into a plain record.
 *
 * Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
 * 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum,
 * 9=Abmeldedatum, 10=icon.
 *
 * A cell's value is the `title` of its first `<a>` when that is non-empty,
 * otherwise the cell's text content; both trimmed. Missing cells yield "".
 *
 * `href` is the row's detail URL, or null when none is found. Lookup order:
 *   1. the first `<a>` whose href attribute is not empty, "#", or "javascript:…";
 *   2. the first `.aspx` URL quoted inside an `onclick` attribute on the row,
 *      its anchors or its cells.
 *
 * Side effect (in the page): for row 0, when it has a St.-Nr., a diagnostic
 * string is stored on `window.__fdiskFirstRowDebug`.
 */
async function parseRowsFromTable(frame: Frame) {
  return frame.$$eval('table.FdcLayList tbody tr', (rowEls) =>
    rowEls.map((rowEl, position) => {
      const tds = Array.from(rowEl.querySelectorAll('td'));

      const cellValue = (col: number) => {
        const td = tds[col];
        const tooltip = td?.querySelector('a')?.getAttribute('title')?.trim();
        // Prefer the tooltip; an empty one falls back to the visible text.
        return (tooltip || td?.textContent || '').trim();
      };

      let href: string | null = null;
      let debugInfo = '';

      // Strategy 1: a real link target on one of the row's anchors.
      const anchors = Array.from(rowEl.querySelectorAll('a'));
      for (const anchor of anchors) {
        const attr = (anchor as Element).getAttribute('href') ?? '';
        debugInfo += `a.href="${attr}" `;
        const isPlaceholder = attr === '' || attr === '#' || attr.startsWith('javascript:');
        if (isPlaceholder) continue;
        href = (anchor as HTMLAnchorElement).href; // property form is already absolute
        break;
      }

      // Strategy 2: an .aspx URL embedded in an onclick handler.
      if (!href) {
        const clickable: Element[] = [rowEl, ...Array.from(rowEl.querySelectorAll('a, td'))];
        for (const node of clickable) {
          const handler = node.getAttribute('onclick') ?? '';
          if (handler) debugInfo += `onclick="${handler}" `;
          const found = handler.match(/['"]([^'"]*\.aspx[^'"]*)['"]/);
          if (!found) continue;
          try {
            href = new URL(found[1], (window as Window).location.href).href;
          } catch {
            // Not resolvable against the page URL — keep the raw string.
            href = found[1];
          }
          break;
        }
      }

      // Leave a breadcrumb for diagnosing href extraction on the first data row.
      if (position === 0 && cellValue(2)) {
        (window as any).__fdiskFirstRowDebug = `StNr=${cellValue(2)} href=${href} debug=${debugInfo}`;
      }

      return {
        status: cellValue(1),
        standesbuchNr: cellValue(2),
        dienstgrad: cellValue(3),
        vorname: cellValue(4),
        zuname: cellValue(5),
        geburtsdatum: cellValue(6),
        svnr: cellValue(7),
        eintrittsdatum: cellValue(8),
        abmeldedatum: cellValue(9),
        href,
      };
    }),
  );
}
|
||
|
||
/**
 * Read the extra profile fields from the form currently shown in `frame`.
 *
 * Each field is looked up through a short list of selectors, tried in
 * order; the first selector that yields a non-empty value wins. A field is
 * null when no selector matches or every match is empty.
 *
 * For `<select>` elements the selected option's text is used (its value as
 * fallback); for inputs the trimmed `value`.
 */
async function scrapeDetailProfileFields(frame: Frame): Promise<{
  geburtsort: string | null;
  geschlecht: string | null;
  beruf: string | null;
  wohnort: string | null;
  plz: string | null;
}> {
  return frame.evaluate(() => {
    // Value of the first element matching `selector`, or null if absent/empty.
    const readField = (selector: string): string | null => {
      const el = document.querySelector(selector) as HTMLInputElement | HTMLSelectElement | null;
      if (!el) return null;
      if (el.tagName !== 'SELECT') {
        return (el as HTMLInputElement).value?.trim() || null;
      }
      const dropdown = el as HTMLSelectElement;
      const chosen = dropdown.options[dropdown.selectedIndex];
      if (!chosen) return null;
      return (chosen.text || chosen.value || '').trim() || null;
    };

    // First non-null result among the selectors; later ones are not queried.
    const firstOf = (...selectors: string[]): string | null => {
      for (const selector of selectors) {
        const found = readField(selector);
        if (found !== null) return found;
      }
      return null;
    };

    return {
      geburtsort: firstOf('input[name="geburtsort"]', 'input[id*="geburtsort"]'),
      geschlecht: firstOf('select[name*="geschlecht"]', 'select[id*="geschlecht"]'),
      beruf: firstOf('input[name="beruf"]', 'input[id*="beruf"]'),
      wohnort: firstOf('input[name="ort"]', 'input[id*="_ort"]', 'input[name="wohnort"]'),
      plz: firstOf('input[name="plz"]', 'input[id*="plz"]'),
    };
  });
}
|
||
|
||
/**
|
||
* Scrape Ausbildungen by navigating to the AusbildungenListEdit.aspx page.
|
||
* This is a ListEdit page (like Fahrgenehmigungen) with <input>/<select> elements.
|
||
*/
|
||
async function scrapeAusbildungenFromDetailPage(
|
||
frame: Frame,
|
||
member: FdiskMember,
|
||
idMitgliedschaft?: string | null,
|
||
idPersonen?: string | null,
|
||
): Promise<FdiskAusbildung[]> {
|
||
// If we don't have the IDs, we cannot navigate to the Ausbildungen page
|
||
if (!idMitgliedschaft || !idPersonen) {
|
||
log(` Ausbildungen for StNr ${member.standesbuchNr}: missing mitgliedschaft/personen IDs, skipping`);
|
||
return [];
|
||
}
|
||
|
||
const url = `${BASE_URL}/fdisk/module/mgvw/ausbildungen/AusbildungenListEdit.aspx`
|
||
+ `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
|
||
+ `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`;
|
||
|
||
await frame_goto(frame, url);
|
||
|
||
const landed = frame.url();
|
||
const title = await frame.title().catch(() => '');
|
||
if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
|
||
log(` → Ausbildungen ERROR page: ${landed}`);
|
||
return [];
|
||
}
|
||
|
||
// Dump HTML for debugging
|
||
await dumpHtml(frame, `ausbildungen_StNr${member.standesbuchNr}`);
|
||
|
||
// This is a ListEdit page — read form fields by ID pattern or parse table with extractCellValue
|
||
const ausbildungen = await frame.evaluate((stNr: string) => {
|
||
const extractCellValue = (cell: Element): string => {
|
||
const input = cell.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
|
||
if (input && input.value?.trim()) return input.value.trim();
|
||
const sel = cell.querySelector('select') as HTMLSelectElement | null;
|
||
if (sel) {
|
||
// Try selectedIndex first
|
||
const idx = sel.selectedIndex;
|
||
if (idx >= 0 && sel.options[idx]) {
|
||
const t = (sel.options[idx].text || sel.options[idx].value || '').trim();
|
||
if (t) return t;
|
||
}
|
||
// Fallback: read the selected attribute directly from HTML
|
||
const selectedOpt = sel.querySelector('option[selected]') as HTMLOptionElement | null;
|
||
if (selectedOpt) {
|
||
const t = (selectedOpt.text || selectedOpt.value || '').trim();
|
||
if (t) return t;
|
||
}
|
||
}
|
||
const anchor = cell.querySelector('a');
|
||
const atitle = anchor?.getAttribute('title')?.trim();
|
||
if (atitle) return atitle;
|
||
return cell.textContent?.replace(/\u00A0/g, ' ').trim() ?? '';
|
||
};
|
||
|
||
const results: Array<{
|
||
standesbuchNr: string;
|
||
kursname: string | null;
|
||
kursDatum: string | null;
|
||
ablaufdatum: string | null;
|
||
ort: string | null;
|
||
bemerkung: string | null;
|
||
syncKey: string;
|
||
}> = [];
|
||
|
||
// Collect rows from all tables, find the data table
|
||
const tables = Array.from(document.querySelectorAll('table'));
|
||
let bestRows: Array<{ cells: string[] }> = [];
|
||
let bestHeaders: string[] = [];
|
||
|
||
for (const table of tables) {
|
||
const rows: Array<{ cells: string[] }> = [];
|
||
const headerCells: string[] = [];
|
||
|
||
// Get headers
|
||
for (const th of Array.from(table.querySelectorAll('thead th, tr:first-child th'))) {
|
||
headerCells.push(extractCellValue(th));
|
||
}
|
||
|
||
// Get data rows
|
||
for (const tr of Array.from(table.querySelectorAll('tr'))) {
|
||
if (tr.closest('table') !== table) continue;
|
||
const tds = Array.from(tr.querySelectorAll('td'));
|
||
if (tds.length < 2) continue;
|
||
if (tr.querySelectorAll('th').length > 0) continue;
|
||
rows.push({ cells: tds.map(td => extractCellValue(td)) });
|
||
}
|
||
|
||
if (rows.length > bestRows.length) {
|
||
bestRows = rows;
|
||
bestHeaders = headerCells;
|
||
}
|
||
}
|
||
|
||
if (bestRows.length === 0) return results;
|
||
|
||
// Try to find column indices from headers
|
||
const hdr = bestHeaders.map(h => h.toLowerCase());
|
||
let kursnameIdx = hdr.findIndex(h => h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung'));
|
||
let datumIdx = hdr.findIndex(h => h.includes('datum') || h.includes('abschluss'));
|
||
let ablaufIdx = hdr.findIndex(h => h.includes('ablauf') || h.includes('gültig'));
|
||
let ortIdx = hdr.findIndex(h => h.includes('ort'));
|
||
let bemIdx = hdr.findIndex(h => h.includes('bem') || h.includes('info'));
|
||
|
||
// If headers didn't help, scan data for date-like columns and text columns
|
||
if (kursnameIdx === -1 && bestRows.length > 0) {
|
||
const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
|
||
// Find date columns
|
||
const dateCols = new Set<number>();
|
||
const textCols: number[] = [];
|
||
for (const row of bestRows.slice(0, 3)) {
|
||
for (let ci = 0; ci < row.cells.length; ci++) {
|
||
const v = row.cells[ci]?.trim();
|
||
if (!v) continue;
|
||
if (datePattern.test(v)) dateCols.add(ci);
|
||
else if (v.length > 2 && !/^[\d.,]+$/.test(v)) textCols.push(ci);
|
||
}
|
||
}
|
||
// The longest text column is likely the Kursname
|
||
if (textCols.length > 0) {
|
||
let maxLen = 0;
|
||
for (const ci of textCols) {
|
||
const len = (bestRows[0]?.cells[ci] ?? '').length;
|
||
if (len > maxLen) { maxLen = len; kursnameIdx = ci; }
|
||
}
|
||
}
|
||
// First date column is Datum, second is Ablaufdatum
|
||
const sortedDates = Array.from(dateCols).sort((a, b) => a - b);
|
||
if (sortedDates.length > 0 && datumIdx === -1) datumIdx = sortedDates[0];
|
||
if (sortedDates.length > 1 && ablaufIdx === -1) ablaufIdx = sortedDates[1];
|
||
}
|
||
|
||
for (const row of bestRows) {
|
||
const kursname = ((kursnameIdx >= 0 ? row.cells[kursnameIdx] : row.cells[0])?.trim()) || '';
|
||
if (!kursname) continue;
|
||
// Skip header-like rows
|
||
if (/kurs|ausbildung|bezeichnung|datensätze|tiefennavigation/i.test(kursname)) continue;
|
||
|
||
const rawDatum = datumIdx >= 0 ? row.cells[datumIdx]?.trim() : null;
|
||
const rawAblauf = ablaufIdx >= 0 ? row.cells[ablaufIdx]?.trim() : null;
|
||
const rawOrt = ortIdx >= 0 ? row.cells[ortIdx]?.trim() || null : null;
|
||
const rawBem = bemIdx >= 0 ? row.cells[bemIdx]?.trim() || null : null;
|
||
|
||
// parseDate is not available inside evaluate; return raw values
|
||
results.push({
|
||
standesbuchNr: stNr,
|
||
kursname,
|
||
kursDatum: rawDatum || null,
|
||
ablaufdatum: rawAblauf || null,
|
||
ort: rawOrt,
|
||
bemerkung: rawBem,
|
||
syncKey: `${stNr}::${kursname}::${rawDatum ?? ''}`,
|
||
});
|
||
}
|
||
|
||
return results;
|
||
}, member.standesbuchNr).catch(() => [] as FdiskAusbildung[]);
|
||
|
||
// Post-process: parse dates and rebuild syncKeys
|
||
const results: FdiskAusbildung[] = ausbildungen.filter(a => !!a.kursname).map(a => {
|
||
const kursDatum = parseDate(a.kursDatum);
|
||
return {
|
||
standesbuchNr: a.standesbuchNr,
|
||
kursname: a.kursname as string,
|
||
kursDatum,
|
||
ablaufdatum: parseDate(a.ablaufdatum),
|
||
ort: a.ort,
|
||
bemerkung: a.bemerkung,
|
||
syncKey: `${a.standesbuchNr}::${a.kursname}::${kursDatum ?? ''}`,
|
||
};
|
||
});
|
||
|
||
// Debug: dump HTML when no Ausbildungen found
|
||
if (results.length === 0) {
|
||
await dumpHtml(frame, `ausbildungen_empty_StNr${member.standesbuchNr}`);
|
||
}
|
||
|
||
return results;
|
||
}
|
||
|
||
/**
 * Navigate the frame to a sub-section list page and extract its dated data rows.
 *
 * Processing steps:
 *  1. Navigate via `frame_goto`; return `null` when the frame landed on an FDISK
 *     error page (URL contains `BLError` / `support.aspx`, or the title contains
 *     "fehler").
 *  2. Log a one-line summary (class, id, row count) of every `<table>` on the page.
 *  3. Collect every row with at least two `<td>` cells from all tables. Per cell the
 *     value is taken from a text input, else the selected `<option>`, else the
 *     `title` of the first `<a>`, else the cell's text content.
 *  4. Prefer rows from tables whose class contains `FdcLayList`; fall back to all
 *     rows when no such table yielded rows.
 *  5. Replace non-breaking spaces, locate the first column holding a `DD.MM.YYYY`
 *     value, and keep only the rows that have a date in that column.
 *
 * When no data rows remain, the frame HTML is handed to `dumpHtml` for debugging.
 *
 * @param frame Frame to navigate and read from.
 * @param url   Absolute URL of the list page.
 * @returns The dated rows plus the index of the date column (`-1` with an empty
 *          `rows` array when no date column was found), or `null` on an error page.
 */
async function navigateAndGetTableRows(
  frame: Frame,
  url: string,
): Promise<{ rows: Array<{ cells: string[] }>; dateColIdx: number } | null> {
  await frame_goto(frame, url);

  const landed = frame.url();
  const title = await frame.title().catch(() => '');

  // Check for FDISK error pages
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → ERROR page: ${landed}`);
    return null;
  }

  // Log all table classes on the page for diagnostics (purely informational,
  // so a failure here is reduced to "N/A" instead of aborting the scrape).
  const tableInfo = await frame.evaluate(() => {
    return Array.from(document.querySelectorAll('table')).map((t, i) => {
      const cls = t.className || '(no class)';
      const id = t.id || '';
      const rowCount = t.querySelectorAll('tr').length;
      return `${i}:cls="${cls}"${id ? ` id="${id}"` : ''} rows=${rowCount}`;
    }).join(' | ');
  }).catch(() => 'N/A');
  log(` → tables: ${tableInfo}`);

  // Collect rows from ALL tables, reading input/select values for inline-edit pages.
  // A failing evaluate is logged: otherwise "page has no rows" and "extraction
  // crashed" would be indistinguishable in the scraper output.
  const allRows = await frame.evaluate(() => {
    const results: Array<{ cells: string[]; tableClass: string }> = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      for (const tr of Array.from(table.querySelectorAll('tr'))) {
        // Skip rows that are nested inside a child table
        if (tr.closest('table') !== table) continue;
        // NOTE(review): this also matches <td> elements of tables nested inside
        // this row's cells — confirm whether FDISK list cells ever nest tables.
        const tds = Array.from(tr.querySelectorAll('td'));
        if (tds.length < 2) continue; // skip single-cell nav/header rows
        results.push({
          tableClass: cls,
          cells: tds.map(td => {
            const input = td.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
            if (input) return input.value?.trim() ?? '';
            const sel = td.querySelector('select') as HTMLSelectElement | null;
            if (sel) {
              // selectedIndex is -1 when nothing is selected → opt is undefined → ''
              const opt = sel.options[sel.selectedIndex];
              return (opt?.text || opt?.value || '').trim();
            }
            // For FDISK list tables, the value is in <a title="..."> inside each cell
            const anchor = td.querySelector('a');
            const atitle = anchor?.getAttribute('title')?.trim();
            if (atitle) return atitle;
            return td.textContent?.trim() ?? '';
          }),
        });
      }
    }
    return results;
  }).catch((err: unknown) => {
    const reason = err instanceof Error ? err.message : String(err);
    log(` → table row extraction failed: ${reason}`);
    return [] as Array<{ cells: string[]; tableClass: string }>;
  });

  // Prefer rows from FdcLayList-class tables
  const fdcRows = allRows.filter(r => r.tableClass.includes('FdcLayList'));
  const resultRows = fdcRows.length > 0 ? fdcRows : allRows;

  // Strip \u00A0 (non-breaking space) from all cell values and trim
  const mapped = resultRows.map(r => ({
    cells: r.cells.map(c => c.replace(/\u00A0/g, ' ').trim()),
  }));

  // Find date column dynamically: the first cell (in row order, then column order)
  // that looks like DD.MM.YYYY decides which column is treated as the date column.
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  let dateColIdx = -1;
  for (const r of mapped) {
    dateColIdx = r.cells.findIndex(c => datePattern.test(c ?? ''));
    if (dateColIdx >= 0) break;
  }

  // Only rows carrying a date in that column count as data rows
  const dataRows = dateColIdx >= 0
    ? mapped.filter(r => datePattern.test(r.cells[dateColIdx] ?? ''))
    : [];

  log(` → ${allRows.length} total rows, ${fdcRows.length} FdcLayList rows, ${dataRows.length} data rows (date in col ${dateColIdx})`);

  // Debug: dump HTML when no data rows found
  if (dataRows.length === 0) {
    const urlSlug = url.split('/').pop()?.split('?')[0] ?? 'unknown';
    await dumpHtml(frame, `navigateAndGetTableRows_${urlSlug}`);
  }

  return { rows: dataRows, dateColIdx };
}
|
||
|
||
/**
 * Navigate to the Beförderungen sub-page and scrape all promotions of one member.
 *
 * The rows come from `navigateAndGetTableRows`, i.e. each row has a DD.MM.YYYY
 * value in column `dateColIdx`. The Dienstgrad is taken from the first column
 * after the date for which `cellText` yields a truthy value.
 *
 * Rows without any Dienstgrad are skipped (and logged): they would otherwise be
 * emitted with an empty rank and a degenerate sync key (`StNr::::date`), which is
 * also how the sibling scrapers treat rows lacking their key field.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number; copied into every result and its sync key.
 * @param idMitgliedschaft FDISK membership id used in the query string.
 * @param idPersonen       FDISK person id used in the query string.
 * @returns One entry per promotion row; empty when the page is an error page or
 *          contains no dated rows.
 */
async function scrapeMemberBefoerderungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskBefoerderung[]> {
  const url = `${BASE_URL}/fdisk/module/mgvw/befoerderungen/befoerderungenList.aspx`
    + `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`;

  const result = await navigateAndGetTableRows(frame, url);
  if (!result) return [];

  const { rows, dateColIdx } = result;
  const results: FdiskBefoerderung[] = [];
  for (const row of rows) {
    // The next non-empty column after the date holds the Dienstgrad
    let dienstgrad = '';
    for (let ci = dateColIdx + 1; ci < row.cells.length; ci++) {
      const v = cellText(row.cells[ci]);
      if (v) { dienstgrad = v; break; }
    }
    if (!dienstgrad) {
      // A promotion without a rank carries no information and cannot be keyed uniquely
      log(` Beförderungen for StNr ${standesbuchNr}: skipping row without Dienstgrad (date "${row.cells[dateColIdx] ?? ''}")`);
      continue;
    }

    const datum = parseDate(row.cells[dateColIdx]);
    const syncKey = `${standesbuchNr}::${dienstgrad}::${datum ?? ''}`;
    results.push({ standesbuchNr, datum, dienstgrad, syncKey });
  }

  log(` Beförderungen for StNr ${standesbuchNr}: ${results.length} rows`);
  for (const b of results) log(` ${b.datum ?? '—'} ${b.dienstgrad}`);
  return results;
}
|
||
|
||
/**
 * Scrape the medical examinations (Untersuchungen) list page of one member.
 *
 * For every dated row delivered by `navigateAndGetTableRows`, the cells to the
 * right of the date column are passed through `cellText`; the values that are
 * not `null` are then assigned by position: 1st → Anmerkungen,
 * 2nd → Untersuchungsart, 3rd → Tauglichkeitsstufe (ergebnis).
 * Rows without an Untersuchungsart are dropped.
 *
 * NOTE(review): because `null` cells are removed before the positional mapping,
 * a row with a blank Anmerkungen cell presumably shifts art/ergebnis one slot to
 * the left — depends on what `cellText` returns for empty cells; confirm.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number; copied into every result and its sync key.
 * @param idMitgliedschaft FDISK membership id used in the query string.
 * @param idPersonen       FDISK person id used in the query string.
 * @returns One entry per examination row; empty when the page is an error page.
 */
async function scrapeMemberUntersuchungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
): Promise<FdiskUntersuchung[]> {
  const query = `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`;

  const table = await navigateAndGetTableRows(
    frame,
    `${BASE_URL}/fdisk/module/mgvw/untersuchungen/UntersuchungenList.aspx${query}`,
  );
  if (!table) return [];

  const untersuchungen: FdiskUntersuchung[] = [];
  for (const { cells } of table.rows) {
    // Values right of the date column, with null cells (spacers) removed
    const values = cells
      .slice(table.dateColIdx + 1)
      .map(cell => cellText(cell))
      .filter((v): v is string => v !== null);

    // Positional mapping relative to the date: Anmerkungen, Untersuchungsart, Tauglichkeitsstufe
    const [anmerkungen = null, art = null, ergebnis = null] = values;
    if (!art) continue;

    const datum = parseDate(cells[table.dateColIdx]);
    untersuchungen.push({
      standesbuchNr,
      datum,
      anmerkungen,
      art,
      ergebnis,
      syncKey: `${standesbuchNr}::${art}::${datum ?? ''}`,
    });
  }

  log(` Untersuchungen for StNr ${standesbuchNr}: ${untersuchungen.length} rows`);
  untersuchungen.forEach(u => {
    log(` ${u.datum ?? '—'} [${u.art}] ${u.ergebnis ?? '—'} | ${u.anmerkungen ?? ''}`);
  });
  return untersuchungen;
}
|
||
|
||
/**
 * Navigate to the Gesetzliche Fahrgenehmigungen sub-page and scrape all entries.
 *
 * The page is a ListEdit page. Two extraction strategies are tried in order:
 *
 *  1. Form fields named (or id'd) `{field}_{rowIndex}` with the fields
 *     `ausstellungsdatum`, `gueltig_bis`, `behoerde`, `nummer` (inputs) and
 *     `id_fahrgenehmigungsklassen` (select). The row indices are discovered from
 *     the fields actually present on the page, so 1-based or non-contiguous
 *     numbering and more than 100 rows are handled.
 *  2. Table parsing: the first FdcLayList table with rows (else the table with the
 *     most rows) is used; columns are mapped via header text and, for the Klasse
 *     and Ausstellungsdatum columns, via inspection of the first three data rows.
 *
 * Entries without a Klasse are skipped in both strategies.
 *
 * @param frame            Frame used for navigation.
 * @param standesbuchNr    Member number; copied into every result and its sync key.
 * @param idMitgliedschaft FDISK membership id used in the query string.
 * @param idPersonen       FDISK person id used in the query string.
 * @param idInstanzen      FDISK instance id used in the query string.
 * @returns The scraped entries; empty on an error page, when no table/Klasse
 *          column could be identified, or when there is no data.
 */
async function scrapeMemberFahrgenehmigungen(
  frame: Frame,
  standesbuchNr: string,
  idMitgliedschaft: string,
  idPersonen: string,
  idInstanzen: string,
): Promise<FdiskFahrgenehmigung[]> {
  type RawFormRow = {
    ausstellungsdatum: string;
    gueltigBis: string;
    behoerde: string;
    nummer: string;
    klasse: string;
  };
  type RawTable = {
    tableClass: string;
    headers: string[];
    rows: Array<{ cells: string[] }>;
  };

  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  // Shared by both strategies so the record layout and sync key stay identical
  const buildEntry = (
    klasse: string,
    rawAusstellung: string | undefined,
    rawGueltigBis: string | undefined,
    rawBehoerde: string | undefined,
    rawNummer: string | undefined,
  ): FdiskFahrgenehmigung => {
    const ausstellungsdatum = parseDate(rawAusstellung);
    return {
      standesbuchNr,
      ausstellungsdatum,
      gueltigBis: parseDate(rawGueltigBis),
      behoerde: cellText(rawBehoerde),
      nummer: cellText(rawNummer),
      klasse,
      syncKey: `${standesbuchNr}::${klasse}::${ausstellungsdatum ?? ''}`,
    };
  };

  const logResults = (entries: FdiskFahrgenehmigung[]): void => {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: ${entries.length} rows`);
    for (const f of entries) log(` ${f.ausstellungsdatum ?? '—'} [${f.klasse}] ${f.behoerde ?? ''} ${f.nummer ?? ''}`);
  };

  const url = `${BASE_URL}/fdisk/module/mgvw/ges_fahrgenehmigungen/Ges_fahrgenehmigungenListEdit.aspx`
    + `?search=1&searchid_mitgliedschaften=${idMitgliedschaft}&id_personen=${idPersonen}`
    + `&id_mitgliedschaften=${idMitgliedschaft}&searchid_personen=${idPersonen}&searchid_maskmode=`
    + `&searchid_instanzen=${idInstanzen}`;

  await frame_goto(frame, url);

  const landed = frame.url();
  const title = await frame.title().catch(() => '');
  if (landed.includes('BLError') || landed.includes('support.aspx') || title.toLowerCase().includes('fehler')) {
    log(` → Fahrgenehmigungen ERROR page: ${landed}`);
    return [];
  }

  // Dump HTML for diagnostics
  await dumpHtml(frame, `fahrgenehmigungen_StNr${standesbuchNr}`);

  // ── Strategy 1: read form fields by name/id pattern {fieldname}_{rowIndex} ──
  const rawRows = await frame.evaluate(() => {
    const rows: RawFormRow[] = [];

    // Discover which row indices exist instead of counting up from 0 and
    // stopping at the first gap (which misses 1-based or sparse numbering).
    const fieldPattern = /^(?:ausstellungsdatum|gueltig_bis|behoerde|nummer|id_fahrgenehmigungsklassen)_(\d+)$/;
    const suffixes = new Set<string>();
    for (const el of Array.from(document.querySelectorAll('input, select'))) {
      for (const attr of ['name', 'id']) {
        const suffix = fieldPattern.exec(el.getAttribute(attr) ?? '')?.[1];
        if (suffix !== undefined) suffixes.add(suffix);
      }
    }
    const orderedSuffixes = Array.from(suffixes).sort((a, b) => Number(a) - Number(b));

    for (const i of orderedSuffixes) {
      const ausstellungEl = document.querySelector(`input[name="ausstellungsdatum_${i}"], input[id="ausstellungsdatum_${i}"]`) as HTMLInputElement | null;
      const gueltigEl = document.querySelector(`input[name="gueltig_bis_${i}"], input[id="gueltig_bis_${i}"]`) as HTMLInputElement | null;
      const behoerdeEl = document.querySelector(`input[name="behoerde_${i}"], input[id="behoerde_${i}"]`) as HTMLInputElement | null;
      const nummerEl = document.querySelector(`input[name="nummer_${i}"], input[id="nummer_${i}"]`) as HTMLInputElement | null;
      const klasseEl = document.querySelector(`select[name="id_fahrgenehmigungsklassen_${i}"], select[id="id_fahrgenehmigungsklassen_${i}"]`) as HTMLSelectElement | null;

      // The suffix matched an element of an unexpected tag type (e.g. a <select>
      // named like an input field) — nothing usable for this index.
      if (!ausstellungEl && !gueltigEl && !behoerdeEl && !nummerEl && !klasseEl) continue;

      // Read klasse from select: selectedIndex, then [selected] attribute, then raw value
      let klasse = '';
      if (klasseEl) {
        const idx = klasseEl.selectedIndex;
        const current = idx >= 0 ? klasseEl.options[idx] : undefined;
        if (current) {
          klasse = (current.text || current.value || '').trim();
        }
        if (!klasse) {
          const selectedOpt = klasseEl.querySelector('option[selected]') as HTMLOptionElement | null;
          if (selectedOpt) {
            klasse = (selectedOpt.text || selectedOpt.value || '').trim();
          }
        }
        if (!klasse && klasseEl.value?.trim()) {
          klasse = klasseEl.value.trim();
        }
      }

      rows.push({
        ausstellungsdatum: ausstellungEl?.value?.trim() ?? '',
        gueltigBis: gueltigEl?.value?.trim() ?? '',
        behoerde: behoerdeEl?.value?.trim() ?? '',
        nummer: nummerEl?.value?.trim() ?? '',
        klasse,
      });
    }

    return rows;
  }).catch((err: unknown) => {
    log(` → Fahrgenehmigungen form-field extraction failed: ${describeError(err)}`);
    return [] as RawFormRow[];
  });

  log(` → Fahrgenehmigungen form-field extraction: ${rawRows.length} rows found`);

  // If form-field approach found rows, use them
  if (rawRows.length > 0) {
    const results: FdiskFahrgenehmigung[] = [];
    for (const row of rawRows) {
      const klasse = cellText(row.klasse);
      if (!klasse) continue;
      results.push(buildEntry(klasse, row.ausstellungsdatum, row.gueltigBis, row.behoerde, row.nummer));
    }
    logResults(results);
    return results;
  }

  // ── Strategy 2: table-based parsing ──
  log(` → Fahrgenehmigungen: no form fields found, falling back to table parsing`);

  const pageData = await frame.evaluate(() => {
    // Cell value priority: non-empty text input → selected option → anchor title → text content
    const extractCellValue = (cell: Element): string => {
      const input = cell.querySelector('input[type="text"], input:not([type])') as HTMLInputElement | null;
      if (input && input.value?.trim()) return input.value.trim();
      const sel = cell.querySelector('select') as HTMLSelectElement | null;
      if (sel) {
        const idx = sel.selectedIndex;
        const current = idx >= 0 ? sel.options[idx] : undefined;
        if (current) {
          const t = (current.text || current.value || '').trim();
          if (t) return t;
        }
        // Fallback: read the selected attribute directly from HTML
        const selectedOpt = sel.querySelector('option[selected]') as HTMLOptionElement | null;
        if (selectedOpt) {
          const t = (selectedOpt.text || selectedOpt.value || '').trim();
          if (t) return t;
        }
        if (sel.value?.trim()) return sel.value.trim();
      }
      const anchor = cell.querySelector('a');
      const atitle = anchor?.getAttribute('title')?.trim();
      if (atitle) return atitle;
      return cell.textContent?.trim() ?? '';
    };

    const tables: RawTable[] = [];
    for (const table of Array.from(document.querySelectorAll('table'))) {
      const cls = table.className || '';
      const thElements = Array.from(table.querySelectorAll('thead th, tr th'));
      const headers = thElements.map(th => extractCellValue(th));
      const dataRows: Array<{ cells: string[] }> = [];
      for (const tr of Array.from(table.querySelectorAll('tr'))) {
        if (tr.closest('table') !== table) continue; // row belongs to a nested table
        const tds = Array.from(tr.querySelectorAll('td'));
        if (tds.length < 2) continue;
        if (tr.querySelectorAll('th').length > 0) continue; // header row
        dataRows.push({ cells: tds.map(td => extractCellValue(td)) });
      }
      tables.push({ tableClass: cls, headers, rows: dataRows });
    }
    return tables;
  }).catch((err: unknown) => {
    log(` → Fahrgenehmigungen table extraction failed: ${describeError(err)}`);
    return [] as RawTable[];
  });

  // Diagnostic: log all tables found
  pageData.forEach((t, ti) => {
    log(` → table ${ti}: cls="${t.tableClass}" headers=[${t.headers.join(', ')}] dataRows=${t.rows.length}`);
    t.rows.forEach((r, ri) => {
      const preview = r.cells.slice(0, 8).map((c, j) => `[${j}]="${c}"`).join(' ');
      log(` row ${ri}: ${preview}`);
    });
  });

  // First FdcLayList table with rows wins; otherwise the table with the most rows.
  // (filter() returns a copy, so sorting does not reorder pageData.)
  const bestTable = pageData.find(t => t.tableClass.includes('FdcLayList') && t.rows.length > 0)
    ?? pageData.filter(t => t.rows.length > 0).sort((a, b) => b.rows.length - a.rows.length)[0];

  if (!bestTable || bestTable.rows.length === 0) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: no data table found`);
    return [];
  }

  const headers = bestTable.headers.map(h => h.toLowerCase());
  log(` Fahrgenehmigungen headers: [${headers.join(', ')}]`);

  let klasseIdx = headers.findIndex(h => h.includes('klasse') || h.includes('fahrgenehmigung'));
  let ausstellungIdx = headers.findIndex(h => h.includes('ausstellung'));
  const gueltigIdx = headers.findIndex(h => h.includes('gültig') || h.includes('gultig') || h.includes('ablauf'));
  const behoerdeIdx = headers.findIndex(h => h.includes('behörde') || h.includes('behorde'));
  const nummerIdx = headers.findIndex(h => h.includes('nummer') || h.includes('nr'));

  const KNOWN_KLASSEN = new Set([
    'AM', 'A1', 'A2', 'A', 'B', 'BE', 'C1', 'C1E', 'C', 'CE',
    'D1', 'D1E', 'D', 'DE', 'F', 'L', 'L17', 'B+E', 'C+E', 'D+E',
  ]);
  const datePattern = /^\d{2}\.\d{2}\.\d{4}$/;
  const sampleRows = bestTable.rows.slice(0, 3);

  // Headers did not identify the Klasse column: look for a known class value
  if (klasseIdx === -1) {
    for (const row of sampleRows) {
      klasseIdx = row.cells.findIndex(cell => {
        const val = cell?.trim() ?? '';
        // Match known klassen or values containing "Führerschein" etc.
        return KNOWN_KLASSEN.has(val.toUpperCase()) || /führerschein|lenkberechtigung/i.test(val);
      });
      if (klasseIdx >= 0) {
        log(` Fahrgenehmigungen: found Klasse in column ${klasseIdx} by data inspection`);
        break;
      }
    }
  }

  // Headers did not identify the Ausstellungsdatum column: take the first date-like column
  if (ausstellungIdx === -1) {
    for (const row of sampleRows) {
      ausstellungIdx = row.cells.findIndex((cell, ci) => ci !== klasseIdx && datePattern.test(cell?.trim() ?? ''));
      if (ausstellungIdx >= 0) break;
    }
  }

  log(` Fahrgenehmigungen column map: klasse=${klasseIdx} ausstellung=${ausstellungIdx} gueltig=${gueltigIdx} behoerde=${behoerdeIdx} nummer=${nummerIdx}`);

  if (klasseIdx === -1) {
    log(` Fahrgenehmigungen for StNr ${standesbuchNr}: could not determine Klasse column. Returning empty.`);
    await dumpHtml(frame, `fahrgenehmigungen_fallback_StNr${standesbuchNr}`);
    return [];
  }

  const pick = (cells: string[], idx: number): string | undefined => (idx >= 0 ? cells[idx] : undefined);

  const results: FdiskFahrgenehmigung[] = [];
  for (const row of bestTable.rows) {
    const klasse = cellText(row.cells[klasseIdx]);
    if (!klasse) continue;
    // Skip header-like / navigation rows and rows where the "Klasse" cell is really a date
    if (/klasse|fahrgenehmigung|ausstellung|datensätze|information|tiefennavigation/i.test(klasse)) continue;
    if (datePattern.test(klasse)) continue;

    results.push(buildEntry(
      klasse,
      pick(row.cells, ausstellungIdx),
      pick(row.cells, gueltigIdx),
      pick(row.cells, behoerdeIdx),
      pick(row.cells, nummerIdx),
    ));
  }
  logResults(results);
  return results;
}
|
||
|
||
/**
 * Legacy entry point kept for compatibility; forwards to
 * `scrapeAusbildungenFromDetailPage`.
 *
 * Navigates the frame to the member's detail page, reads `id_mitgliedschaften`
 * and `id_personen` from the detail URL's query string (each is `null` when the
 * parameter is absent) and passes them on.
 *
 * @param frame  Frame used for navigation.
 * @param member Member whose `detailUrl` is opened.
 * @returns The member's Ausbildungen, or an empty array when `detailUrl` is not set.
 */
export async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  const { detailUrl } = member;
  if (!detailUrl) return [];

  await frame_goto(frame, detailUrl);

  // detailUrl may be relative, so resolve it against the URL the frame is on now
  const { searchParams } = new URL(detailUrl, frame.url());
  return scrapeAusbildungenFromDetailPage(
    frame,
    member,
    searchParams.get('id_mitgliedschaften'),
    searchParams.get('id_personen'),
  );
}
|