import { chromium, Page, Frame } from '@playwright/test';
import { FdiskMember, FdiskAusbildung } from './types';

const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';
// NOTE(review): the two IDs below are not referenced anywhere in this module as shown;
// kept because other parts of the project may rely on the env-variable contract.
const ID_FEUERWEHREN = process.env.FDISK_ID_FEUERWEHREN ?? '164';
const ID_INSTANZEN = process.env.FDISK_ID_INSTANZEN ?? '2853';

const LOGIN_URL = `${BASE_URL}/fdisk/module/vws/logins/logins.aspx`;
const MEMBERS_URL = `${BASE_URL}/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx`;

/** Write a timestamped, `[scraper]`-prefixed diagnostic line to stdout. */
function log(msg: string) {
  console.log(`[scraper] ${new Date().toISOString()} ${msg}`);
}

/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Day and month may be written with one or two digits; the result is always
 * zero-padded. Returns null if the input is empty, does not match the pattern,
 * or does not denote a real calendar date (e.g. `31.02.2020`, `00.00.0000`).
 */
function parseDate(raw: string | null | undefined): string | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(trimmed);
  if (!match) return null;

  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);

  // Round-trip through Date.UTC: an impossible date rolls over into a different
  // day/month/year, which is how it is detected. Years below 100 are remapped by
  // Date.UTC to 19xx and are therefore rejected here as well.
  const probe = new Date(Date.UTC(year, month - 1, day));
  const isRealDate =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day;
  if (!isRealDate) return null;

  const mm = String(month).padStart(2, '0');
  const dd = String(day).padStart(2, '0');
  return `${match[3]}-${mm}-${dd}`;
}

/**
 * Extract text content from a cell, trimmed, or null if empty.
 */
function cellText(text: string | undefined | null): string | null {
  const t = (text ?? '').trim();
  return t || null;
}

/**
 * Log in to FDISK, read the member list and then visit every member's detail
 * page to collect their Ausbildung (training) entries.
 *
 * A failure while scraping a single member's Ausbildung is logged as a warning
 * and does not abort the run. The browser is always closed, even on error.
 *
 * @param username FDISK login name
 * @param password FDISK password
 * @returns all parsed members plus the Ausbildung rows collected for them
 * @throws if login fails, the `mainFrame` frame is missing, or the member list
 *         lands on an error page
 */
export async function scrapeAll(
  username: string,
  password: string,
): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
}> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });
  const context = await browser.newContext({
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  });
  const page = await context.newPage();

  try {
    await login(page, username, password);

    // After login, page is on Start.aspx (frameset).
    // Direct navigation to MitgliedschaftenList.aspx causes a server BLError because
    // the server reads the org context from session variables set by the menu.
    // navigateToMemberList() therefore loads MEMBERS_URL inside the existing
    // `mainFrame` of the frameset instead of navigating the top-level page.
    const mainFrame = await navigateToMemberList(page);

    const members = await scrapeMembers(mainFrame);
    log(`Found ${members.length} members`);

    const ausbildungen: FdiskAusbildung[] = [];
    for (const member of members) {
      if (!member.detailUrl) continue;
      try {
        const quals = await scrapeMemberAusbildung(mainFrame, member);
        ausbildungen.push(...quals);
        log(`  ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
        // Fixed pause between detail pages.
        await page.waitForTimeout(500);
      } catch (err) {
        // Best effort: one broken detail page must not lose the whole run.
        log(`  WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen };
  } finally {
    await browser.close();
  }
}

/**
 * Open the login page and submit the credentials.
 *
 * Returns immediately when the login URL redirects to a page whose URL does not
 * contain "login" (treated as "already logged in").
 *
 * @throws if the browser is still on a URL containing "login" after submitting
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // Check if already logged in
  const currentUrlBefore = page.url();
  if (!currentUrlBefore.toLowerCase().includes('login')) {
    log(`Already logged in, on: ${currentUrlBefore}`);
    return;
  }

  // Exact selectors from the known login form HTML
  const usernameField = page.locator('#login');
  const passwordField = page.locator('#password');
  const submitButton = page.locator('#Submit2');

  await usernameField.waitFor({ state: 'visible', timeout: 10000 });
  await usernameField.fill(username);
  await passwordField.fill(password);
  await submitButton.click();

  // Wait for navigation away from the login page (up to 15s)
  try {
    await page.waitForURL(
      (url) => !url.toString().toLowerCase().includes('login'),
      { waitUntil: 'networkidle', timeout: 15000 },
    );
  } catch {
    // waitForURL timed out — fall through to the URL check below
  }

  // Verify we're logged in
  const currentUrl = page.url();
  if (currentUrl.toLowerCase().includes('login')) {
    throw new Error(`Login failed — still on login page: ${currentUrl}`);
  }
  log(`Logged in successfully, redirected to: ${currentUrl}`);
}

/**
 * Load the member list inside the frameset's `mainFrame` and return that frame.
 *
 * @throws if no frame named `mainFrame` exists, or if the frame ends up on a URL
 *         that looks like an error page
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const mainFrame = page.frame({ name: 'mainFrame' });
  if (!mainFrame) throw new Error('mainFrame not found in Start.aspx frameset');

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await mainFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await mainFrame.waitForLoadState('networkidle');

  const url = mainFrame.url();
  const title = await mainFrame.title();
  log(`mainFrame loaded: ${url} — title: "${title}"`);

  if (url.includes('BLError') || url.includes('support.aspx') || url.includes('Error')) {
    throw new Error(`Member list returned error page: ${url}`);
  }

  return mainFrame;
}

/**
 * Read the member table (`table.FdcLayList`) from the given frame.
 *
 * If the frame shows the search form without a result table, the form is
 * submitted first. Rows lacking Standesbuch number, first name or last name
 * (e.g. header rows) are skipped. `status` is derived from the Abmeldedatum:
 * a parseable date means 'ausgetreten', otherwise 'aktiv'.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // If the page landed on a search form (not results yet), submit it
  const hasForm = (await frame.$('form[name="frmsearch"]')) !== null;
  const hasTable = (await frame.$('table.FdcLayList')) !== null;
  if (hasForm && !hasTable) {
    log('Search form found without results — submitting...');
    await frame.evaluate(() => {
      const form = document.forms.namedItem('frmsearch');
      if (!form) throw new Error('form "frmsearch" not found');
      form.submit();
    });
    await frame.waitForLoadState('networkidle');
    log(`After form submit: ${frame.url()}`);
  }

  // Log tables found for diagnostics
  const tableInfo = await frame.$$eval('table', (ts) =>
    ts.map((t) => `${t.className || '(no-class)'}[${t.querySelectorAll('tr').length}rows]`),
  );
  log(`Tables: ${tableInfo.join(', ') || 'none'}`);

  // The member table uses class FdcLayList
  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });

  // Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
  // 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum, 9=Abmeldedatum, 10=icon
  // Each <td> contains an <a title="..."> — the title is the clean cell text.
  // The href on each <a> is the member detail URL (same link repeated across all cells in a row).
  const rows = await frame.$$eval('table.FdcLayList tbody tr', (trs) =>
    trs.map((tr) => {
      const cells = Array.from(tr.querySelectorAll('td'));
      const val = (i: number) => {
        const a = cells[i]?.querySelector('a');
        const title = a?.getAttribute('title')?.trim();
        // Use title only if non-empty; otherwise fall back to textContent
        return (title || cells[i]?.textContent || '').trim();
      };
      const href = (tr.querySelector('a') as HTMLAnchorElement | null)?.href ?? null;
      return {
        status: val(1),
        standesbuchNr: val(2),
        dienstgrad: val(3),
        vorname: val(4),
        zuname: val(5),
        geburtsdatum: val(6),
        svnr: val(7),
        eintrittsdatum: val(8),
        abmeldedatum: val(9),
        href,
      };
    }),
  );

  log(`Parsed ${rows.length} rows from member table`);
  for (const row of rows) {
    log(`  Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}"`);
  }

  const members: FdiskMember[] = [];
  for (const row of rows) {
    if (!row.standesbuchNr || !row.vorname || !row.zuname) {
      log(`  SKIP: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" — missing required field`);
      continue;
    }
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
    });
  }
  return members;
}

/**
 * Open a member's detail page in the given frame and parse the first table
 * whose header row looks like an Ausbildung list.
 *
 * Columns are located by header keywords, so their order may vary. Rows without
 * a course name are skipped. Returns an empty array when the member has no
 * detail URL or no matching table is found.
 */
async function scrapeMemberAusbildung(
  frame: Frame,
  member: FdiskMember,
): Promise<FdiskAusbildung[]> {
  if (!member.detailUrl) return [];

  await frame.goto(member.detailUrl, { waitUntil: 'networkidle' });

  // Look for Ausbildungsliste section — it's likely a table or list
  // NOTE(review): Playwright's `text=` engine appears to take everything after
  // `text=` as the text to match, so this comma-separated form likely never
  // matches and the link fallback below always runs. Confirm before "fixing" it:
  // a working text match would also hit the tab link itself and skip the click.
  const ausbildungSection = frame.locator('text=Ausbildung, text=Ausbildungsliste').first();
  const hasSec = await ausbildungSection.isVisible().catch(() => false);

  if (!hasSec) {
    // Try navigating to an Ausbildung tab/link if present
    const ausbildungLink = frame.locator('a:has-text("Ausbildung")').first();
    const hasLink = await ausbildungLink.isVisible().catch(() => false);
    if (hasLink) {
      await ausbildungLink.click();
      await frame.waitForLoadState('networkidle').catch(() => {});
    }
  }

  // Header keyword matchers (headers are lower-cased before matching).
  const isCourseHeader = (h: string) =>
    h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung');
  const isExpiryHeader = (h: string) => h.includes('ablauf') || h.includes('gültig');

  // Parse the qualification table
  // Expected columns: Kursname, Datum, Ablaufdatum, Ort, Bemerkung (may vary)
  const tables = await frame.$$('table');
  const ausbildungen: FdiskAusbildung[] = [];

  for (const table of tables) {
    const rows = await table.$$eval('tr', (trs) =>
      trs.map((tr) => ({
        cells: Array.from(tr.querySelectorAll('td, th')).map((c) => c.textContent?.trim() ?? ''),
      })),
    );

    // Need a header row plus at least one data row.
    const [headerRow, ...dataRows] = rows;
    if (!headerRow || dataRows.length === 0) continue;

    // Detect if this looks like an Ausbildung table
    const header = headerRow.cells.map((c) => c.toLowerCase());
    const kursnameIdx = header.findIndex(isCourseHeader);
    if (kursnameIdx < 0) continue;

    const ablaufIdx = header.findIndex(isExpiryHeader);
    // "Ablaufdatum" also contains "datum": exclude expiry headers so the course
    // date is never taken from the expiry column when that column comes first.
    const datumIdx = header.findIndex(
      (h) => !isExpiryHeader(h) && (h.includes('datum') || h.includes('abschluss')),
    );
    const ortIdx = header.findIndex((h) => h.includes('ort'));
    const bemIdx = header.findIndex((h) => h.includes('bem') || h.includes('info'));

    for (const row of dataRows) {
      const kursname = cellText(row.cells[kursnameIdx]);
      if (!kursname) continue;

      const kursDatum = parseDate(datumIdx >= 0 ? row.cells[datumIdx] : null);
      const syncKey = `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`;

      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(ablaufIdx >= 0 ? row.cells[ablaufIdx] : null),
        ort: ortIdx >= 0 ? cellText(row.cells[ortIdx]) : null,
        bemerkung: bemIdx >= 0 ? cellText(row.cells[bemIdx]) : null,
        syncKey,
      });
    }
    break; // only process the first Ausbildung table found
  }

  return ausbildungen;
}