Files
dashboard/sync/src/scraper.ts
Matthias Hochmeister bc6d09200a update
2026-03-13 19:23:39 +01:00

306 lines
11 KiB
TypeScript

import { chromium, Page, Frame } from '@playwright/test';
import { FdiskMember, FdiskAusbildung } from './types';
// Deployment settings. Each value can be overridden through an environment
// variable; the literal on the right is used when the variable is unset.
const BASE_URL = process.env['FDISK_BASE_URL'] ?? 'https://app.fdisk.at';
// NOTE(review): the two IDs below are not referenced in the visible part of
// this file — presumably fire-brigade / instance identifiers; confirm usage.
const ID_FEUERWEHREN = process.env['FDISK_ID_FEUERWEHREN'] ?? '164';
const ID_INSTANZEN = process.env['FDISK_ID_INSTANZEN'] ?? '2853';
// Absolute URLs of the login form and of the member overview page.
const LOGIN_URL = BASE_URL + '/fdisk/module/vws/logins/logins.aspx';
const MEMBERS_URL = BASE_URL + '/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx';
/**
 * Print one diagnostic line to stdout, prefixed with `[scraper]` and the
 * current time as an ISO-8601 timestamp.
 */
function log(msg: string) {
  const timestamp = new Date().toISOString();
  console.log('[scraper] ' + timestamp + ' ' + msg);
}
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored. Besides the digit pattern, the value must
 * be a real calendar date: inputs such as `31.02.2024`, `99.99.2024` or
 * `00.00.0000` are rejected instead of being turned into a malformed ISO string.
 *
 * @param raw - Cell text as read from the page; may be missing.
 * @returns The ISO date, or null if the input is empty, not in DD.MM.YYYY
 *          form, or not an existing calendar date.
 */
function parseDate(raw: string | null | undefined): string | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  const match = /^(\d{2})\.(\d{2})\.(\d{4})$/.exec(trimmed);
  if (!match) return null;

  const [, dd, mm, yyyy] = match;
  const day = Number(dd);
  const month = Number(mm);
  const year = Number(yyyy);

  // Round-trip through a Date to validate day/month ranges and leap years.
  // setUTCFullYear takes the year literally, whereas Date.UTC would remap
  // the years 0–99 to 1900–1999.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  const isRealDate =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day;
  if (!isRealDate) return null;

  return `${yyyy}-${mm}-${dd}`;
}
/**
 * Normalise the text of a table cell: strip surrounding whitespace and map
 * missing or blank input to null.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) return null;
  const stripped = text.trim();
  return stripped === '' ? null : stripped;
}
/**
 * Log in to FDISK, read the member list and then each member's Ausbildung
 * entries from the member's detail page.
 *
 * A failure while scraping one member's Ausbildung data is logged as a warning
 * and does not abort the run; failures during login or while reading the
 * member list propagate to the caller. The browser is always closed.
 *
 * @param username - FDISK login name.
 * @param password - FDISK password.
 * @returns All members found plus the Ausbildung entries collected for them.
 */
export async function scrapeAll(username: string, password: string): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
}> {
  const browser = await chromium.launch({
    headless: true,
    args: ['--disable-gpu', '--disable-software-rasterizer'],
  });
  try {
    // Context and page are created inside the try block so that the browser
    // is still closed if either of these setup calls fails.
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    await login(page, username, password);

    // After login the page is expected to show the Start.aspx frameset; the
    // member list is loaded inside its `mainFrame`, and the same frame is
    // reused for every member detail page below.
    const mainFrame = await navigateToMemberList(page);
    const members = await scrapeMembers(mainFrame);
    log(`Found ${members.length} members`);

    const ausbildungen: FdiskAusbildung[] = [];
    for (const member of members) {
      if (!member.detailUrl) continue;
      try {
        const quals = await scrapeMemberAusbildung(mainFrame, member);
        ausbildungen.push(...quals);
        log(` ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
        // Short pause between members to avoid hammering the server.
        await page.waitForTimeout(500);
      } catch (err) {
        // Best effort: one broken detail page must not lose the whole run.
        log(` WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }
    return { members, ausbildungen };
  } finally {
    await browser.close();
  }
}
/**
 * Open the FDISK login page and sign in with the given credentials.
 *
 * A page counts as "the login page" when its URL contains `login`
 * (case-insensitive). If the browser is not on such a URL after the initial
 * navigation, the session is treated as already authenticated.
 *
 * @throws Error when the browser is still on a login URL after submitting.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const isLoginUrl = (address: string): boolean => address.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // Nothing to do when the server did not keep us on the login page.
  const landedOn = page.url();
  if (!isLoginUrl(landedOn)) {
    log(`Already logged in, on: ${landedOn}`);
    return;
  }

  // Element ids taken from the known login form markup.
  const userInput = page.locator('#login');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await page.locator('#password').fill(password);
  await page.locator('#Submit2').click();

  // Give the redirect up to 15s. A failure here is deliberately ignored:
  // the URL check below decides whether the login worked.
  await page
    .waitForURL((url) => !isLoginUrl(url.toString()), { waitUntil: 'networkidle', timeout: 15000 })
    .catch(() => undefined);

  const finalUrl = page.url();
  if (isLoginUrl(finalUrl)) {
    throw new Error(`Login failed — still on login page: ${finalUrl}`);
  }
  log(`Logged in successfully, redirected to: ${finalUrl}`);
}
/**
 * Load the member overview page inside the frame named `mainFrame` and return
 * that frame.
 *
 * @throws Error when the frame does not exist, or when the resulting URL
 *         contains one of the known error-page markers.
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const target = page.frame({ name: 'mainFrame' });
  if (!target) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await target.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await target.waitForLoadState('networkidle');

  const loadedUrl = target.url();
  const pageTitle = await target.title();
  log(`mainFrame loaded: ${loadedUrl} — title: "${pageTitle}"`);

  // URL fragments that indicate the server answered with an error page.
  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  if (errorMarkers.some((marker) => loadedUrl.includes(marker))) {
    throw new Error(`Member list returned error page: ${loadedUrl}`);
  }
  return target;
}
/**
 * Read all members from the `table.FdcLayList` table in the given frame.
 *
 * If the page contains the `frmsearch` form, every field whose name or id
 * contains "standesbuch" is emptied and the form is submitted first, so the
 * pre-filled Standesbuchnummer filter does not limit the result to one member.
 *
 * Rows lacking a Standesbuchnummer, first name or surname are skipped (and
 * logged). A member's status is derived from the Abmeldedatum column:
 * `ausgetreten` when it holds a parseable date, `aktiv` otherwise.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // FDISK pre-fills the logged-in user's own Standesbuchnummer in the search
  // form; clear it and re-submit so the whole fire station is listed.
  const searchFormHandle = await frame.$('form[name="frmsearch"]');
  if (searchFormHandle !== null) {
    const clearedNames = await frame.evaluate(() => {
      const touched: string[] = [];
      const searchForm = (document as any).forms['frmsearch'];
      if (!searchForm) return touched;
      (Array.from(searchForm.elements) as HTMLInputElement[]).forEach((field) => {
        const fieldName = (field.name ?? '').toLowerCase();
        const fieldId = (field.id ?? '').toLowerCase();
        const isFilter = fieldName.includes('standesbuch') || fieldId.includes('standesbuch');
        if (!isFilter) return;
        field.value = '';
        touched.push(field.name || field.id);
      });
      return touched;
    });

    if (clearedNames.length === 0) {
      log('Search form found — no Standesbuchnummer field detected, submitting as-is');
    } else {
      log(`Cleared Standesbuchnummer filter fields: ${clearedNames.join(', ')}`);
    }

    await frame.evaluate(() => {
      (document as any).forms['frmsearch'].submit();
    });
    await frame.waitForLoadState('networkidle');
    log(`After form submit: ${frame.url()}`);
  }

  // Diagnostics: list every table on the page with its class and row count.
  const tableSummaries = await frame.$$eval('table', (tables) =>
    tables.map(
      (tbl) => (tbl.className || '(no-class)') + '[' + tbl.querySelectorAll('tr').length + 'rows]',
    ),
  );
  log(`Tables: ${tableSummaries.join(', ') || 'none'}`);

  // The member table carries the class FdcLayList.
  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });

  // Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
  // 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum,
  // 9=Abmeldedatum, 10=icon.
  // Each <td> wraps an <a title="value">; the title is the clean cell text and
  // the href (identical across the row) is the member detail URL.
  const rawRows = await frame.$$eval('table.FdcLayList tbody tr', (tableRows) =>
    tableRows.map((tableRow) => {
      const tds = Array.from(tableRow.querySelectorAll('td'));
      const read = (col: number) => {
        const cell = tds[col];
        const linkTitle = cell?.querySelector('a')?.getAttribute('title')?.trim();
        // Prefer a non-empty title; otherwise fall back to the cell's text.
        const text = linkTitle || cell?.textContent || '';
        return text.trim();
      };
      const firstLink = tableRow.querySelector('a') as HTMLAnchorElement | null;
      return {
        status: read(1),
        standesbuchNr: read(2),
        dienstgrad: read(3),
        vorname: read(4),
        zuname: read(5),
        geburtsdatum: read(6),
        svnr: read(7),
        eintrittsdatum: read(8),
        abmeldedatum: read(9),
        href: firstLink?.href ?? null,
      };
    }),
  );

  log(`Parsed ${rawRows.length} rows from member table`);
  rawRows.forEach((raw) => {
    log(` Row: StNr="${raw.standesbuchNr}" Vorname="${raw.vorname}" Zuname="${raw.zuname}" Status="${raw.status}"`);
  });

  const members: FdiskMember[] = [];
  for (const raw of rawRows) {
    const hasRequiredFields = Boolean(raw.standesbuchNr && raw.vorname && raw.zuname);
    if (hasRequiredFields) {
      const abmeldedatum = parseDate(raw.abmeldedatum);
      members.push({
        standesbuchNr: raw.standesbuchNr,
        dienstgrad: raw.dienstgrad,
        vorname: raw.vorname,
        zuname: raw.zuname,
        geburtsdatum: parseDate(raw.geburtsdatum),
        svnr: raw.svnr || null,
        eintrittsdatum: parseDate(raw.eintrittsdatum),
        abmeldedatum,
        status: abmeldedatum ? 'ausgetreten' : 'aktiv',
        detailUrl: raw.href,
      });
    } else {
      log(` SKIP: StNr="${raw.standesbuchNr}" Vorname="${raw.vorname}" Zuname="${raw.zuname}" — missing required field`);
    }
  }
  return members;
}
/**
 * Open a member's detail page in the given frame and extract the rows of the
 * first table that looks like an Ausbildung (qualification) list.
 *
 * A table qualifies when its first row has a cell containing "kurs",
 * "ausbildung" or "bezeichnung" and it has at least one further row. Columns
 * are located by header keywords; rows without a course name are dropped.
 *
 * @param frame - Frame to navigate; it is left on the member's detail page.
 * @param member - Member whose `detailUrl` is opened.
 * @returns The parsed entries, or an empty array when the member has no detail
 *          URL or no matching table is found.
 */
async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  if (!member.detailUrl) return [];
  await frame.goto(member.detailUrl, { waitUntil: 'networkidle' });

  // Probe for an already visible Ausbildung section.
  // NOTE(review): Playwright's `text=` engine appears to treat everything after
  // `=` as the text to search for, so this selector likely looks for the literal
  // string "Ausbildung, text=Ausbildungsliste" and never matches — meaning the
  // link below is always tried. It is left unchanged on purpose: a corrected
  // probe such as `text=Ausbildung` would also match the link itself and skip
  // the click. Confirm the intended flow against the live page.
  const ausbildungSection = frame.locator('text=Ausbildung, text=Ausbildungsliste').first();
  const hasSec = await ausbildungSection.isVisible().catch(() => false);
  if (!hasSec) {
    // Try to open an Ausbildung tab/link if one is present.
    const ausbildungLink = frame.locator('a:has-text("Ausbildung")').first();
    const hasLink = await ausbildungLink.isVisible().catch(() => false);
    if (hasLink) {
      await ausbildungLink.click();
      // Best effort: a load-state timeout must not abort parsing.
      await frame.waitForLoadState('networkidle').catch(() => {});
    }
  }

  // Header classifiers (applied to lower-cased header text).
  const isKursHeader = (h: string): boolean =>
    h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung');
  const isAblaufHeader = (h: string): boolean => h.includes('ablauf') || h.includes('gültig');
  // "Ablaufdatum" also contains "datum"; exclude expiry headers so the course
  // date never resolves to the expiry column.
  const isDatumHeader = (h: string): boolean =>
    !isAblaufHeader(h) && (h.includes('datum') || h.includes('abschluss'));

  // Expected columns: Kursname, Datum, Ablaufdatum, Ort, Bemerkung (may vary).
  const ausbildungen: FdiskAusbildung[] = [];
  for (const table of await frame.$$('table')) {
    const rows = await table.$$eval('tr', (trs) =>
      trs.map((tr) => ({
        cells: Array.from(tr.querySelectorAll('td, th')).map((c) => (c as Element).textContent?.trim() ?? ''),
      })),
    );

    // Need a header row plus at least one data row.
    const [headerRow, ...dataRows] = rows;
    if (!headerRow || dataRows.length === 0) continue;

    const header = headerRow.cells.map((c) => c.toLowerCase());
    const kursnameIdx = header.findIndex(isKursHeader);
    if (kursnameIdx < 0) continue; // not an Ausbildung table

    const datumIdx = header.findIndex(isDatumHeader);
    const ablaufIdx = header.findIndex(isAblaufHeader);
    const ortIdx = header.findIndex((h) => h.includes('ort'));
    const bemIdx = header.findIndex((h) => h.includes('bem') || h.includes('info'));

    for (const row of dataRows) {
      const kursname = cellText(row.cells[kursnameIdx]);
      if (!kursname) continue;
      const kursDatum = parseDate(datumIdx >= 0 ? row.cells[datumIdx] : null);
      // Key built from member number, course name and course date.
      const syncKey = `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`;
      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(ablaufIdx >= 0 ? row.cells[ablaufIdx] : null),
        ort: ortIdx >= 0 ? cellText(row.cells[ortIdx]) : null,
        bemerkung: bemIdx >= 0 ? cellText(row.cells[bemIdx]) : null,
        syncKey,
      });
    }
    break; // only process the first Ausbildung table found
  }
  return ausbildungen;
}