276 lines
10 KiB
TypeScript
276 lines
10 KiB
TypeScript
import { chromium, Page, Frame } from '@playwright/test';
|
|
import { FdiskMember, FdiskAusbildung } from './types';
|
|
|
|
// Deployment settings. Each value may be overridden through the named
// environment variable; otherwise the hard-coded default applies.

/** Root URL of the FDISK web application (env: FDISK_BASE_URL). */
const BASE_URL = process.env['FDISK_BASE_URL'] ?? 'https://app.fdisk.at';

/** Value of env FDISK_ID_FEUERWEHREN; not referenced by the code visible in this file. */
const ID_FEUERWEHREN = process.env['FDISK_ID_FEUERWEHREN'] ?? '164';

/** Value of env FDISK_ID_INSTANZEN; not referenced by the code visible in this file. */
const ID_INSTANZEN = process.env['FDISK_ID_INSTANZEN'] ?? '2853';

/** Login page; the scraper starts every session here. */
const LOGIN_URL = BASE_URL + '/fdisk/module/vws/logins/logins.aspx';

/** Member list page that is loaded into the frameset's main frame. */
const MEMBERS_URL = BASE_URL + '/fdisk/module/mgvw/mitgliedschaften/meine_Mitglieder.aspx';
|
|
|
|
/**
 * Writes one line to stdout, prefixed with `[scraper]` and the current
 * time as an ISO-8601 timestamp.
 *
 * @param msg Text to append after the prefix and timestamp.
 */
function log(msg: string): void {
  const timestamp = new Date().toISOString();
  console.log('[scraper] ' + timestamp + ' ' + msg);
}
|
|
|
|
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored. Day and month may be written with one
 * or two digits; the result is always zero-padded.
 *
 * @param raw Cell text as scraped, or null/undefined when the cell is missing.
 * @returns The ISO date, or null when the input is empty, does not match the
 *          expected pattern, or names a day that does not exist
 *          (e.g. 31.02.2020 or 15.13.2020).
 */
function parseDate(raw: string | null | undefined): string | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(trimmed);
  if (!match) return null;

  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);

  if (month < 1 || month > 12) return null;

  // Validate the day against the real month length so that impossible dates
  // are rejected here instead of being passed on as malformed ISO strings.
  const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
  const monthLengths = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
  const daysInMonth = monthLengths[month - 1] ?? 0;
  if (day < 1 || day > daysInMonth) return null;

  const mm = String(month).padStart(2, '0');
  const dd = String(day).padStart(2, '0');
  return `${match[3]}-${mm}-${dd}`;
}
|
|
|
|
/**
 * Normalises scraped cell text.
 *
 * @param text Raw cell content; may be missing.
 * @returns The text without leading/trailing whitespace, or null when the
 *          input is missing or contains nothing but whitespace.
 */
function cellText(text: string | undefined | null): string | null {
  if (text == null) {
    return null;
  }
  const stripped = text.trim();
  return stripped.length > 0 ? stripped : null;
}
|
|
|
|
/**
 * Logs in to FDISK with a headless Chromium instance, reads the member list
 * and then visits every member's detail page to collect their Ausbildungen.
 *
 * A failure while scraping a single member's Ausbildungen is logged as a
 * warning and does not abort the run. Failures during login or while loading
 * the member list propagate to the caller.
 *
 * @param username FDISK login name.
 * @param password FDISK password.
 * @returns All parsed members plus the Ausbildungen found for them.
 * @throws When login fails or the member list cannot be loaded.
 */
export async function scrapeAll(username: string, password: string): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
}> {
  const browser = await chromium.launch({ headless: true });

  // Everything after the launch runs inside try/finally so the browser is
  // closed even when creating the context or the page fails.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    await login(page, username, password);

    // After login, page is on Start.aspx (frameset).
    // Per the original author's note, loading the member list directly as a
    // top-level page causes a server BLError because the server reads the org
    // context from session state. navigateToMemberList therefore loads the
    // list inside the frameset's mainFrame instead.
    const mainFrame = await navigateToMemberList(page);

    const members = await scrapeMembers(mainFrame);
    log(`Found ${members.length} members`);

    const ausbildungen: FdiskAusbildung[] = [];

    // Members are processed one after another: they all share the same frame,
    // which is navigated to each detail page in turn.
    for (const member of members) {
      if (!member.detailUrl) continue;

      try {
        const quals = await scrapeMemberAusbildung(mainFrame, member);
        ausbildungen.push(...quals);
        log(`  ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
        // Short pause between members (skipped when the scrape above failed).
        await page.waitForTimeout(500);
      } catch (err) {
        log(`  WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen };
  } finally {
    await browser.close();
  }
}
|
|
|
|
/**
 * Opens the FDISK login page and signs in with the given credentials.
 *
 * A page counts as "the login page" when its URL contains the substring
 * `login` (case-insensitive). If the browser is not on such a URL after the
 * initial navigation, the session is treated as already authenticated.
 *
 * @param page     Playwright page to drive.
 * @param username FDISK login name.
 * @param password FDISK password.
 * @throws Error when the URL still contains `login` after submitting the form.
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  const isLoginUrl = (candidate: string): boolean =>
    candidate.toLowerCase().includes('login');

  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForLoadState('networkidle');

  // An existing session redirects away from the login page right away.
  const landingUrl = page.url();
  if (!isLoginUrl(landingUrl)) {
    log(`Already logged in, on: ${landingUrl}`);
    return;
  }

  // Element ids taken from the known login form HTML.
  const userInput = page.locator('#login');
  await userInput.waitFor({ state: 'visible', timeout: 10000 });
  await userInput.fill(username);
  await page.locator('#password').fill(password);
  await page.locator('#Submit2').click();

  // Give the post-login redirect up to 15s. A failure here (typically a
  // timeout) is deliberately ignored: the URL check below decides the outcome.
  try {
    await page.waitForURL((url) => !isLoginUrl(url.toString()), {
      waitUntil: 'networkidle',
      timeout: 15000,
    });
  } catch {
    // fall through to the URL check below
  }

  const finalUrl = page.url();
  if (isLoginUrl(finalUrl)) {
    throw new Error(`Login failed — still on login page: ${finalUrl}`);
  }
  log(`Logged in successfully, redirected to: ${finalUrl}`);
}
|
|
|
|
/**
 * Loads the member list into the frame named `mainFrame` of the current page.
 *
 * @param page Page that is expected to show the Start.aspx frameset.
 * @returns The frame, now showing the member list.
 * @throws Error when the frame does not exist, or when the frame URL after
 *         loading contains one of the known error-page markers.
 */
async function navigateToMemberList(page: Page): Promise<Frame> {
  const contentFrame = page.frame({ name: 'mainFrame' });
  if (!contentFrame) {
    throw new Error('mainFrame not found in Start.aspx frameset');
  }

  log(`Navigating mainFrame to: ${MEMBERS_URL}`);
  await contentFrame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await contentFrame.waitForLoadState('networkidle');

  const loadedUrl = contentFrame.url();
  const pageTitle = await contentFrame.title();
  log(`mainFrame loaded: ${loadedUrl} — title: "${pageTitle}"`);

  // Substrings (case-sensitive) that identify a server-side error page.
  const errorMarkers = ['BLError', 'support.aspx', 'Error'];
  const landedOnErrorPage = errorMarkers.some((marker) => loadedUrl.includes(marker));
  if (landedOnErrorPage) {
    throw new Error(`Member list returned error page: ${loadedUrl}`);
  }

  return contentFrame;
}
|
|
|
|
/**
 * Reads the member table (`table.FdcLayList`) from the given frame.
 *
 * If the frame shows the search form without a result table, the form is
 * submitted first. Rows lacking a Standesbuch number, first name or last name
 * are skipped.
 *
 * @param frame Frame showing the member list or its search form.
 * @returns One entry per usable table row, with dates converted by parseDate.
 * @throws When `table.FdcLayList` does not appear within 20 seconds.
 */
async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
  log(`Scraping member list from: ${frame.url()}`);

  // The page may land on the search form before any results are shown.
  const searchForm = await frame.$('form[name="frmsearch"]');
  const resultTable = await frame.$('table.FdcLayList');
  if (searchForm !== null && resultTable === null) {
    log('Search form found without results — submitting...');
    await frame.evaluate(() => {
      // Runs inside the browser page; `forms` is indexed by form name.
      (document.forms as any)['frmsearch'].submit();
    });
    await frame.waitForLoadState('networkidle');
    log(`After form submit: ${frame.url()}`);
  }

  // Diagnostic summary of every table on the page: class name and row count.
  const tableSummaries = await frame.$$eval('table', (tables) =>
    tables.map((table) => {
      const label = table.className || '(no-class)';
      return `${label}[${table.querySelectorAll('tr').length}rows]`;
    }),
  );
  log(`Tables: ${tableSummaries.join(', ') || 'none'}`);

  await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });

  // Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
  // 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum,
  // 9=Abmeldedatum, 10=icon.
  // Each <td> wraps its value in <a title="value">; the title is preferred
  // over the cell text. The first link in the row supplies the detail URL.
  const rawRows = await frame.$$eval('table.FdcLayList tbody tr', (tableRows) =>
    tableRows.map((tableRow) => {
      const cells = Array.from(tableRow.querySelectorAll('td'));

      const read = (index: number): string => {
        const cell = cells[index];
        const titled = cell?.querySelector('a')?.getAttribute('title');
        return (titled ?? cell?.textContent ?? '').trim();
      };

      const firstLink: HTMLAnchorElement | null = tableRow.querySelector('a');

      return {
        status: read(1), // scraped, but not consumed below
        standesbuchNr: read(2),
        dienstgrad: read(3),
        vorname: read(4),
        zuname: read(5),
        geburtsdatum: read(6),
        svnr: read(7),
        eintrittsdatum: read(8),
        abmeldedatum: read(9),
        href: firstLink?.href ?? null,
      };
    }),
  );

  log(`Parsed ${rawRows.length} rows from member table`);

  return rawRows
    .filter((raw) => Boolean(raw.standesbuchNr && raw.vorname && raw.zuname))
    .map((raw): FdiskMember => {
      const abmeldedatum = parseDate(raw.abmeldedatum);
      return {
        standesbuchNr: raw.standesbuchNr,
        dienstgrad: raw.dienstgrad,
        vorname: raw.vorname,
        zuname: raw.zuname,
        geburtsdatum: parseDate(raw.geburtsdatum),
        svnr: raw.svnr || null,
        eintrittsdatum: parseDate(raw.eintrittsdatum),
        abmeldedatum,
        // Status is derived from the deregistration date, not from column 1.
        status: abmeldedatum ? 'ausgetreten' : 'aktiv',
        detailUrl: raw.href,
      };
    });
}
|
|
|
|
/**
 * Opens a member's detail page in the given frame and extracts the rows of
 * the first table that looks like an Ausbildung (qualification) list.
 *
 * A table qualifies when one cell of its first row contains "kurs",
 * "ausbildung" or "bezeichnung" (case-insensitive); that cell's column
 * supplies the course name. The remaining columns are located by header
 * keywords as well; a column that cannot be found yields null for its field.
 *
 * @param frame  Frame to navigate; it is left on the member's detail page.
 * @param member Member whose `detailUrl` is opened.
 * @returns The parsed qualifications; empty when the member has no detail URL
 *          or no matching table is found.
 */
async function scrapeMemberAusbildung(frame: Frame, member: FdiskMember): Promise<FdiskAusbildung[]> {
  if (!member.detailUrl) return [];

  await frame.goto(member.detailUrl, { waitUntil: 'networkidle' });

  // NOTE(review): Playwright's `text=` engine presumably does not treat the
  // comma as a selector-list separator, so this probably searches for the
  // literal text "Ausbildung, text=Ausbildungsliste" and never matches. That
  // would mean the link below is always tried. Left unchanged on purpose:
  // correcting the selector would change which pages get the click. Confirm
  // against the live pages before touching it.
  const ausbildungSection = frame.locator('text=Ausbildung, text=Ausbildungsliste').first();
  const sectionVisible = await ausbildungSection.isVisible().catch(() => false);

  if (!sectionVisible) {
    // Try an Ausbildung tab/link if one is present.
    const ausbildungLink = frame.locator('a:has-text("Ausbildung")').first();
    const linkVisible = await ausbildungLink.isVisible().catch(() => false);
    if (linkVisible) {
      await ausbildungLink.click();
      // Best effort: a failed wait must not stop the table scan below.
      await frame.waitForLoadState('networkidle').catch(() => {});
    }
  }

  // Expected columns: Kursname, Datum, Ablaufdatum, Ort, Bemerkung (may vary).
  const ausbildungen: FdiskAusbildung[] = [];
  const tables = await frame.$$('table');

  for (const table of tables) {
    // One string[] per <tr>, holding the trimmed text of each td/th.
    const rows: string[][] = await table.$$eval('tr', (trs) =>
      trs.map((tr) =>
        Array.from(tr.querySelectorAll('td, th')).map((cell) => cell.textContent?.trim() ?? ''),
      ),
    );

    const [headerRow, ...dataRows] = rows;
    if (!headerRow || dataRows.length === 0) continue;

    const header = headerRow.map((cell) => cell.toLowerCase());

    // Index of the first header cell containing any keyword, ignoring `skip`.
    const findColumn = (keywords: string[], skip = -1): number =>
      header.findIndex((h, i) => i !== skip && keywords.some((keyword) => h.includes(keyword)));

    const kursnameIdx = findColumn(['kurs', 'ausbildung', 'bezeichnung']);
    if (kursnameIdx < 0) continue; // not an Ausbildung table

    // Resolve the expiry column first and exclude it from the course-date
    // lookup: "Ablaufdatum" also contains "datum" and would otherwise be
    // taken for the course date whenever it precedes (or replaces) "Datum".
    const ablaufIdx = findColumn(['ablauf', 'gültig']);
    const datumIdx = findColumn(['datum', 'abschluss'], ablaufIdx);
    const ortIdx = findColumn(['ort']);
    const bemIdx = findColumn(['bem', 'info']);

    for (const cells of dataRows) {
      const kursname = cellText(cells[kursnameIdx]);
      if (!kursname) continue;

      const kursDatum = parseDate(datumIdx >= 0 ? cells[datumIdx] : null);

      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(ablaufIdx >= 0 ? cells[ablaufIdx] : null),
        ort: ortIdx >= 0 ? cellText(cells[ortIdx]) : null,
        bemerkung: bemIdx >= 0 ? cellText(cells[bemIdx]) : null,
        // Member number + course name + course date identify the entry.
        syncKey: `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`,
      });
    }

    break; // only process the first Ausbildung table found
  }

  return ausbildungen;
}
|