252 lines
9.5 KiB
TypeScript
252 lines
9.5 KiB
TypeScript
import { chromium, Page } from '@playwright/test';
|
|
import { FdiskMember, FdiskAusbildung } from './types';
|
|
|
|
/**
 * Origin of the FDISK web application, taken from the FDISK_BASE_URL
 * environment variable when it is set to a non-blank value.
 *
 * The value is trimmed and stripped of trailing slashes so that the derived
 * URLs below never contain a double slash. `||` (not `??`) is deliberate: a
 * blank or slash-only override must fall back to the default instead of
 * producing relative URLs.
 */
const BASE_URL =
  (process.env.FDISK_BASE_URL ?? '').trim().replace(/\/+$/, '') || 'https://app.fdisk.at';

/** Page that `login` opens to find and submit the credential form. */
const LOGIN_URL = `${BASE_URL}/fdisk/`;

/** Page that `scrapeMembers` opens to read the member table. */
const MEMBERS_URL = `${BASE_URL}/fdisk/module/vws/Start.aspx`;
|
|
|
|
/**
 * Print a progress message via `console.log`, prefixed with the `[scraper]`
 * tag and the current time in ISO 8601 (UTC) form.
 *
 * @param msg Text to append after the prefix.
 */
function log(msg: string): void {
  const timestamp = new Date().toISOString();
  console.log(`[scraper] ${timestamp} ${msg}`);
}
|
|
|
|
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored. Day and month may be written with one
 * or two digits; the result is always zero-padded. The date must exist in
 * the calendar, so values such as `31.02.2023` or `99.99.2020` are rejected.
 *
 * @param raw Cell text as scraped, possibly null/undefined/blank.
 * @returns The ISO date string, or null if empty or unparseable.
 */
function parseDate(raw: string | null | undefined): string | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(trimmed);
  if (!match) return null;

  const [, dayText = '', monthText = '', yearText = ''] = match;
  const day = Number(dayText);
  const month = Number(monthText);
  const year = Number(yearText);

  // Round-trip through Date to reject impossible dates: out-of-range parts
  // roll over into a different day/month/year and no longer match the input.
  // setUTCFullYear is used instead of Date.UTC because Date.UTC remaps the
  // years 0-99 to 1900-1999.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  const existsInCalendar =
    probe.getUTCFullYear() === year &&
    probe.getUTCMonth() === month - 1 &&
    probe.getUTCDate() === day;
  if (!existsInCalendar) return null;

  return `${yearText}-${monthText.padStart(2, '0')}-${dayText.padStart(2, '0')}`;
}
|
|
|
|
/**
 * Normalize the text of a table cell.
 *
 * @param text Raw cell text; may be null or undefined when the cell is missing.
 * @returns The text with surrounding whitespace removed, or null when the
 *          input is missing or contains only whitespace.
 */
function cellText(text: string | undefined | null): string | null {
  const trimmed = text?.trim();
  return trimmed ? trimmed : null;
}
|
|
|
|
/**
 * Log in to FDISK with a headless Chromium browser, read the member list and
 * then visit each member's detail page to collect their Ausbildung records.
 *
 * A failure while scraping one member's Ausbildung data is logged as a
 * warning and does not abort the run; failures during login or while reading
 * the member list propagate to the caller. The browser is closed in every
 * case.
 *
 * @param username FDISK login name.
 * @param password FDISK password.
 * @returns All scraped members and the Ausbildung entries of those members
 *          that have a detail URL.
 */
export async function scrapeAll(username: string, password: string): Promise<{
  members: FdiskMember[];
  ausbildungen: FdiskAusbildung[];
}> {
  const browser = await chromium.launch({ headless: true });

  // Everything after the launch lives inside try/finally so that a failure
  // while creating the context or page cannot leak the browser process.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    await login(page, username, password);
    const members = await scrapeMembers(page);
    log(`Found ${members.length} members`);

    const ausbildungen: FdiskAusbildung[] = [];
    for (const member of members) {
      // Without a detail link there is no page to read qualifications from.
      if (!member.detailUrl) continue;

      try {
        const quals = await scrapeMemberAusbildung(page, member);
        ausbildungen.push(...quals);
        log(`  ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
        // polite delay between requests
        await page.waitForTimeout(500);
      } catch (err) {
        log(`  WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }

    return { members, ausbildungen };
  } finally {
    await browser.close();
  }
}
|
|
|
|
/**
 * Open the FDISK login page, fill in the credentials and submit the form.
 *
 * The form fields are located heuristically (first text/user-like input,
 * first password input, first submit control), so the selectors may need
 * adjusting if the login page markup differs.
 *
 * @param page Playwright page to drive.
 * @param username FDISK login name.
 * @param password FDISK password.
 * @throws Error when, after submitting, the page URL still equals LOGIN_URL
 *         or contains "login" (in any letter case).
 */
async function login(page: Page, username: string, password: string): Promise<void> {
  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'networkidle' });

  // ASP.NET WebForms login — try common selector patterns
  // Adjust these selectors if login fails
  const usernameField = page
    .locator('input[type="text"], input[name*="user"], input[name*="User"], input[id*="user"], input[id*="User"]')
    .first();
  const passwordField = page.locator('input[type="password"]').first();

  await usernameField.fill(username);
  await passwordField.fill(password);

  // Submit — look for a login/submit button
  const submitButton = page.locator('input[type="submit"], button[type="submit"]').first();
  // The navigation wait is set up together with the click so that the
  // navigation triggered by the click cannot be missed.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle' }),
    submitButton.click(),
  ]);

  // Verify we're logged in by checking we're not still on the login page.
  // The comparison ignores letter case so that spellings such as "LogIn"
  // or "LOGIN" in the URL are also recognized.
  const currentUrl = page.url();
  const stillOnLoginPage =
    currentUrl === LOGIN_URL || currentUrl.toLowerCase().includes('login');
  if (stillOnLoginPage) {
    throw new Error(`Login failed — still on login page: ${currentUrl}`);
  }
  log(`Logged in successfully, redirected to: ${currentUrl}`);
}
|
|
|
|
/**
 * Open the member list page and convert its table rows into FdiskMember
 * records.
 *
 * Column positions are detected from the header texts of the first
 * `table tr:first-child` row; any column that cannot be detected (except
 * `status`, which is only logged) falls back to a fixed position. Rows with
 * fewer than five cells, or without St.-Nr., Vorname or Zuname, are skipped.
 * A member's status is derived from the Abmeldedatum: present means
 * 'ausgetreten', absent means 'aktiv'.
 *
 * @param page Logged-in Playwright page.
 * @returns One entry per usable table row; `detailUrl` is the href of the
 *          first link in the row, or null when the row has no link.
 */
async function scrapeMembers(page: Page): Promise<FdiskMember[]> {
  log(`Navigating to members list: ${MEMBERS_URL}`);
  await page.goto(MEMBERS_URL, { waitUntil: 'networkidle' });

  // Wait for the member table to appear
  // ASP.NET GridView renders as an HTML table — find the data table
  await page.waitForSelector('table', { timeout: 15000 });

  // Collect the cell texts and first link of every table row on the page.
  // Columns: Status, St.-Nr., Dienstgrad, Vorname, Zuname, Geburtsdatum, SVNR, Eintrittsdatum, Abmeldedatum
  const rows = await page.$$eval('table tr', (trs) =>
    trs.map((tr) => {
      const link = tr.querySelector('a');
      return {
        cells: Array.from(tr.querySelectorAll('td')).map((c) => c.textContent?.trim() ?? ''),
        href: link?.href ?? null,
      };
    }),
  );

  // Find the header row to determine column positions
  const headerRow = await page.$eval('table tr:first-child', (row) => {
    const cells = Array.from(row.querySelectorAll('th, td'));
    return cells.map((c) => c.textContent?.trim().toLowerCase() ?? '');
  });

  /** Index of the first header containing any of the given fragments, or -1. */
  const findColumn = (...needles: string[]): number =>
    headerRow.findIndex((h) => needles.some((needle) => h.includes(needle)));

  // Prefer the explicit St.-Nr. spellings. The generic "nr" fallback skips
  // headers containing "sv" so that the SVNR column is never mistaken for
  // the Standesbuch number.
  const explicitStNr = findColumn('st.-nr', 'stnr');
  const standesbuchNrIdx =
    explicitStNr !== -1
      ? explicitStNr
      : headerRow.findIndex((h) => h.includes('nr') && !h.includes('sv'));

  // Detect column indices from headers
  const colIdx = {
    status: findColumn('status'),
    standesbuchNr: standesbuchNrIdx,
    dienstgrad: findColumn('dienstgrad'),
    vorname: findColumn('vorname'),
    zuname: findColumn('zuname', 'nachname'),
    geburtsdatum: findColumn('geburt'),
    svnr: findColumn('svnr', 'sv-nr'),
    eintrittsdatum: findColumn('eintritt'),
    abmeldedatum: findColumn('abmeld'),
  };

  log(`Detected columns: ${JSON.stringify(colIdx)}`);

  // Fallback to positional columns if detection failed
  // Based on screenshot: Status(0), St.-Nr.(1), Dienstgrad(2), Vorname(3), Zuname(4),
  // Geburtsdatum(5), SVNR(6), Eintrittsdatum(7), Abmeldedatum(8)
  if (colIdx.standesbuchNr === -1) colIdx.standesbuchNr = 1;
  if (colIdx.dienstgrad === -1) colIdx.dienstgrad = 2;
  if (colIdx.vorname === -1) colIdx.vorname = 3;
  if (colIdx.zuname === -1) colIdx.zuname = 4;
  if (colIdx.geburtsdatum === -1) colIdx.geburtsdatum = 5;
  if (colIdx.svnr === -1) colIdx.svnr = 6;
  if (colIdx.eintrittsdatum === -1) colIdx.eintrittsdatum = 7;
  if (colIdx.abmeldedatum === -1) colIdx.abmeldedatum = 8;

  const members: FdiskMember[] = [];

  for (const { cells, href } of rows) {
    // Skip header rows and empty rows
    if (cells.length < 5) continue;

    const stnr = cellText(cells[colIdx.standesbuchNr]);
    const vorname = cellText(cells[colIdx.vorname]);
    const zuname = cellText(cells[colIdx.zuname]);
    if (!stnr || !vorname || !zuname) continue;

    const abmeldedatum = parseDate(cells[colIdx.abmeldedatum]);

    members.push({
      standesbuchNr: stnr,
      dienstgrad: cellText(cells[colIdx.dienstgrad]) ?? '',
      vorname,
      zuname,
      geburtsdatum: parseDate(cells[colIdx.geburtsdatum]),
      svnr: cellText(cells[colIdx.svnr]),
      eintrittsdatum: parseDate(cells[colIdx.eintrittsdatum]),
      abmeldedatum,
      // A member with a deregistration date counts as having left.
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: href,
    });
  }

  return members;
}
|
|
|
|
/**
 * Open a member's detail page and read their Ausbildung (qualification)
 * entries from the first table whose header row mentions "kurs",
 * "ausbildung" or "bezeichnung".
 *
 * If no heading-like element containing "Ausbildung" is visible on the
 * detail page, the first link containing "Ausbildung" is clicked (when
 * visible) before the tables are read.
 *
 * @param page Logged-in Playwright page.
 * @param member Member whose `detailUrl` is visited; `standesbuchNr` is
 *               copied into every result and into its `syncKey`.
 * @returns The parsed entries; empty when the member has no detail URL or no
 *          matching table is found.
 */
async function scrapeMemberAusbildung(page: Page, member: FdiskMember): Promise<FdiskAusbildung[]> {
  if (!member.detailUrl) return [];

  await page.goto(member.detailUrl, { waitUntil: 'networkidle' });

  // Look for an Ausbildung / Ausbildungsliste section by heading text.
  // A comma only forms a union inside a CSS selector, so the heading
  // candidates are written as a CSS list with :has-text (a case-insensitive
  // substring match, which therefore also covers "Ausbildungsliste").
  const headingSelector = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'caption']
    .map((tag) => `${tag}:has-text("Ausbildung")`)
    .join(', ');
  const ausbildungSection = page.locator(headingSelector).first();
  const hasSec = await ausbildungSection.isVisible().catch(() => false);

  if (!hasSec) {
    // Try navigating to an Ausbildung tab/link if present
    const ausbildungLink = page.locator('a:has-text("Ausbildung")').first();
    const hasLink = await ausbildungLink.isVisible().catch(() => false);
    if (hasLink) {
      // The click may not cause a navigation; in that case the wait rejects
      // and the rejection is deliberately ignored.
      await Promise.all([
        page.waitForNavigation({ waitUntil: 'networkidle' }).catch(() => {}),
        ausbildungLink.click(),
      ]);
    }
  }

  // Parse the qualification table
  // Expected columns: Kursname, Datum, Ablaufdatum, Ort, Bemerkung (may vary)
  const tables = await page.$$('table');
  const ausbildungen: FdiskAusbildung[] = [];

  const isNameHeader = (h: string): boolean =>
    h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung');
  const isExpiryHeader = (h: string): boolean => h.includes('ablauf') || h.includes('gültig');

  for (const table of tables) {
    const rows = await table.$$eval('tr', (trs) =>
      trs.map((tr) => ({
        cells: Array.from(tr.querySelectorAll('td, th')).map((c) => c.textContent?.trim() ?? ''),
      })),
    );

    // A header row plus at least one data row is required.
    const [headerRow, ...dataRows] = rows;
    if (!headerRow || dataRows.length === 0) continue;

    // Detect if this looks like an Ausbildung table
    const header = headerRow.cells.map((c) => c.toLowerCase());
    const kursnameIdx = header.findIndex(isNameHeader);
    if (kursnameIdx === -1) continue;

    // The course date column must not be an expiry column: "Ablaufdatum"
    // also contains "datum" and would otherwise be picked if it came first.
    const datumIdx = header.findIndex(
      (h) => !isExpiryHeader(h) && (h.includes('datum') || h.includes('abschluss')),
    );
    const ablaufIdx = header.findIndex(isExpiryHeader);
    const ortIdx = header.findIndex((h) => h.includes('ort'));
    const bemIdx = header.findIndex((h) => h.includes('bem') || h.includes('info'));

    for (const row of dataRows) {
      const kursname = cellText(row.cells[kursnameIdx]);
      if (!kursname) continue;

      const kursDatum = parseDate(datumIdx >= 0 ? row.cells[datumIdx] : null);
      // Key combining member number, course name and course date (empty
      // string when the date is unknown).
      const syncKey = `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`;

      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(ablaufIdx >= 0 ? row.cells[ablaufIdx] : null),
        ort: ortIdx >= 0 ? cellText(row.cells[ortIdx]) : null,
        bemerkung: bemIdx >= 0 ? cellText(row.cells[bemIdx]) : null,
        syncKey,
      });
    }

    break; // only process the first Ausbildung table found
  }

  return ausbildungen;
}
|