Files
dashboard/sync/dist/scraper.js
Matthias Hochmeister 02797554aa fix: commit pre-compiled dist and simplify sync Dockerfile
Server npm proxy silently drops devDependencies, making TypeScript
unavailable in Docker. Solution: compile locally and commit dist/.
Dockerfile now only needs prod deps + Playwright, both of which
install cleanly via the public registry.

Also fix TS2688/TS2304 errors: add DOM to tsconfig lib and cast
querySelectorAll results to Element inside $$eval callbacks.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-01 13:51:48 +01:00

230 lines
10 KiB
JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.scrapeAll = scrapeAll;
const test_1 = require("@playwright/test");
// Root URL of the FDISK web application; the FDISK_BASE_URL environment
// variable overrides the default when it is set.
const BASE_URL = process.env.FDISK_BASE_URL ?? 'https://app.fdisk.at';
// Page that login() opens before filling in the credentials.
const LOGIN_URL = `${BASE_URL}/fdisk/`;
// Page that scrapeMembers() opens to read the member table.
const MEMBERS_URL = `${BASE_URL}/fdisk/module/vws/Start.aspx`;
/**
 * Print one line to stdout, prefixed with "[scraper]" and the current
 * time as an ISO-8601 UTC timestamp.
 *
 * @param {*} msg - Message to print; it is interpolated into the line as a string.
 * @returns {void}
 */
function log(msg) {
  const timestamp = new Date().toISOString();
  console.log(`[scraper] ${timestamp} ${msg}`);
}
/**
 * Parse a date string from FDISK (DD.MM.YYYY, also accepting unpadded
 * D.M.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Returns null when the input is empty, is not a string, does not match the
 * expected pattern, or names a day that does not exist in the calendar
 * (e.g. "31.02.2024"), so callers never receive an invalid ISO date.
 *
 * @param {string|null|undefined} raw - Cell text as scraped from the page.
 * @returns {string|null} The date as YYYY-MM-DD, or null.
 */
function parseDate(raw) {
  if (typeof raw !== 'string') {
    return null;
  }
  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})$/.exec(raw.trim());
  if (!match) {
    return null;
  }
  const [, dayText, monthText, yearText] = match;
  const day = Number(dayText);
  const month = Number(monthText);
  const year = Number(yearText);
  // Round-trip through a UTC Date: out-of-range parts roll over into a
  // different day/month/year, which exposes impossible dates.
  // setUTCFullYear is used instead of Date.UTC because Date.UTC remaps
  // years 0-99 to 1900-1999.
  const probe = new Date(0);
  probe.setUTCFullYear(year, month - 1, day);
  const isRealDate = probe.getUTCFullYear() === year
    && probe.getUTCMonth() === month - 1
    && probe.getUTCDate() === day;
  if (!isRealDate) {
    return null;
  }
  return `${yearText}-${monthText.padStart(2, '0')}-${dayText.padStart(2, '0')}`;
}
/**
 * Normalise the text of a table cell.
 *
 * @param {string|null|undefined} text - Raw cell text; null/undefined count as empty.
 * @returns {string|null} The text without surrounding whitespace, or null
 *   when nothing is left after trimming.
 */
function cellText(text) {
  const source = text ?? '';
  const trimmed = source.trim();
  if (trimmed) {
    return trimmed;
  }
  return null;
}
/**
 * Log in to FDISK and scrape the member list plus, for every member that has
 * a detail link, that member's Ausbildung (qualification) entries.
 *
 * A failure while scraping one member's qualifications is logged as a
 * warning and does not abort the run; a failure during login or while
 * reading the member list rejects the returned promise.
 *
 * @param {string} username - FDISK login name.
 * @param {string} password - FDISK password.
 * @returns {Promise<{members: Array<object>, ausbildungen: Array<object>}>}
 *   All scraped members and the concatenated qualification records.
 */
async function scrapeAll(username, password) {
  const browser = await test_1.chromium.launch({ headless: true });
  // Everything after the launch lives inside try/finally so the browser
  // process is closed even if creating the context or the page fails.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();
    await login(page, username, password);
    const members = await scrapeMembers(page);
    log(`Found ${members.length} members`);
    const ausbildungen = [];
    for (const member of members) {
      if (!member.detailUrl) {
        continue;
      }
      try {
        const quals = await scrapeMemberAusbildung(page, member);
        ausbildungen.push(...quals);
        log(`  ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
        // polite delay between requests
        await page.waitForTimeout(500);
      }
      catch (err) {
        // Best effort: one broken detail page must not lose the whole sync.
        log(`  WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
      }
    }
    return { members, ausbildungen };
  }
  finally {
    await browser.close();
  }
}
/**
 * Open the FDISK login page, fill in the credentials and submit the form.
 *
 * The form fields are located heuristically (first text/user-like input,
 * first password input, first submit control), so the selectors may need
 * adjusting if the login page markup changes.
 *
 * @param {object} page - Playwright page to drive.
 * @param {string} username - Value typed into the user name field.
 * @param {string} password - Value typed into the password field.
 * @returns {Promise<void>}
 * @throws {Error} When the URL after submitting still contains "login" /
 *   "Login" or is exactly LOGIN_URL.
 */
async function login(page, username, password) {
  log(`Navigating to ${LOGIN_URL}`);
  await page.goto(LOGIN_URL, { waitUntil: 'networkidle' });

  // ASP.NET WebForms login — try common selector patterns.
  const userInput = page
    .locator('input[type="text"], input[name*="user"], input[name*="User"], input[id*="user"], input[id*="User"]')
    .first();
  const passInput = page.locator('input[type="password"]').first();
  await userInput.fill(username);
  await passInput.fill(password);

  // Start waiting for the navigation before clicking so the navigation
  // triggered by the click cannot be missed.
  const submit = page.locator('input[type="submit"], button[type="submit"]').first();
  const navigation = page.waitForNavigation({ waitUntil: 'networkidle' });
  await Promise.all([navigation, submit.click()]);

  // Still being on a login-looking URL means the credentials were rejected.
  const landedOn = page.url();
  const looksLikeLoginPage = ['login', 'Login'].some((marker) => landedOn.includes(marker));
  if (looksLikeLoginPage || landedOn === LOGIN_URL) {
    throw new Error(`Login failed — still on login page: ${landedOn}`);
  }
  log(`Logged in successfully, redirected to: ${landedOn}`);
}
/**
 * Positional fallback for each member column, used when the column cannot be
 * identified from the header labels.
 * Based on the observed layout: Status(0), St.-Nr.(1), Dienstgrad(2),
 * Vorname(3), Zuname(4), Geburtsdatum(5), SVNR(6), Eintrittsdatum(7),
 * Abmeldedatum(8).
 */
const MEMBER_COLUMN_FALLBACKS = Object.freeze({
  standesbuchNr: 1,
  dienstgrad: 2,
  vorname: 3,
  zuname: 4,
  geburtsdatum: 5,
  svnr: 6,
  eintrittsdatum: 7,
  abmeldedatum: 8,
});

/**
 * Map lower-cased header labels to column indices.
 *
 * @param {string[]} headerRow - Lower-cased, trimmed header cell texts.
 * @returns {Record<string, number>} Index per column, -1 where no label matched.
 */
function detectMemberColumns(headerRow) {
  const isSvnr = (h) => h.includes('svnr') || h.includes('sv-nr');
  return {
    status: headerRow.findIndex((h) => h.includes('status')),
    // The generic 'nr' match must not pick up the SVNR column, otherwise the
    // social security number would be used as Standesbuch number whenever
    // the St.-Nr. label itself is not recognised.
    standesbuchNr: headerRow.findIndex((h) => !isSvnr(h)
      && (h.includes('st.-nr') || h.includes('stnr') || h.includes('nr'))),
    dienstgrad: headerRow.findIndex((h) => h.includes('dienstgrad')),
    vorname: headerRow.findIndex((h) => h.includes('vorname')),
    zuname: headerRow.findIndex((h) => h.includes('zuname') || h.includes('nachname')),
    geburtsdatum: headerRow.findIndex((h) => h.includes('geburt')),
    svnr: headerRow.findIndex(isSvnr),
    eintrittsdatum: headerRow.findIndex((h) => h.includes('eintritt')),
    abmeldedatum: headerRow.findIndex((h) => h.includes('abmeld')),
  };
}

/**
 * Load the member list page and turn its table rows into member records.
 *
 * Rows with fewer than five <td> cells, rows lacking a Standesbuch number,
 * first name or last name, and a header row rendered with <td> cells are
 * skipped. A member counts as 'ausgetreten' when an Abmeldedatum could be
 * parsed, otherwise as 'aktiv'.
 *
 * NOTE(review): rows are collected from every table on the page while the
 * header labels come from the first row matched by 'table tr:first-child';
 * confirm that the member grid is the first table on the page.
 *
 * @param {object} page - Playwright page that is already logged in.
 * @returns {Promise<Array<object>>} Member records with the keys
 *   standesbuchNr, dienstgrad, vorname, zuname, geburtsdatum, svnr,
 *   eintrittsdatum, abmeldedatum, status and detailUrl.
 */
async function scrapeMembers(page) {
  log(`Navigating to members list: ${MEMBERS_URL}`);
  await page.goto(MEMBERS_URL, { waitUntil: 'networkidle' });
  // ASP.NET GridView renders as an HTML table — wait until one is present.
  await page.waitForSelector('table', { timeout: 15000 });

  // The callbacks below are evaluated inside the page, so they must not
  // reference anything from this module's scope.
  const rows = await page.$$eval('table tr', (trs) => trs.map((tr) => {
    const link = tr.querySelector('a');
    return {
      cells: Array.from(tr.querySelectorAll('td'), (c) => c.textContent?.trim() ?? ''),
      href: link?.href ?? null,
    };
  }));
  const headerRow = await page.$eval('table tr:first-child', (tr) => Array.from(
    tr.querySelectorAll('th, td'),
    (c) => c.textContent?.trim().toLowerCase() ?? '',
  ));

  const colIdx = detectMemberColumns(headerRow);
  log(`Detected columns: ${JSON.stringify(colIdx)}`);

  // Only trust headerRow as a label row when both name labels were found;
  // otherwise it may be an ordinary data row of a table without a header.
  const headerHasLabels = colIdx.vorname !== -1 && colIdx.zuname !== -1;
  const isHeaderRow = (cells) => headerHasLabels
    && cells.length === headerRow.length
    && cells.every((text, i) => text.toLowerCase() === headerRow[i]);

  // Fall back to positional columns where detection failed.
  for (const [key, position] of Object.entries(MEMBER_COLUMN_FALLBACKS)) {
    if (colIdx[key] === -1) {
      colIdx[key] = position;
    }
  }

  const members = [];
  for (const { cells, href } of rows) {
    // Skip <th>-only header rows, spacer rows and other short rows.
    if (cells.length < 5) {
      continue;
    }
    // A header rendered with <td> cells would otherwise become a "member".
    if (isHeaderRow(cells)) {
      continue;
    }
    const stnr = cellText(cells[colIdx.standesbuchNr]);
    const vorname = cellText(cells[colIdx.vorname]);
    const zuname = cellText(cells[colIdx.zuname]);
    if (!stnr || !vorname || !zuname) {
      continue;
    }
    const abmeldedatum = parseDate(cells[colIdx.abmeldedatum]);
    members.push({
      standesbuchNr: stnr,
      dienstgrad: cellText(cells[colIdx.dienstgrad]) ?? '',
      vorname,
      zuname,
      geburtsdatum: parseDate(cells[colIdx.geburtsdatum]),
      svnr: cellText(cells[colIdx.svnr]),
      eintrittsdatum: parseDate(cells[colIdx.eintrittsdatum]),
      abmeldedatum,
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: href,
    });
  }
  return members;
}
/**
 * Find the first table on the current page that looks like a qualification
 * table and convert its rows into Ausbildung records.
 *
 * A table qualifies when it has at least two rows and its first row has a
 * cell containing 'kurs', 'ausbildung' or 'bezeichnung'.
 * Expected columns: Kursname, Datum, Ablaufdatum, Ort, Bemerkung (may vary).
 *
 * @param {object} page - Playwright page showing a member's detail view.
 * @param {object} member - Member record; only standesbuchNr is read.
 * @returns {Promise<Array<object>|null>} Records of the first matching table
 *   (possibly empty), or null when no table on the page matches.
 */
async function readAusbildungTable(page, member) {
  const isCourseHeader = (h) => h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung');
  const isExpiryHeader = (h) => h.includes('ablauf') || h.includes('gültig');

  const tables = await page.$$('table');
  for (const table of tables) {
    // Evaluated inside the page; must not reference this module's scope.
    const rows = await table.$$eval('tr', (trs) => trs.map((tr) => Array.from(
      tr.querySelectorAll('td, th'),
      (c) => c.textContent?.trim() ?? '',
    )));
    if (rows.length < 2) {
      continue;
    }
    const header = rows[0].map((c) => c.toLowerCase());
    const kursnameIdx = header.findIndex(isCourseHeader);
    if (kursnameIdx === -1) {
      continue;
    }
    // "Ablaufdatum" also contains "datum"; exclude expiry headers so the
    // course date is never read from the expiry column.
    const datumIdx = header.findIndex((h) => !isExpiryHeader(h)
      && (h.includes('datum') || h.includes('abschluss')));
    const ablaufIdx = header.findIndex(isExpiryHeader);
    const ortIdx = header.findIndex((h) => h.includes('ort'));
    const bemIdx = header.findIndex((h) => h.includes('bem') || h.includes('info'));

    const ausbildungen = [];
    for (const cells of rows.slice(1)) {
      const kursname = cellText(cells[kursnameIdx]);
      if (!kursname) {
        continue;
      }
      const kursDatum = parseDate(datumIdx >= 0 ? cells[datumIdx] : null);
      ausbildungen.push({
        standesbuchNr: member.standesbuchNr,
        kursname,
        kursDatum,
        ablaufdatum: parseDate(ablaufIdx >= 0 ? cells[ablaufIdx] : null),
        ort: ortIdx >= 0 ? cellText(cells[ortIdx]) : null,
        bemerkung: bemIdx >= 0 ? cellText(cells[bemIdx]) : null,
        syncKey: `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`,
      });
    }
    return ausbildungen; // only process the first Ausbildung table found
  }
  return null;
}

/**
 * Scrape the qualification (Ausbildung) entries of one member.
 *
 * Opens the member's detail URL and reads the qualification table. If the
 * detail page itself shows no such table, the first visible link containing
 * the text "Ausbildung" is clicked and the resulting page is read instead.
 *
 * The section is detected by the presence of the table rather than by a
 * text locator: a comma-joined 'text=…, text=…' selector is parsed as one
 * literal text and therefore never matches, and a plain text match would
 * also match the navigation link itself.
 *
 * @param {object} page - Playwright page that is already logged in.
 * @param {object} member - Member record; detailUrl and standesbuchNr are read.
 * @returns {Promise<Array<object>>} Records with the keys standesbuchNr,
 *   kursname, kursDatum, ablaufdatum, ort, bemerkung and syncKey; empty when
 *   the member has no detail URL or no qualification table was found.
 */
async function scrapeMemberAusbildung(page, member) {
  if (!member.detailUrl) {
    return [];
  }
  await page.goto(member.detailUrl, { waitUntil: 'networkidle' });

  let ausbildungen = await readAusbildungTable(page, member);
  if (ausbildungen === null) {
    // Try navigating to an Ausbildung tab/link if present.
    const ausbildungLink = page.locator('a:has-text("Ausbildung")').first();
    const hasLink = await ausbildungLink.isVisible().catch(() => false);
    if (hasLink) {
      await Promise.all([
        // Best effort: the click may update the page without a navigation,
        // in which case the wait times out and is deliberately ignored.
        page.waitForNavigation({ waitUntil: 'networkidle' }).catch(() => { }),
        ausbildungLink.click(),
      ]);
      ausbildungen = await readAusbildungTable(page, member);
    }
  }
  return ausbildungen ?? [];
}