Server npm proxy silently drops devDependencies, making TypeScript unavailable in Docker. Solution: compile locally and commit dist/. Dockerfile now only needs prod deps + Playwright, both of which install cleanly via the public registry. Also fix TS2688/TS2304 errors: add DOM to tsconfig lib and cast querySelectorAll results to Element inside $$eval callbacks. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
230 lines
10 KiB
JavaScript
230 lines
10 KiB
JavaScript
"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.scrapeAll = scrapeAll;
|
|
const test_1 = require("@playwright/test");
|
|
// Public FDISK host used when FDISK_BASE_URL is unset or blank.
const DEFAULT_BASE_URL = 'https://app.fdisk.at';
/**
 * Normalise a configured base URL.
 *
 * Blank or missing values fall back to DEFAULT_BASE_URL (an exported-but-empty
 * environment variable would otherwise yield relative URLs), and trailing
 * slashes are removed so that appending "/fdisk/..." never produces "//".
 *
 * @param {string | undefined | null} raw - Value read from the environment.
 * @returns {string} Base URL without a trailing slash.
 */
function resolveBaseUrl(raw) {
    const cleaned = (raw ?? '').trim().replace(/\/+$/, '');
    return cleaned || DEFAULT_BASE_URL;
}
// Root of the FDISK installation; override with the FDISK_BASE_URL environment variable.
const BASE_URL = resolveBaseUrl(process.env.FDISK_BASE_URL);
// Page that login() opens first.
const LOGIN_URL = `${BASE_URL}/fdisk/`;
// Page that scrapeMembers() opens to read the member list.
const MEMBERS_URL = `${BASE_URL}/fdisk/module/vws/Start.aspx`;
|
|
/**
 * Print a message through console.log, prefixed with "[scraper]" and the
 * current time in ISO-8601 form.
 *
 * @param {string} msg - Text to print after the timestamp.
 */
function log(msg) {
    const timestamp = new Date().toISOString();
    console.log(`[scraper] ${timestamp} ${msg}`);
}
|
|
/**
 * Parse a date string from FDISK (DD.MM.YYYY) to ISO format (YYYY-MM-DD).
 *
 * Surrounding whitespace is ignored and one-digit day/month values are
 * zero-padded in the result. The value must name a real calendar day:
 * month 1-12 and a day that exists in that month (leap years included).
 *
 * @param {string | null | undefined} raw - Cell text as scraped.
 * @returns {string | null} ISO date, or null if the input is not a string,
 *   is empty, does not match the pattern, or is not a valid calendar date.
 */
function parseDate(raw) {
    if (typeof raw !== 'string')
        return null;
    const match = raw.trim().match(/^(\d{1,2})\.(\d{1,2})\.(\d{4})$/);
    if (!match)
        return null;
    const [, dayText, monthText, yearText] = match;
    const day = Number(dayText);
    const month = Number(monthText);
    const year = Number(yearText);
    if (month < 1 || month > 12)
        return null;
    // Computed by hand rather than via Date, which remaps years 0-99 to 1900-1999.
    const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
    const daysPerMonth = [31, isLeapYear ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    if (day < 1 || day > daysPerMonth[month - 1])
        return null;
    return `${yearText}-${monthText.padStart(2, '0')}-${dayText.padStart(2, '0')}`;
}
|
|
/**
 * Normalise the text of a table cell.
 *
 * @param {string | null | undefined} text - Raw cell text.
 * @returns {string | null} The text without surrounding whitespace, or null
 *   when the input is null/undefined or contains only whitespace.
 */
function cellText(text) {
    if (text === null || text === undefined)
        return null;
    const stripped = text.trim();
    return stripped.length > 0 ? stripped : null;
}
|
|
/**
 * Log in to FDISK and scrape the member list plus each member's Ausbildung
 * (qualification) entries.
 *
 * A failure while scraping one member's qualifications is logged as a warning
 * and does not abort the run. Errors from browser setup, login() or
 * scrapeMembers() propagate to the caller. The browser is closed in every case.
 *
 * @param {string} username - FDISK login name.
 * @param {string} password - FDISK password.
 * @returns {Promise<{members: object[], ausbildungen: object[]}>} Scraped data.
 */
async function scrapeAll(username, password) {
    const browser = await test_1.chromium.launch({ headless: true });
    try {
        // Context and page are created inside the try block so that a failure
        // here still reaches browser.close() and does not leak the process.
        const context = await browser.newContext({
            userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        });
        const page = await context.newPage();
        await login(page, username, password);
        const members = await scrapeMembers(page);
        log(`Found ${members.length} members`);
        const ausbildungen = [];
        for (const member of members) {
            if (!member.detailUrl)
                continue;
            try {
                const quals = await scrapeMemberAusbildung(page, member);
                ausbildungen.push(...quals);
                log(`  ${member.vorname} ${member.zuname}: ${quals.length} Ausbildungen`);
                // polite delay between requests
                await page.waitForTimeout(500);
            }
            catch (err) {
                // Best effort per member: report and continue with the next one.
                log(`  WARN: could not scrape Ausbildung for ${member.vorname} ${member.zuname}: ${err}`);
            }
        }
        return { members, ausbildungen };
    }
    finally {
        await browser.close();
    }
}
|
|
/**
 * Open LOGIN_URL, fill in the credentials and submit the login form.
 *
 * The form fields are located with generic selectors (first text/user input,
 * first password input, first submit control); adjust them if login fails.
 *
 * @param {object} page - Playwright page to drive.
 * @param {string} username - Value for the user name field.
 * @param {string} password - Value for the password field.
 * @returns {Promise<void>}
 * @throws {Error} When the URL after submitting still contains "login"/"Login"
 *   or equals LOGIN_URL.
 */
async function login(page, username, password) {
    log(`Navigating to ${LOGIN_URL}`);
    await page.goto(LOGIN_URL, { waitUntil: 'networkidle' });
    // ASP.NET WebForms login form: match the user field by type or by a
    // "user"/"User" fragment in its name or id.
    const userSelector = [
        'input[type="text"]',
        'input[name*="user"]',
        'input[name*="User"]',
        'input[id*="user"]',
        'input[id*="User"]',
    ].join(', ');
    await page.locator(userSelector).first().fill(username);
    await page.locator('input[type="password"]').first().fill(password);
    // Start waiting for the navigation before clicking so it cannot be missed.
    const submit = page.locator('input[type="submit"], button[type="submit"]').first();
    const navigation = page.waitForNavigation({ waitUntil: 'networkidle' });
    await Promise.all([navigation, submit.click()]);
    // A successful login is expected to leave the login page.
    const landedUrl = page.url();
    const mentionsLogin = ['login', 'Login'].some((marker) => landedUrl.includes(marker));
    if (mentionsLogin || landedUrl === LOGIN_URL) {
        throw new Error(`Login failed — still on login page: ${landedUrl}`);
    }
    log(`Logged in successfully, redirected to: ${landedUrl}`);
}
|
|
/**
 * Find the column index of every member field in a header row.
 *
 * @param {string[]} headerCells - Lower-cased, trimmed header cell texts.
 * @returns {Record<string, number>} Index per field; -1 where no header matched.
 */
function detectMemberColumns(headerCells) {
    const firstMatching = (...needles) => headerCells.findIndex((h) => needles.some((needle) => h.includes(needle)));
    // Prefer the explicit "St.-Nr." spellings; the generic "nr" fallback must
    // not land on the SVNR column.
    let standesbuchNr = firstMatching('st.-nr', 'stnr');
    if (standesbuchNr === -1) {
        standesbuchNr = headerCells.findIndex((h) => h.includes('nr') && !h.includes('svnr') && !h.includes('sv-nr'));
    }
    return {
        standesbuchNr,
        dienstgrad: firstMatching('dienstgrad'),
        vorname: firstMatching('vorname'),
        zuname: firstMatching('zuname', 'nachname'),
        geburtsdatum: firstMatching('geburt'),
        svnr: firstMatching('svnr', 'sv-nr'),
        eintrittsdatum: firstMatching('eintritt'),
        abmeldedatum: firstMatching('abmeld'),
    };
}
/**
 * Open MEMBERS_URL and read the member list from the HTML tables on the page.
 *
 * Columns are located by header text. The header is the first row that has
 * separate "vorname" and "zuname"/"nachname" cells; if none exists the first
 * row on the page is used. Columns that cannot be detected fall back to the
 * positions Status(0), St.-Nr.(1), Dienstgrad(2), Vorname(3), Zuname(4),
 * Geburtsdatum(5), SVNR(6), Eintrittsdatum(7), Abmeldedatum(8).
 *
 * @param {object} page - Logged-in Playwright page.
 * @returns {Promise<object[]>} One record per row that has St.-Nr., Vorname and
 *   Zuname; empty when no row qualifies. `status` is 'ausgetreten' when an
 *   Abmeldedatum parses, otherwise 'aktiv'.
 */
async function scrapeMembers(page) {
    log(`Navigating to members list: ${MEMBERS_URL}`);
    await page.goto(MEMBERS_URL, { waitUntil: 'networkidle' });
    // ASP.NET GridView renders as an HTML table — wait until one exists.
    await page.waitForSelector('table', { timeout: 15000 });
    // This callback runs in the browser, so it must not reference outer variables.
    const rows = await page.$$eval('table tr', (trs) => trs.map((tr) => {
        const textOf = (cell) => cell.textContent?.trim() ?? '';
        const link = tr.querySelector('a');
        return {
            cells: Array.from(tr.querySelectorAll('td')).map(textOf),
            headerCells: Array.from(tr.querySelectorAll('th, td')).map((cell) => textOf(cell).toLowerCase()),
            href: link?.href ?? null,
            // A row that contains further rows wraps a nested table (layout or
            // pager); its cell list mixes several rows and is not usable.
            wrapsNestedRows: tr.querySelector('tr') !== null,
        };
    }));
    // Search every row for the header instead of assuming it is the first row
    // of the first table on the page.
    const headerIdx = rows.findIndex((row) => {
        if (row.wrapsNestedRows)
            return false;
        const vornameAt = row.headerCells.findIndex((h) => h.includes('vorname'));
        const zunameAt = row.headerCells.findIndex((h) => h.includes('zuname') || h.includes('nachname'));
        return vornameAt !== -1 && zunameAt !== -1 && vornameAt !== zunameAt;
    });
    const headerCells = (headerIdx === -1 ? rows[0] : rows[headerIdx])?.headerCells ?? [];
    const colIdx = detectMemberColumns(headerCells);
    log(`Detected columns: ${JSON.stringify(colIdx)}`);
    // Fallback to positional columns if detection failed
    const positionalDefaults = {
        standesbuchNr: 1,
        dienstgrad: 2,
        vorname: 3,
        zuname: 4,
        geburtsdatum: 5,
        svnr: 6,
        eintrittsdatum: 7,
        abmeldedatum: 8,
    };
    for (const [field, position] of Object.entries(positionalDefaults)) {
        if (colIdx[field] === -1)
            colIdx[field] = position;
    }
    const members = [];
    for (const [index, row] of rows.entries()) {
        // The detected header row is never a member, even when built from <td> cells.
        if (index === headerIdx || row.wrapsNestedRows)
            continue;
        const { cells, href } = row;
        // Skip <th>-only header rows and rows too short to be member rows
        if (cells.length < 5)
            continue;
        const stnr = cellText(cells[colIdx.standesbuchNr]);
        const vorname = cellText(cells[colIdx.vorname]);
        const zuname = cellText(cells[colIdx.zuname]);
        if (!stnr || !vorname || !zuname)
            continue;
        const abmeldedatum = parseDate(cells[colIdx.abmeldedatum]);
        members.push({
            standesbuchNr: stnr,
            dienstgrad: cellText(cells[colIdx.dienstgrad]) ?? '',
            vorname,
            zuname,
            geburtsdatum: parseDate(cells[colIdx.geburtsdatum]),
            svnr: cellText(cells[colIdx.svnr]),
            eintrittsdatum: parseDate(cells[colIdx.eintrittsdatum]),
            abmeldedatum,
            status: abmeldedatum ? 'ausgetreten' : 'aktiv',
            // NOTE(review): if the grid links are "javascript:" postbacks this is
            // not a URL page.goto() can open — confirm against the live page.
            detailUrl: href,
        });
    }
    return members;
}
|
|
/**
 * Open a member's detail page and read the Ausbildung (qualification) table.
 *
 * Only the first table whose header row mentions "kurs", "ausbildung" or
 * "bezeichnung" is parsed. Expected columns: Kursname, Datum, Ablaufdatum,
 * Ort, Bemerkung (may vary); optional columns that are missing yield null.
 *
 * @param {object} page - Logged-in Playwright page.
 * @param {object} member - Member record; `detailUrl` and `standesbuchNr` are read.
 * @returns {Promise<object[]>} Qualification records, empty when the member has
 *   no detail URL or no matching table is found.
 */
async function scrapeMemberAusbildung(page, member) {
    if (!member.detailUrl)
        return [];
    await page.goto(member.detailUrl, { waitUntil: 'networkidle' });
    // Open the Ausbildung tab/link when one is visible. An earlier probe for a
    // section heading used the selector `text=Ausbildung, text=Ausbildungsliste`;
    // Playwright's text engine takes no comma-separated alternatives, so that
    // probe could not match and the link was always tried. That effective
    // behaviour is kept here.
    const ausbildungLink = page.locator('a:has-text("Ausbildung")').first();
    const hasLink = await ausbildungLink.isVisible().catch(() => false);
    if (hasLink) {
        await Promise.all([
            // The link may update the page without navigating; in that case the
            // wait ends when the navigation timeout elapses and is ignored.
            page.waitForNavigation({ waitUntil: 'networkidle' }).catch(() => { }),
            ausbildungLink.click(),
        ]);
    }
    // Read every table in one browser round trip; no element handles are kept.
    const tables = await page.$$eval('table', (tableEls) => tableEls.map((tableEl) => Array.from(tableEl.querySelectorAll('tr')).map((tr) => Array.from(tr.querySelectorAll('td, th')).map((cell) => cell.textContent?.trim() ?? ''))));
    const ausbildungen = [];
    for (const rows of tables) {
        if (rows.length < 2)
            continue;
        // Detect if this looks like an Ausbildung table
        const header = rows[0].map((c) => c.toLowerCase());
        const kursnameIdx = header.findIndex((h) => h.includes('kurs') || h.includes('ausbildung') || h.includes('bezeichnung'));
        if (kursnameIdx === -1)
            continue;
        const ablaufIdx = header.findIndex((h) => h.includes('ablauf') || h.includes('gültig'));
        // "Ablaufdatum" also contains "datum"; never use the expiry column as the course date.
        const datumIdx = header.findIndex((h, i) => i !== ablaufIdx && (h.includes('datum') || h.includes('abschluss')));
        const ortIdx = header.findIndex((h) => h.includes('ort'));
        const bemIdx = header.findIndex((h) => h.includes('bem') || h.includes('info'));
        for (const cells of rows.slice(1)) {
            const kursname = cellText(cells[kursnameIdx]);
            if (!kursname)
                continue;
            const kursDatum = parseDate(datumIdx >= 0 ? cells[datumIdx] : null);
            // Key built from member number, course name and course date.
            const syncKey = `${member.standesbuchNr}::${kursname}::${kursDatum ?? ''}`;
            ausbildungen.push({
                standesbuchNr: member.standesbuchNr,
                kursname,
                kursDatum,
                ablaufdatum: parseDate(ablaufIdx >= 0 ? cells[ablaufIdx] : null),
                ort: ortIdx >= 0 ? cellText(cells[ortIdx]) : null,
                bemerkung: bemIdx >= 0 ? cellText(cells[bemIdx]) : null,
                syncKey,
            });
        }
        break; // only process the first Ausbildung table found
    }
    return ausbildungen;
}
|