update
This commit is contained in:
@@ -73,10 +73,16 @@ export async function scrapeAll(username: string, password: string): Promise<{
|
||||
const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];
|
||||
|
||||
for (const member of members) {
|
||||
if (!member.detailUrl) continue;
|
||||
try {
|
||||
// Navigate to detail page and scrape all sub-sections
|
||||
await frame_goto(mainFrame, member.detailUrl);
|
||||
// Navigate to member detail page — use direct URL if available, else search+click fallback
|
||||
const onDetail = member.detailUrl
|
||||
? (await frame_goto(mainFrame, member.detailUrl), true)
|
||||
: await navigateToMemberDetailBySearch(mainFrame, member.standesbuchNr);
|
||||
|
||||
if (!onDetail) {
|
||||
log(` SKIP ${member.vorname} ${member.zuname} (${member.standesbuchNr}): could not reach detail page`);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Scrape extra profile fields from the detail form
|
||||
const profileFields = await scrapeDetailProfileFields(mainFrame);
|
||||
@@ -160,6 +166,65 @@ async function login(page: Page, username: string, password: string): Promise<vo
|
||||
log(`Logged in successfully, redirected to: ${currentUrl}`);
|
||||
}
|
||||
|
||||
/**
 * Fallback navigation to a member's detail page when no direct URL is available.
 *
 * Steps:
 *  1. Load the member list (MEMBERS_URL) in the given frame.
 *  2. Fill the "from" and "to" standesbuchNr search fields with the same value
 *     and submit the `frmsearch` form.
 *  3. Click the first matching element of the first result row.
 *  4. Decide success by checking that the frame URL no longer looks like the list page.
 *
 * Errors raised while loading the list page or submitting the search (including
 * navigation timeouts) are not caught here and propagate to the caller. A failure
 * of the final click / navigation wait is logged and then resolved via the URL check.
 *
 * @param frame Frame in which the member list and detail pages are loaded.
 * @param standesbuchNr Value written into both search range fields.
 * @returns true if the frame URL is no longer a list URL after the click, false otherwise.
 */
async function navigateToMemberDetailBySearch(frame: Frame, standesbuchNr: string): Promise<boolean> {
  // Navigate to the member list
  await frame.goto(MEMBERS_URL, { waitUntil: 'domcontentloaded' });
  await frame.waitForLoadState('networkidle');

  // Set exact standesbuchNr filter in the search form (from == to)
  const formOk = await frame.evaluate((stNr) => {
    const form = document.forms.namedItem('frmsearch');
    if (!form) return false;
    const field = (name: string) => form.elements.namedItem(name) as HTMLInputElement | null;
    const fromFld = field('ListFilter$searchstandesbuchnummer');
    const toFld = field('ListFilter$searchstandesbuchnummer_bis');
    if (!fromFld || !toFld) return false;
    fromFld.value = stNr;
    toFld.value = stNr;
    return true;
  }, standesbuchNr);

  if (!formOk) {
    log(` WARN navigateToMemberDetailBySearch: search form not usable for StNr ${standesbuchNr}`);
    return false;
  }

  // Start waiting for the navigation before triggering the submit so it cannot be missed.
  await Promise.all([
    frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
    frame.evaluate(() => {
      const form = document.forms.namedItem('frmsearch');
      // Fail fast instead of waiting for the navigation timeout if the form vanished.
      if (!form) throw new Error('frmsearch form not found at submit time');
      form.submit();
    }),
  ]);

  // Click on the first data row — FDISK rows navigate to the detail page on click.
  // NOTE(review): with this comma selector the first match in document order is
  // normally the row's first <td> (it precedes any <a> nested inside it) — confirm
  // that clicking that cell is what triggers the navigation.
  const firstRowLink = await frame.$('table.FdcLayList tbody tr:first-child a, table.FdcLayList tbody tr:first-child td');
  if (!firstRowLink) {
    log(` WARN navigateToMemberDetailBySearch: no result row for StNr ${standesbuchNr}`);
    return false;
  }

  try {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }),
      firstRowLink.click(),
    ]);
  } catch (err: unknown) {
    // waitForNavigation may time out if the click didn't navigate (e.g. onclick vs href).
    // Stay best-effort, but record why so failures can be diagnosed; the URL check
    // below decides whether we actually reached a detail page.
    const reason = err instanceof Error ? err.message : String(err);
    log(` WARN navigateToMemberDetailBySearch: click/navigation wait failed for StNr ${standesbuchNr}: ${reason}`);
  }

  const url = frame.url();
  const onDetailPage = !url.includes('MitgliedschaftenList') && !url.includes('meine_Mitglieder');
  if (onDetailPage) {
    log(` Navigated to detail via search+click: ${url}`);
  } else {
    log(` WARN navigateToMemberDetailBySearch: still on list page after click for StNr ${standesbuchNr}`);
  }
  return onDetailPage;
}
|
||||
|
||||
async function navigateToMemberList(page: Page): Promise<Frame> {
|
||||
const mainFrame = page.frame({ name: 'mainFrame' });
|
||||
if (!mainFrame) throw new Error('mainFrame not found in Start.aspx frameset');
|
||||
@@ -266,6 +331,11 @@ async function scrapeMembers(frame: Frame): Promise<FdiskMember[]> {
|
||||
await frame.waitForSelector('table.FdcLayList', { timeout: 20000 });
|
||||
const firstRows = await parseRowsFromTable(frame);
|
||||
log(`Initial fetch: ${firstRows.length} rows`);
|
||||
|
||||
// Log href debug info for the first row to diagnose URL extraction
|
||||
const rowDebug = await frame.evaluate(() => (window as any).__fdiskFirstRowDebug ?? 'no debug info');
|
||||
log(`Row href debug: ${rowDebug}`);
|
||||
|
||||
for (const row of firstRows) {
|
||||
log(` Row: StNr="${row.standesbuchNr}" Vorname="${row.vorname}" Zuname="${row.zuname}" Status="${row.status}" Dienstgrad="${row.dienstgrad}"`);
|
||||
}
|
||||
@@ -364,9 +434,9 @@ async function parseRowsFromTable(frame: Frame) {
|
||||
// Column layout (0-indexed td): 0=icon, 1=Status, 2=St.-Nr., 3=Dienstgrad,
|
||||
// 4=Vorname, 5=Zuname, 6=Geburtsdatum, 7=SVNR, 8=Eintrittsdatum, 9=Abmeldedatum, 10=icon
|
||||
// Each <td> contains an <a title="value"> — the title is the clean cell text.
|
||||
// The href on each <a> is the member detail URL (same link repeated across all cells in a row).
|
||||
// Navigation may be via href or onclick handlers (FDISK uses both depending on version).
|
||||
return frame.$$eval('table.FdcLayList tbody tr', (trs) =>
|
||||
trs.map((tr) => {
|
||||
trs.map((tr, rowIdx) => {
|
||||
const cells = Array.from(tr.querySelectorAll('td'));
|
||||
const val = (i: number) => {
|
||||
const a = cells[i]?.querySelector('a');
|
||||
@@ -374,7 +444,45 @@ async function parseRowsFromTable(frame: Frame) {
|
||||
// Use title only if non-empty; otherwise fall back to textContent
|
||||
return (title || cells[i]?.textContent || '').trim();
|
||||
};
|
||||
const href = (tr.querySelector('a') as HTMLAnchorElement | null)?.href ?? null;
|
||||
|
||||
// Extract detail URL — try multiple strategies:
|
||||
// 1. Standard <a href="..."> pointing to an .aspx page
|
||||
// 2. onclick attribute on <a>, <td>, or <tr> containing an .aspx URL
|
||||
let href: string | null = null;
|
||||
let debugInfo = '';
|
||||
|
||||
for (const a of Array.from(tr.querySelectorAll('a'))) {
|
||||
const rawHref = (a as Element).getAttribute('href') ?? '';
|
||||
debugInfo += `a.href="${rawHref}" `;
|
||||
if (rawHref && rawHref !== '#' && rawHref !== '' && !rawHref.startsWith('javascript:')) {
|
||||
href = (a as HTMLAnchorElement).href; // resolves relative → absolute
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!href) {
|
||||
// Scan onclick on all ancestors + cells + anchors for .aspx URLs
|
||||
const candidates: Element[] = [tr, ...Array.from(tr.querySelectorAll('a, td'))];
|
||||
for (const el of candidates) {
|
||||
const onclick = el.getAttribute('onclick') ?? '';
|
||||
if (onclick) debugInfo += `onclick="${onclick}" `;
|
||||
const match = onclick.match(/['"]([^'"]*\.aspx[^'"]*)['"]/);
|
||||
if (match) {
|
||||
try {
|
||||
href = new URL(match[1], (window as Window).location.href).href;
|
||||
} catch {
|
||||
href = match[1];
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Log debug info for first data row to help diagnose href extraction issues
|
||||
if (rowIdx === 0 && val(2)) {
|
||||
(window as any).__fdiskFirstRowDebug = `StNr=${val(2)} href=${href} debug=${debugInfo}`;
|
||||
}
|
||||
|
||||
return {
|
||||
status: val(1),
|
||||
standesbuchNr: val(2),
|
||||
|
||||
Reference in New Issue
Block a user