update
This commit is contained in:
@@ -39,6 +39,123 @@ function cellText(text: string | undefined | null): string | null {
|
||||
return t || null;
|
||||
}
|
||||
|
||||
/**
 * Fetch only members we care about, rather than scraping the full member list.
 *
 * Phase 1: one search per known StNr. The StNr is written into both the
 *          "from" and the "to" field of the search form's StNr range, so the
 *          search is an exact match.
 * Phase 2: if `knownNames` is non-empty, a single search with the StNr range
 *          cleared (page 1 only). Rows whose lower-cased `vorname::zuname`
 *          key is contained in `knownNames` are kept (first-time linking).
 *
 * Rows are deduplicated by StNr across both phases. Rows lacking a StNr,
 * first name or last name are not turned into members.
 *
 * If the search form (or, in phase 1, one of its StNr fields) cannot be found,
 * the affected search is skipped with a warning and whatever was collected so
 * far is still returned.
 *
 * @param frame       Frame that currently shows the member list and its search form.
 * @param knownStNrs  StNrs to look up individually.
 * @param knownNames  Name keys of the form `vorname::zuname`, lower-cased.
 * @returns Deduplicated members; the detail-only fields (geburtsort,
 *          geschlecht, beruf, wohnort, plz) are always `null` here.
 * @throws If a submitted search does not finish navigating within 30 s.
 */
async function scrapeKnownMembers(
  frame: Frame,
  knownStNrs: Set<string>,
  knownNames: Set<string>,
): Promise<FdiskMember[]> {
  type ParsedRow = Awaited<ReturnType<typeof parseRowsFromTable>>[number];

  const seenStNrs = new Set<string>();
  const allRows: ParsedRow[] = [];

  // Submits the search form and waits for the resulting navigation to settle.
  // The navigation wait is set up before the submit is triggered so that the
  // navigation caused by the submit cannot be missed.
  const submitSearch = async (): Promise<void> => {
    await Promise.all([
      frame.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      frame.evaluate(() => { (document as any).forms['frmsearch'].submit(); }),
    ]);
  };

  // --- Phase 1: fetch by exact StNr ---
  log(`scrapeKnownMembers: fetching ${knownStNrs.size} known StNrs`);
  for (const stNr of knownStNrs) {
    // Runs inside the page: fill both ends of the StNr range with the same
    // value. Nothing is modified unless the form and both fields exist.
    const formOk = await frame.evaluate((sn) => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (!fromFld || !toFld) return false;
      fromFld.value = sn;
      toFld.value = sn;
      return true;
    }, stNr);

    if (!formOk) {
      log(`  WARN: search form not usable for StNr ${stNr}`);
      continue;
    }

    await submitSearch();

    const rows = await parseRowsFromTable(frame);
    for (const r of rows) {
      if (r.standesbuchNr && !seenStNrs.has(r.standesbuchNr)) {
        seenStNrs.add(r.standesbuchNr);
        allRows.push(r);
      }
    }
    log(`  StNr ${stNr}: ${rows.length} row(s)`);

    // Be gentle on the server
    await frame.page().waitForTimeout(300);
  }

  // --- Phase 2: single unfiltered fetch for name-matching ---
  if (knownNames.size > 0) {
    log(`scrapeKnownMembers: unfiltered fetch for ${knownNames.size} name-based matches`);

    // Clear the StNr filter left behind by phase 1. Reports whether the form
    // exists at all, so that we never try to submit a missing form (which
    // would throw and discard everything phase 1 collected).
    const formFound = await frame.evaluate(() => {
      const form = (document as any).forms['frmsearch'];
      if (!form) return false;
      const fromFld = form.elements['ListFilter$searchstandesbuchnummer'] as HTMLInputElement | null;
      const toFld = form.elements['ListFilter$searchstandesbuchnummer_bis'] as HTMLInputElement | null;
      if (fromFld) fromFld.value = '';
      if (toFld) toFld.value = '';
      return true;
    });

    if (!formFound) {
      log(`  WARN: search form not usable for unfiltered fetch`);
    } else {
      await submitSearch();

      const rows = await parseRowsFromTable(frame);
      let matched = 0;
      for (const r of rows) {
        // Skip rows without a StNr and rows already collected in phase 1.
        if (!r.standesbuchNr || seenStNrs.has(r.standesbuchNr)) continue;
        const nameKey = `${(r.vorname || '').toLowerCase()}::${(r.zuname || '').toLowerCase()}`;
        if (knownNames.has(nameKey)) {
          seenStNrs.add(r.standesbuchNr);
          allRows.push(r);
          matched++;
        }
      }
      log(`  Unfiltered page: ${rows.length} total rows, ${matched} name-matched`);
    }
  }

  log(`scrapeKnownMembers: ${allRows.length} members collected`);

  // Build FdiskMember objects
  const members: FdiskMember[] = [];
  for (const row of allRows) {
    // A member needs at least a StNr and a full name.
    if (!row.standesbuchNr || !row.vorname || !row.zuname) continue;
    const abmeldedatum = parseDate(row.abmeldedatum);
    members.push({
      standesbuchNr: row.standesbuchNr,
      dienstgrad: row.dienstgrad,
      vorname: row.vorname,
      zuname: row.zuname,
      geburtsdatum: parseDate(row.geburtsdatum),
      svnr: row.svnr || null,
      eintrittsdatum: parseDate(row.eintrittsdatum),
      abmeldedatum,
      // A parsed sign-off date marks the member as having left.
      status: abmeldedatum ? 'ausgetreten' : 'aktiv',
      detailUrl: row.href,
      // Not available in the list view; left empty here.
      geburtsort: null,
      geschlecht: null,
      beruf: null,
      wohnort: null,
      plz: null,
    });
  }
  return members;
}
|
||||
|
||||
export async function scrapeAll(username: string, password: string, knownStNrs: Set<string>, knownNames: Set<string>): Promise<{
|
||||
members: FdiskMember[];
|
||||
ausbildungen: FdiskAusbildung[];
|
||||
@@ -64,8 +181,8 @@ export async function scrapeAll(username: string, password: string, knownStNrs:
|
||||
// Navigate via the menu frame (left.aspx) to set session state properly.
|
||||
const mainFrame = await navigateToMemberList(page);
|
||||
|
||||
const members = await scrapeMembers(mainFrame);
|
||||
log(`Found ${members.length} members`);
|
||||
const members = await scrapeKnownMembers(mainFrame, knownStNrs, knownNames);
|
||||
log(`Found ${members.length} members (targeted query)`);
|
||||
|
||||
const ausbildungen: FdiskAusbildung[] = [];
|
||||
const befoerderungen: FdiskBefoerderung[] = [];
|
||||
@@ -73,13 +190,6 @@ export async function scrapeAll(username: string, password: string, knownStNrs:
|
||||
const fahrgenehmigungen: FdiskFahrgenehmigung[] = [];
|
||||
|
||||
for (const member of members) {
|
||||
// Only scrape detail pages for members with a dashboard account
|
||||
// (matched by standesbuchNr or by name for first-time linking)
|
||||
const nameKey = `${member.vorname.toLowerCase()}::${member.zuname.toLowerCase()}`;
|
||||
if (!knownStNrs.has(member.standesbuchNr) && !knownNames.has(nameKey)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
// Navigate to member detail page — use direct URL if available, else search+click fallback
|
||||
const onDetail = member.detailUrl
|
||||
|
||||
Reference in New Issue
Block a user