import { readFileSync, readdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Escape every regular-expression metacharacter in `string` so the result can
 * be embedded in a RegExp source and match the input literally.
 *
 * @param {string} string - Raw text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegExp(string) {
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Case-insensitive substring test that tolerates missing fields.
 *
 * @param {*} value - Field value; `null`/`undefined` never match.
 * @param {string} needleLower - Already lower-cased text to look for.
 * @returns {boolean} True when the stringified value contains the needle.
 */
function fieldIncludes(value, needleLower) {
  if (value == null) {
    return false;
  }
  return String(value).toLowerCase().includes(needleLower);
}

/**
 * Scan one transcript line by line and collect the lines hit by any matcher.
 *
 * Match collection stops once `maxMatchesPerEpisode` lines were recorded.
 * When `requireAllTerms` is set, the remaining lines are still probed for the
 * terms not seen yet, so that an "all terms" decision is not made on a
 * truncated view of the transcript. Hit counts therefore only cover the lines
 * scanned before the cap, plus one hit for each term first found after it.
 *
 * @param {string} transcript - Full transcript text.
 * @param {{term: string, regex: RegExp}[]} matchers - Non-global regexes.
 * @param {object} settings
 * @param {number} settings.contextLines - Lines of context on each side.
 * @param {number} settings.maxMatchesPerEpisode - Cap on recorded lines.
 * @param {boolean} settings.requireAllTerms - Keep probing after the cap.
 * @returns {{matches: object[], matchedTerms: Set<string>, termHitCounts: Map<string, number>, truncated: boolean}}
 */
function scanTranscript(transcript, matchers, { contextLines, maxMatchesPerEpisode, requireAllTerms }) {
  const lines = transcript.split(/\r?\n/);
  const matches = [];
  const matchedTerms = new Set();
  const termHitCounts = new Map();
  let truncated = false;
  let resumeAt = lines.length;

  for (let index = 0; index < lines.length; index++) {
    const line = lines[index];
    const matchedOnLine = [];

    for (const { term, regex } of matchers) {
      if (regex.test(line)) {
        matchedOnLine.push(term);
        matchedTerms.add(term);
        termHitCounts.set(term, (termHitCounts.get(term) || 0) + 1);
      }
    }

    if (matchedOnLine.length > 0) {
      const start = Math.max(0, index - contextLines);
      const end = Math.min(lines.length, index + contextLines + 1);
      matches.push({
        lineNumber: index + 1,
        terms: matchedOnLine,
        context: lines.slice(start, end).join('\n'),
      });
    }

    if (matches.length >= maxMatchesPerEpisode) {
      truncated = true;
      resumeAt = index + 1;
      break;
    }
  }

  // The cap cut the scan short: look for the still-missing terms in the rest
  // of the transcript without recording further match entries.
  if (requireAllTerms && truncated && matches.length > 0) {
    let missing = matchers.filter(({ term }) => !matchedTerms.has(term));
    for (let index = resumeAt; index < lines.length && missing.length > 0; index++) {
      const stillMissing = [];
      for (const matcher of missing) {
        if (matcher.regex.test(lines[index])) {
          matchedTerms.add(matcher.term);
          termHitCounts.set(matcher.term, 1);
        } else {
          stillMissing.push(matcher);
        }
      }
      missing = stillMissing;
    }
  }

  return { matches, matchedTerms, termHitCounts, truncated };
}

/**
 * In-memory store for HPR metadata (episodes, hosts, comments, series) and
 * episode transcripts, with lookup and search helpers.
 */
class HPRDataLoader {
  constructor() {
    this.episodes = [];
    this.hosts = [];
    this.comments = [];
    this.series = [];
    this.transcripts = new Map(); // Map of episode id to transcript text
  }

  /**
   * Load all data from JSON files and transcripts.
   * Progress is reported on stderr.
   *
   * @returns {Promise<void>}
   */
  async load() {
    console.error('Loading HPR data...');

    // Load JSON files
    this.episodes = this.loadJSON('hpr_metadata/episodes.json');
    this.hosts = this.loadJSON('hpr_metadata/hosts.json');
    this.comments = this.loadJSON('hpr_metadata/comments.json');
    this.series = this.loadJSON('hpr_metadata/series.json');

    console.error(`Loaded ${this.episodes.length} episodes`);
    console.error(`Loaded ${this.hosts.length} hosts`);
    console.error(`Loaded ${this.comments.length} comments`);
    console.error(`Loaded ${this.series.length} series`);

    // Load transcripts
    this.loadTranscripts();
    console.error(`Loaded ${this.transcripts.size} transcripts`);

    console.error('HPR data loading complete!');
  }

  /**
   * Load a JSON file located relative to this module's directory.
   *
   * @param {string} relativePath - Path relative to this module.
   * @returns {*} The parsed JSON, or an empty array when reading or parsing
   *   fails (the error is logged to stderr).
   */
  loadJSON(relativePath) {
    const filePath = join(__dirname, relativePath);
    try {
      const data = readFileSync(filePath, 'utf-8');
      return JSON.parse(data);
    } catch (error) {
      console.error(`Error loading ${relativePath}:`, error.message);
      return [];
    }
  }

  /**
   * Load all transcript files from the `hpr_transcripts` directory next to
   * this module into `this.transcripts`, keyed by numeric episode id.
   * Unreadable files are logged and skipped.
   */
  loadTranscripts() {
    const transcriptsDir = join(__dirname, 'hpr_transcripts');

    let files;
    try {
      files = readdirSync(transcriptsDir);
    } catch (error) {
      console.error('Error loading transcripts directory:', error.message);
      return;
    }

    for (const file of files) {
      if (!file.endsWith('.txt')) {
        continue;
      }

      // Extract episode ID from filename (e.g., hpr0016.txt -> 16)
      const match = file.match(/hpr(\d+)\.txt/);
      if (!match) {
        continue;
      }

      const episodeId = parseInt(match[1], 10);
      try {
        const content = readFileSync(join(transcriptsDir, file), 'utf-8');
        this.transcripts.set(episodeId, content);
      } catch (error) {
        console.error(`Error loading transcript ${file}:`, error.message);
      }
    }
  }

  /**
   * Get episode by ID.
   *
   * @param {*} id - Episode id, compared with strict equality.
   * @returns {object|undefined} The first matching episode, if any.
   */
  getEpisode(id) {
    return this.episodes.find(ep => ep.id === id);
  }

  /**
   * Get host by ID.
   *
   * @param {*} id - Host id, compared with strict equality against `hostid`.
   * @returns {object|undefined} The first matching host, if any.
   */
  getHost(id) {
    return this.hosts.find(host => host.hostid === id);
  }

  /**
   * Get series by ID.
   *
   * @param {*} id - Series id, compared with strict equality.
   * @returns {object|undefined} The first matching series, if any.
   */
  getSeries(id) {
    return this.series.find(s => s.id === id);
  }

  /**
   * Get transcript for episode.
   *
   * @param {*} episodeId - Key into the transcript map.
   * @returns {string|undefined} Transcript text, if one was loaded.
   */
  getTranscript(episodeId) {
    return this.transcripts.get(episodeId);
  }

  /**
   * Get comments for episode.
   *
   * @param {*} episodeId - Compared with strict equality against `eps_id`.
   * @returns {object[]} Matching comments (possibly empty).
   */
  getCommentsForEpisode(episodeId) {
    return this.comments.filter(c => c.eps_id === episodeId);
  }

  /**
   * Get episodes by host.
   *
   * @param {*} hostId - Compared with strict equality against `hostid`.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesByHost(hostId) {
    return this.episodes.filter(ep => ep.hostid === hostId);
  }

  /**
   * Get episodes in a series.
   *
   * @param {*} seriesId - Compared with strict equality against `series`.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesInSeries(seriesId) {
    return this.episodes.filter(ep => ep.series === seriesId);
  }

  /**
   * Search episodes by keyword in title, summary, tags, or notes.
   *
   * @param {string} [query] - Text to look for (case-insensitive). An empty,
   *   `null` or `undefined` query matches every episode.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of results.
   * @param {*} [options.hostId=null] - Keep only episodes of this host.
   * @param {*} [options.seriesId=null] - Keep only episodes of this series.
   * @param {string} [options.tag=null] - Substring that `tags` must contain.
   * @param {string} [options.fromDate=null] - Inclusive lower date bound.
   * @param {string} [options.toDate=null] - Inclusive upper date bound.
   * @returns {object[]} Matching episodes, newest first.
   */
  searchEpisodes(query, options = {}) {
    const {
      limit = 20,
      hostId = null,
      seriesId = null,
      tag = null,
      fromDate = null,
      toDate = null,
    } = options;

    // Lower-case only when there is something to search for, so a missing
    // query means "no text filter" instead of a TypeError.
    const queryLower = query ? String(query).toLowerCase() : '';
    const tagLower = tag ? String(tag).toLowerCase() : '';

    const results = this.episodes.filter(ep => {
      // Basic text search
      const matchesQuery = !query ||
        fieldIncludes(ep.title, queryLower) ||
        fieldIncludes(ep.summary, queryLower) ||
        fieldIncludes(ep.tags, queryLower) ||
        fieldIncludes(ep.notes, queryLower);

      // Filter by host
      const matchesHost = !hostId || ep.hostid === hostId;

      // Filter by series
      const matchesSeries = seriesId === null || ep.series === seriesId;

      // Filter by tag
      const matchesTag = !tag || fieldIncludes(ep.tags, tagLower);

      // Filter by date range
      const matchesDateRange =
        (!fromDate || ep.date >= fromDate) &&
        (!toDate || ep.date <= toDate);

      return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange;
    });

    // Sort by date (newest first); `filter` returned a fresh array, so the
    // in-place sort does not touch `this.episodes`.
    results.sort((a, b) => String(b.date ?? '').localeCompare(String(a.date ?? '')));

    return results.slice(0, limit);
  }

  /**
   * Search transcripts by keyword.
   *
   * @param {string} [query] - Phrase to search for, or (in `any`/`all` mode)
   *   a list of terms separated by `|`, `,`, `;` or newlines.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of episodes returned.
   * @param {number} [options.contextLines=3] - Context lines on each side.
   * @param {string[]} [options.terms=[]] - Explicit terms; override the
   *   terms derived from `query` in `any`/`all` mode.
   * @param {string} [options.matchMode='auto'] - `any`, `all`, `phrase`, or
   *   anything else to pick `any` (several terms) or `phrase` (one term).
   * @param {*} [options.hostId=null] - Restrict to this host id.
   * @param {string} [options.hostName=null] - Restrict to hosts found by
   *   `searchHosts`. When given and no host matches, the result is empty.
   * @param {boolean} [options.caseSensitive=false]
   * @param {boolean} [options.wholeWord=false] - Wrap terms in `\b`.
   * @param {number} [options.maxMatchesPerEpisode=5] - Cap on match entries
   *   recorded per episode.
   * @returns {object[]} One `{ episode, matches, matchSummary }` per episode.
   */
  searchTranscripts(query, options = {}) {
    const {
      limit = 20,
      contextLines = 3,
      terms = [],
      matchMode = 'auto',
      hostId = null,
      hostName = null,
      caseSensitive = false,
      wholeWord = false,
      maxMatchesPerEpisode = 5,
    } = options;

    const resolvedHostIds = new Set();
    if (hostId) {
      resolvedHostIds.add(Number(hostId));
    }
    if (hostName) {
      for (const host of this.searchHosts(hostName)) {
        resolvedHostIds.add(host.hostid);
      }
    }

    // A host filter that resolves to nobody must yield nothing rather than
    // silently falling back to an unfiltered search.
    const filterByHost = Boolean(hostId) || Boolean(hostName);
    if (filterByHost && resolvedHostIds.size === 0) {
      return [];
    }

    const hasQuery = typeof query === 'string' && query.trim().length > 0;

    const explicitTerms = Array.isArray(terms)
      ? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)
      : [];

    const splitQueryTerms = (matchMode === 'any' || matchMode === 'all')
      ? (hasQuery ? query : '')
          .split(/[|,;\n]/)
          .map(part => part.trim())
          .filter(Boolean)
      : [];

    let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
    if (searchTerms.length === 0 && hasQuery) {
      searchTerms = [query.trim()];
    }

    let resolvedMatchMode = matchMode;
    if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
      resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
    }

    const candidateTerms = resolvedMatchMode === 'phrase'
      ? [(hasQuery ? query.trim() : searchTerms[0] || '')].filter(Boolean)
      : searchTerms;

    // Repeated terms would otherwise make `all` mode unsatisfiable (the set
    // of matched terms can never reach the matcher count) and would inflate
    // the per-term hit counts.
    const effectiveTerms = [...new Set(candidateTerms)];
    if (effectiveTerms.length === 0) {
      return [];
    }

    // No `g` flag: the regexes are only used with `.test()`, and a global
    // regex would carry `lastIndex` state from one line to the next.
    const regexFlags = caseSensitive ? '' : 'i';
    const matchers = effectiveTerms.map(term => {
      const escaped = escapeRegExp(term);
      const pattern = wholeWord ? `\\b${escaped}\\b` : escaped;
      try {
        return { term, regex: new RegExp(pattern, regexFlags) };
      } catch (error) {
        console.error(`Invalid search pattern for term "${term}":`, error.message);
        return null;
      }
    }).filter(Boolean);

    if (matchers.length === 0) {
      return [];
    }

    // Index episodes once per call instead of a linear lookup per transcript.
    // The first episode wins on duplicate ids, like `Array.prototype.find`.
    const episodesById = new Map();
    for (const ep of this.episodes) {
      if (!episodesById.has(ep.id)) {
        episodesById.set(ep.id, ep);
      }
    }

    const requireAllTerms = resolvedMatchMode === 'all';
    const results = [];

    for (const [episodeId, transcript] of this.transcripts) {
      if (results.length >= limit) {
        break;
      }

      const episode = episodesById.get(episodeId);
      if (!episode) {
        continue;
      }
      if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
        continue;
      }

      const { matches, matchedTerms, termHitCounts, truncated } = scanTranscript(
        transcript,
        matchers,
        { contextLines, maxMatchesPerEpisode, requireAllTerms },
      );

      if (matches.length === 0) {
        continue;
      }
      if (requireAllTerms && matchedTerms.size < matchers.length) {
        continue;
      }

      results.push({
        episode,
        matches,
        matchSummary: {
          matchMode: resolvedMatchMode,
          matchedTerms: [...matchedTerms],
          totalMatches: matches.length,
          termHitCounts: Object.fromEntries(termHitCounts),
          truncated,
        },
      });
    }

    return results;
  }

  /**
   * Search hosts by name or email (case-insensitive substring match).
   * Hosts lacking a name or email are simply not matched on that field.
   *
   * @param {string} query - Text to look for; nullish is treated as ''.
   * @returns {object[]} Matching hosts (possibly empty).
   */
  searchHosts(query) {
    const queryLower = String(query ?? '').toLowerCase();
    return this.hosts.filter(host =>
      fieldIncludes(host.host, queryLower) ||
      fieldIncludes(host.email, queryLower)
    );
  }

  /**
   * Get statistics about the loaded data.
   *
   * @returns {object} Record counts plus the earliest and latest episode
   *   date (both '' when no episodes are loaded).
   */
  getStats() {
    const firstDate = this.episodes[0]?.date || '';
    return {
      totalEpisodes: this.episodes.length,
      totalHosts: this.hosts.length,
      totalComments: this.comments.length,
      totalSeries: this.series.length,
      totalTranscripts: this.transcripts.size,
      dateRange: {
        earliest: this.episodes.reduce((min, ep) => (ep.date < min ? ep.date : min), firstDate),
        latest: this.episodes.reduce((max, ep) => (ep.date > max ? ep.date : max), firstDate),
      },
    };
  }
}

export default HPRDataLoader;