2025-10-26 10:54:13 +00:00
|
|
|
import { readFileSync, readdirSync } from 'fs';
|
|
|
|
|
import { join, dirname } from 'path';
|
|
|
|
|
import { fileURLToPath } from 'url';
|
|
|
|
|
|
|
|
|
|
// ES modules do not provide __filename/__dirname, so both are derived from
// this module's URL. __dirname is the base for every data path resolved below.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
|
|
|
|
|
|
2025-10-26 14:46:24 +00:00
|
|
|
/**
 * Escape every regular-expression metacharacter in a string so the result can
 * be embedded in a RegExp pattern and matched literally.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The text with each of . * + ? ^ $ { } ( ) | [ ] \ prefixed
 *   by a backslash.
 */
function escapeRegExp(string) {
  // "$&" in the replacement stands for the matched character itself.
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
|
|
|
|
|
|
2025-10-26 10:54:13 +00:00
|
|
|
/**
 * In-memory store for HPR metadata (episodes, hosts, comments, series) and
 * episode transcripts, with lookup and search helpers.
 *
 * Record fields read by this class:
 *   - episode: id, hostid, series, title, summary, tags, notes, date
 *   - host:    hostid, host, email
 *   - comment: eps_id
 *   - series:  id
 */
class HPRDataLoader {
  constructor() {
    this.episodes = [];
    this.hosts = [];
    this.comments = [];
    this.series = [];
    this.transcripts = new Map(); // Map of episode id to transcript text
  }

  /**
   * Load all data from JSON files and transcripts.
   *
   * Progress messages are written with console.error. Individual load
   * failures are logged and leave the corresponding collection empty.
   *
   * @returns {Promise<void>}
   */
  async load() {
    console.error('Loading HPR data...');

    // Load JSON files
    this.episodes = this.loadJSON('hpr_metadata/episodes.json');
    this.hosts = this.loadJSON('hpr_metadata/hosts.json');
    this.comments = this.loadJSON('hpr_metadata/comments.json');
    this.series = this.loadJSON('hpr_metadata/series.json');

    console.error(`Loaded ${this.episodes.length} episodes`);
    console.error(`Loaded ${this.hosts.length} hosts`);
    console.error(`Loaded ${this.comments.length} comments`);
    console.error(`Loaded ${this.series.length} series`);

    // Load transcripts
    this.loadTranscripts();

    console.error(`Loaded ${this.transcripts.size} transcripts`);
    console.error('HPR data loading complete!');
  }

  /**
   * Load a JSON file.
   *
   * @param {string} relativePath - Path relative to this module's directory.
   * @returns {*} The parsed JSON, or an empty array when the file cannot be
   *   read or parsed (the error message is logged).
   */
  loadJSON(relativePath) {
    const filePath = join(__dirname, relativePath);
    try {
      return JSON.parse(readFileSync(filePath, 'utf-8'));
    } catch (error) {
      console.error(`Error loading ${relativePath}:`, error.message);
      return [];
    }
  }

  /**
   * Load all transcript files from the hpr_transcripts directory into
   * this.transcripts. Files that cannot be read are logged and skipped.
   */
  loadTranscripts() {
    const transcriptsDir = join(__dirname, 'hpr_transcripts');

    let files;
    try {
      files = readdirSync(transcriptsDir);
    } catch (error) {
      console.error('Error loading transcripts directory:', error.message);
      return;
    }

    for (const file of files) {
      if (!file.endsWith('.txt')) continue;

      // Extract episode ID from filename (e.g., hpr0016.txt -> 16)
      const match = file.match(/hpr(\d+)\.txt/);
      if (!match) continue;

      const episodeId = parseInt(match[1], 10);
      try {
        this.transcripts.set(episodeId, readFileSync(join(transcriptsDir, file), 'utf-8'));
      } catch (error) {
        console.error(`Error loading transcript ${file}:`, error.message);
      }
    }
  }

  /**
   * Get episode by ID.
   *
   * @param {*} id - Compared to each episode's id with strict equality.
   * @returns {object|undefined} The first matching episode, if any.
   */
  getEpisode(id) {
    return this.episodes.find((ep) => ep.id === id);
  }

  /**
   * Get host by ID.
   *
   * @param {*} id - Compared to each host's hostid with strict equality.
   * @returns {object|undefined} The first matching host, if any.
   */
  getHost(id) {
    return this.hosts.find((host) => host.hostid === id);
  }

  /**
   * Get series by ID.
   *
   * @param {*} id - Compared to each series' id with strict equality.
   * @returns {object|undefined} The first matching series, if any.
   */
  getSeries(id) {
    return this.series.find((s) => s.id === id);
  }

  /**
   * Get transcript for episode.
   *
   * @param {number} episodeId - Key into the transcript map.
   * @returns {string|undefined} The transcript text, if one was loaded.
   */
  getTranscript(episodeId) {
    return this.transcripts.get(episodeId);
  }

  /**
   * Get comments for episode.
   *
   * @param {*} episodeId - Compared to each comment's eps_id with strict equality.
   * @returns {object[]} Matching comments (possibly empty).
   */
  getCommentsForEpisode(episodeId) {
    return this.comments.filter((c) => c.eps_id === episodeId);
  }

  /**
   * Get episodes by host.
   *
   * @param {*} hostId - Compared to each episode's hostid with strict equality.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesByHost(hostId) {
    return this.episodes.filter((ep) => ep.hostid === hostId);
  }

  /**
   * Get episodes in a series.
   *
   * @param {*} seriesId - Compared to each episode's series with strict equality.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesInSeries(seriesId) {
    return this.episodes.filter((ep) => ep.series === seriesId);
  }

  /**
   * Search episodes by keyword in title, summary, tags, or notes.
   *
   * The text match is a case-insensitive substring test. An empty, null or
   * undefined query matches every episode, so the filters can be used alone.
   * Missing (null/undefined) episode fields are treated as empty text.
   *
   * @param {string} [query] - Text to look for.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of episodes returned.
   * @param {*} [options.hostId=null] - Keep only episodes with this hostid (ignored when falsy).
   * @param {*} [options.seriesId=null] - Keep only episodes in this series. Only null
   *   disables the filter, so a falsy id such as 0 can still be filtered on.
   * @param {string} [options.tag=null] - Case-insensitive substring of the episode tags.
   * @param {string} [options.fromDate=null] - Inclusive lower bound, compared to ep.date with >=.
   * @param {string} [options.toDate=null] - Inclusive upper bound, compared to ep.date with <=.
   * @returns {object[]} Matching episodes, newest date first, at most `limit` entries.
   */
  searchEpisodes(query, options = {}) {
    const {
      limit = 20,
      hostId = null,
      seriesId = null,
      tag = null,
      fromDate = null,
      toDate = null,
    } = options;

    // Null-safe lowercase so missing fields never throw.
    const lower = (value) => String(value ?? '').toLowerCase();
    const queryLower = lower(query);
    const tagLower = tag ? lower(tag) : '';

    const results = this.episodes.filter((ep) => {
      // Basic text search
      if (queryLower) {
        const searchable = [ep.title, ep.summary, ep.tags, ep.notes];
        if (!searchable.some((field) => lower(field).includes(queryLower))) return false;
      }
      // Filter by host
      if (hostId && ep.hostid !== hostId) return false;
      // Filter by series
      if (seriesId !== null && ep.series !== seriesId) return false;
      // Filter by tag
      if (tagLower && !lower(ep.tags).includes(tagLower)) return false;
      // Filter by date range
      if (fromDate && !(ep.date >= fromDate)) return false;
      if (toDate && !(ep.date <= toDate)) return false;
      return true;
    });

    // Sort by date (newest first); episodes without a date sort last.
    results.sort((a, b) => String(b.date ?? '').localeCompare(String(a.date ?? '')));

    return results.slice(0, limit);
  }

  /**
   * Search transcripts by keyword.
   *
   * Terms are matched literally (regex metacharacters are escaped) against
   * each transcript line. Term selection:
   *   - `options.terms`, when it contains any non-blank entry, supplies the terms;
   *   - otherwise, in 'any'/'all' mode the query is split on | , ; or newline;
   *   - otherwise the trimmed query is the single term.
   * A matchMode other than 'any', 'all' or 'phrase' resolves to 'any' when
   * there are several terms and to 'phrase' otherwise. In 'phrase' mode the
   * trimmed query (or, without a query, the first term) is the only term.
   *
   * @param {string} [query] - Search text.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of episodes returned.
   * @param {number} [options.contextLines=3] - Lines of context kept before and after each hit.
   * @param {Array} [options.terms=[]] - Explicit search terms.
   * @param {string} [options.matchMode='auto'] - 'any', 'all', 'phrase', or anything else for auto.
   * @param {*} [options.hostId=null] - Restrict to this host id (converted with Number()).
   * @param {string} [options.hostName=null] - Restrict to hosts found by searchHosts(hostName).
   *   When both host options are given, episodes from either are accepted.
   * @param {boolean} [options.caseSensitive=false] - Match case exactly.
   * @param {boolean} [options.wholeWord=false] - Wrap each term in \b word boundaries.
   * @param {number} [options.maxMatchesPerEpisode=5] - Cap on collected hits per episode.
   * @returns {Array<{episode: object, matches: Array<{lineNumber: number, terms: string[], context: string}>, matchSummary: object}>}
   *   One entry per matching episode, in transcript-map iteration order.
   *   matchSummary.truncated is true when the per-episode cap was reached;
   *   termHitCounts only covers the lines that were scanned, so its values
   *   are lower bounds in that case.
   */
  searchTranscripts(query, options = {}) {
    const {
      limit = 20,
      contextLines = 3,
      terms = [],
      matchMode = 'auto',
      hostId = null,
      hostName = null,
      caseSensitive = false,
      wholeWord = false,
      maxMatchesPerEpisode = 5,
    } = options;

    // --- Host filter -------------------------------------------------------
    const resolvedHostIds = new Set();
    if (hostId) {
      resolvedHostIds.add(Number(hostId));
    }
    if (hostName) {
      for (const host of this.searchHosts(hostName)) {
        resolvedHostIds.add(host.hostid);
      }
    }
    // The filter is active whenever the caller asked for one, even if the
    // requested host name matched nobody; in that case nothing can match.
    const filterByHost = Boolean(hostId || hostName);
    if (filterByHost && resolvedHostIds.size === 0) {
      return [];
    }

    // --- Term resolution ---------------------------------------------------
    const hasQuery = typeof query === 'string' && query.trim().length > 0;

    const explicitTerms = Array.isArray(terms)
      ? terms.map((t) => (t ?? '').toString().trim()).filter(Boolean)
      : [];

    const splitQueryTerms = hasQuery && (matchMode === 'any' || matchMode === 'all')
      ? query.split(/[|,;\n]/).map((part) => part.trim()).filter(Boolean)
      : [];

    let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
    if (searchTerms.length === 0 && hasQuery) {
      searchTerms = [query.trim()];
    }

    let resolvedMatchMode = matchMode;
    if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
      resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
    }

    const candidateTerms = resolvedMatchMode === 'phrase'
      ? [(hasQuery ? query.trim() : searchTerms[0] || '')].filter(Boolean)
      : searchTerms;

    // Duplicates would make 'all' mode unsatisfiable (the set of matched
    // terms could never grow to the number of matchers) and inflate counts.
    const effectiveTerms = [...new Set(candidateTerms)];
    if (effectiveTerms.length === 0) {
      return [];
    }

    // --- Matchers ----------------------------------------------------------
    // No "g" flag: only .test() is used, so a stateless regex avoids having
    // to reset lastIndex between lines.
    const regexFlags = caseSensitive ? '' : 'i';
    const matchers = effectiveTerms
      .map((term) => {
        const escaped = escapeRegExp(term);
        const pattern = wholeWord ? `\\b${escaped}\\b` : escaped;
        try {
          return { term, regex: new RegExp(pattern, regexFlags) };
        } catch (error) {
          console.error(`Invalid search pattern for term "${term}":`, error.message);
          return null;
        }
      })
      .filter(Boolean);

    if (matchers.length === 0) {
      return [];
    }

    // Index episodes once per call instead of a linear lookup per transcript.
    // The first episode wins on duplicate ids, like Array.prototype.find.
    const episodesById = new Map();
    for (const ep of this.episodes) {
      if (!episodesById.has(ep.id)) {
        episodesById.set(ep.id, ep);
      }
    }

    // --- Scan --------------------------------------------------------------
    const results = [];

    for (const [episodeId, transcript] of this.transcripts) {
      if (results.length >= limit) break;

      const episode = episodesById.get(episodeId);
      if (!episode) continue;

      if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
        continue;
      }

      const lines = transcript.split(/\r?\n/);
      const matches = [];
      const matchedTerms = new Set();
      const termHitCounts = new Map();
      let truncated = false;
      let index = 0;

      // Collect hits until the transcript ends or the per-episode cap is hit.
      for (; index < lines.length && !truncated; index++) {
        const line = lines[index];
        const matchedOnLine = matchers
          .filter((matcher) => matcher.regex.test(line))
          .map((matcher) => matcher.term);

        if (matchedOnLine.length > 0) {
          for (const term of matchedOnLine) {
            matchedTerms.add(term);
            termHitCounts.set(term, (termHitCounts.get(term) || 0) + 1);
          }

          const start = Math.max(0, index - contextLines);
          const end = Math.min(lines.length, index + contextLines + 1);

          matches.push({
            lineNumber: index + 1,
            terms: matchedOnLine,
            context: lines.slice(start, end).join('\n'),
          });
        }

        truncated = matches.length >= maxMatchesPerEpisode;
      }

      if (matches.length === 0) {
        continue;
      }

      if (resolvedMatchMode === 'all') {
        // The cap may have stopped the scan before every term was seen.
        // Keep reading the remaining lines, looking only for the missing
        // terms, so truncation cannot wrongly disqualify the episode.
        for (; index < lines.length && matchedTerms.size < matchers.length; index++) {
          for (const { term, regex } of matchers) {
            if (!matchedTerms.has(term) && regex.test(lines[index])) {
              matchedTerms.add(term);
              termHitCounts.set(term, 1);
            }
          }
        }

        if (matchedTerms.size < matchers.length) {
          continue;
        }
      }

      results.push({
        episode,
        matches,
        matchSummary: {
          matchMode: resolvedMatchMode,
          matchedTerms: [...matchedTerms],
          totalMatches: matches.length,
          termHitCounts: Object.fromEntries(termHitCounts),
          truncated,
        },
      });
    }

    return results;
  }

  /**
   * Search hosts by name or email.
   *
   * Case-insensitive substring match. Missing (null/undefined) host fields are
   * treated as empty text; an empty, null or undefined query matches every host.
   *
   * @param {string} query - Text to look for.
   * @returns {object[]} Matching hosts (possibly empty).
   */
  searchHosts(query) {
    const lower = (value) => String(value ?? '').toLowerCase();
    const queryLower = lower(query);
    return this.hosts.filter(
      ({ host, email }) => lower(host).includes(queryLower) || lower(email).includes(queryLower),
    );
  }

  /**
   * Get statistics.
   *
   * @returns {{totalEpisodes: number, totalHosts: number, totalComments: number,
   *   totalSeries: number, totalTranscripts: number,
   *   dateRange: {earliest: string, latest: string}}}
   *   Collection sizes plus the smallest and largest episode date. Episodes
   *   without a date are ignored; both dates are '' when no episode has one.
   */
  getStats() {
    let earliest = '';
    let latest = '';
    for (const { date } of this.episodes) {
      if (!date) continue;
      if (earliest === '' || date < earliest) earliest = date;
      if (latest === '' || date > latest) latest = date;
    }

    return {
      totalEpisodes: this.episodes.length,
      totalHosts: this.hosts.length,
      totalComments: this.comments.length,
      totalSeries: this.series.length,
      totalTranscripts: this.transcripts.size,
      dateRange: { earliest, latest },
    };
  }
}
|
|
|
|
|
|
|
|
|
|
export default HPRDataLoader;
|