Files
hpr-knowledge-base/data-loader.js
2025-11-02 12:51:40 +00:00

477 lines
13 KiB
JavaScript

import { readFileSync, readdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
function escapeRegExp(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
/**
* Calculate Levenshtein distance between two strings
* Returns the minimum number of single-character edits (insertions, deletions, substitutions)
* needed to change one string into the other.
*/
function levenshteinDistance(a, b) {
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
const matrix = [];
// Initialize first column
for (let i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
// Initialize first row
for (let j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
// Fill in the rest of the matrix
for (let i = 1; i <= b.length; i++) {
for (let j = 1; j <= a.length; j++) {
if (b.charAt(i - 1) === a.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
} else {
matrix[i][j] = Math.min(
matrix[i - 1][j - 1] + 1, // substitution
matrix[i][j - 1] + 1, // insertion
matrix[i - 1][j] + 1 // deletion
);
}
}
}
return matrix[b.length][a.length];
}
class HPRDataLoader {
constructor() {
this.episodes = [];
this.hosts = [];
this.comments = [];
this.series = [];
this.transcripts = new Map(); // Map of episode id to transcript text
}
/**
* Load all data from JSON files and transcripts
*/
async load() {
console.error('Loading HPR data...');
// Load JSON files
this.episodes = this.loadJSON('hpr_metadata/episodes.json');
this.hosts = this.loadJSON('hpr_metadata/hosts.json');
this.comments = this.loadJSON('hpr_metadata/comments.json');
this.series = this.loadJSON('hpr_metadata/series.json');
console.error(`Loaded ${this.episodes.length} episodes`);
console.error(`Loaded ${this.hosts.length} hosts`);
console.error(`Loaded ${this.comments.length} comments`);
console.error(`Loaded ${this.series.length} series`);
// Load transcripts
this.loadTranscripts();
console.error(`Loaded ${this.transcripts.size} transcripts`);
console.error('HPR data loading complete!');
}
/**
* Load a JSON file
*/
loadJSON(relativePath) {
const filePath = join(__dirname, relativePath);
try {
const data = readFileSync(filePath, 'utf-8');
return JSON.parse(data);
} catch (error) {
console.error(`Error loading ${relativePath}:`, error.message);
return [];
}
}
/**
* Load all transcript files
*/
loadTranscripts() {
const transcriptsDir = join(__dirname, 'hpr_transcripts');
try {
const files = readdirSync(transcriptsDir);
for (const file of files) {
if (file.endsWith('.txt')) {
// Extract episode ID from filename (e.g., hpr0016.txt -> 16)
const match = file.match(/hpr(\d+)\.txt/);
if (match) {
const episodeId = parseInt(match[1], 10);
const filePath = join(transcriptsDir, file);
try {
const content = readFileSync(filePath, 'utf-8');
this.transcripts.set(episodeId, content);
} catch (error) {
console.error(`Error loading transcript ${file}:`, error.message);
}
}
}
}
} catch (error) {
console.error('Error loading transcripts directory:', error.message);
}
}
/**
* Get episode by ID
*/
getEpisode(id) {
return this.episodes.find(ep => ep.id === id);
}
/**
* Get host by ID
*/
getHost(id) {
return this.hosts.find(host => host.hostid === id);
}
/**
* Get series by ID
*/
getSeries(id) {
return this.series.find(s => s.id === id);
}
/**
* Get transcript for episode
*/
getTranscript(episodeId) {
return this.transcripts.get(episodeId);
}
/**
* Get comments for episode
*/
getCommentsForEpisode(episodeId) {
return this.comments.filter(c => c.eps_id === episodeId);
}
/**
* Get episodes by host
*/
getEpisodesByHost(hostId) {
return this.episodes.filter(ep => ep.hostid === hostId);
}
/**
* Get episodes in a series
*/
getEpisodesInSeries(seriesId) {
return this.episodes.filter(ep => ep.series === seriesId);
}
/**
* Search episodes by keyword in title, summary, or tags with fuzzy matching fallback
* Returns episodes with matchType indicator ('exact' or 'fuzzy')
*/
searchEpisodes(query, options = {}) {
const {
limit = 20,
hostId = null,
seriesId = null,
tag = null,
fromDate = null,
toDate = null,
maxDistance = 3 // More lenient for longer episode titles
} = options;
const queryLower = query.toLowerCase();
// Helper to check if episode matches filters (excluding query)
const matchesFilters = (ep) => {
const matchesHost = !hostId || ep.hostid === hostId;
const matchesSeries = seriesId === null || ep.series === seriesId;
const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase());
const matchesDateRange = (!fromDate || ep.date >= fromDate) &&
(!toDate || ep.date <= toDate);
return matchesHost && matchesSeries && matchesTag && matchesDateRange;
};
// Try exact substring match first (fast path)
let results = this.episodes.filter(ep => {
const matchesQuery = !query ||
ep.title.toLowerCase().includes(queryLower) ||
ep.summary.toLowerCase().includes(queryLower) ||
ep.tags.toLowerCase().includes(queryLower) ||
ep.notes.toLowerCase().includes(queryLower);
return matchesQuery && matchesFilters(ep);
}).map(ep => ({
...ep,
matchType: 'exact'
}));
// If no exact matches and we have a query, try fuzzy match on title
if (results.length === 0 && query && query.trim().length > 0) {
const fuzzyResults = this.episodes
.filter(matchesFilters)
.map(ep => {
// Check if any word in the title is close to the query
const titleWords = ep.title.toLowerCase().split(/\s+/);
let minDistance = Infinity;
for (const word of titleWords) {
const distance = levenshteinDistance(queryLower, word);
if (distance < minDistance) {
minDistance = distance;
}
}
return {
episode: ep,
distance: minDistance
};
})
.filter(result => result.distance <= maxDistance)
.sort((a, b) => a.distance - b.distance)
.map(result => ({
...result.episode,
matchType: 'fuzzy',
matchDistance: result.distance
}));
results = fuzzyResults;
}
// Sort by date (newest first), maintaining match quality
results.sort((a, b) => {
// If both are fuzzy matches, sort by distance first, then date
if (a.matchType === 'fuzzy' && b.matchType === 'fuzzy') {
const distDiff = (a.matchDistance || 0) - (b.matchDistance || 0);
if (distDiff !== 0) return distDiff;
}
return b.date.localeCompare(a.date);
});
return results.slice(0, limit);
}
/**
* Search transcripts by keyword
*/
searchTranscripts(query, options = {}) {
const {
limit = 20,
contextLines = 3,
terms = [],
matchMode = 'auto',
hostId = null,
hostName = null,
caseSensitive = false,
wholeWord = false,
maxMatchesPerEpisode = 5,
} = options;
const resolvedHostIds = new Set();
if (hostId) {
resolvedHostIds.add(Number(hostId));
}
if (hostName) {
const hostMatches = this.searchHosts(hostName);
hostMatches.forEach(host => resolvedHostIds.add(host.hostid));
}
const filterByHost = resolvedHostIds.size > 0;
const explicitTerms = Array.isArray(terms)
? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)
: [];
const splitQueryTerms = (matchMode === 'any' || matchMode === 'all')
? (query || '')
.split(/[|,;\n]/)
.map(part => part.trim())
.filter(Boolean)
: [];
const hasQuery = typeof query === 'string' && query.trim().length > 0;
let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
if (searchTerms.length === 0 && hasQuery) {
searchTerms = [query.trim()];
}
let resolvedMatchMode = matchMode;
if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
}
const effectiveTerms = resolvedMatchMode === 'phrase'
? [(hasQuery ? query.trim() : searchTerms[0] || '')].filter(Boolean)
: searchTerms;
if (effectiveTerms.length === 0) {
return [];
}
const regexFlags = caseSensitive ? 'g' : 'gi';
const matchers = effectiveTerms.map(term => {
if (!term) return null;
const escaped = escapeRegExp(term);
const pattern = wholeWord ? `\\b${escaped}\\b` : escaped;
try {
return {
term,
regex: new RegExp(pattern, regexFlags),
};
} catch (error) {
console.error(`Invalid search pattern for term "${term}":`, error.message);
return null;
}
}).filter(Boolean);
if (matchers.length === 0) {
return [];
}
const results = [];
for (const [episodeId, transcript] of this.transcripts) {
if (results.length >= limit) break;
const episode = this.getEpisode(episodeId);
if (!episode) continue;
if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
continue;
}
const lines = transcript.split(/\r?\n/);
const matches = [];
const matchedTerms = new Set();
const termHitCounts = new Map();
let truncated = false;
for (let index = 0; index < lines.length; index++) {
const line = lines[index];
const matchedOnLine = [];
for (const matcher of matchers) {
matcher.regex.lastIndex = 0;
if (matcher.regex.test(line)) {
matchedOnLine.push(matcher.term);
matchedTerms.add(matcher.term);
termHitCounts.set(matcher.term, (termHitCounts.get(matcher.term) || 0) + 1);
}
}
if (matchedOnLine.length > 0) {
const start = Math.max(0, index - contextLines);
const end = Math.min(lines.length, index + contextLines + 1);
const context = lines.slice(start, end).join('\n');
matches.push({
lineNumber: index + 1,
terms: [...new Set(matchedOnLine)],
context,
});
}
if (matches.length >= maxMatchesPerEpisode) {
truncated = true;
break;
}
}
if (matches.length === 0) {
continue;
}
if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) {
continue;
}
results.push({
episode,
matches,
matchSummary: {
matchMode: resolvedMatchMode,
matchedTerms: [...matchedTerms],
totalMatches: matches.length,
termHitCounts: Object.fromEntries(termHitCounts),
truncated,
},
});
}
return results;
}
/**
* Search hosts by name or email with fuzzy matching fallback
* Returns hosts with matchType indicator ('exact' or 'fuzzy')
*/
searchHosts(query, options = {}) {
const { maxDistance = 2 } = options;
const queryLower = query.toLowerCase();
// Try exact substring match first (fast path)
const exactMatches = this.hosts.filter(host =>
host.host.toLowerCase().includes(queryLower) ||
host.email.toLowerCase().includes(queryLower)
).map(host => ({
...host,
matchType: 'exact'
}));
if (exactMatches.length > 0) {
return exactMatches;
}
// Fall back to fuzzy match if no exact matches
const fuzzyMatches = this.hosts
.map(host => {
const hostLower = host.host.toLowerCase();
const emailLower = host.email.toLowerCase();
const hostDistance = levenshteinDistance(queryLower, hostLower);
const emailDistance = levenshteinDistance(queryLower, emailLower);
const minDistance = Math.min(hostDistance, emailDistance);
return {
host,
distance: minDistance
};
})
.filter(result => result.distance <= maxDistance)
.sort((a, b) => a.distance - b.distance)
.map(result => ({
...result.host,
matchType: 'fuzzy',
matchDistance: result.distance
}));
return fuzzyMatches;
}
/**
* Get statistics
*/
getStats() {
return {
totalEpisodes: this.episodes.length,
totalHosts: this.hosts.length,
totalComments: this.comments.length,
totalSeries: this.series.length,
totalTranscripts: this.transcripts.size,
dateRange: {
earliest: this.episodes.reduce((min, ep) => ep.date < min ? ep.date : min, this.episodes[0]?.date || ''),
latest: this.episodes.reduce((max, ep) => ep.date > max ? ep.date : max, this.episodes[0]?.date || '')
}
};
}
}
export default HPRDataLoader;