data-loader.js

import { readFileSync, readdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Escape every regular-expression metacharacter in `string` so the result
 * can be embedded in a RegExp source and match the input text literally.
 *
 * @param {string} string - Raw text to neutralise.
 * @returns {string} The text with each metacharacter preceded by a backslash.
 */
function escapeRegExp(string) {
  // Character class listing the characters that carry special meaning in a pattern.
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character, so each one gains a leading backslash.
  const backslashPrefixed = '\\$&';
  return string.replace(metacharacters, backslashPrefixed);
}

/**
 * Normalise an arbitrary field value to lower-case text for substring search.
 * `null`/`undefined` become the empty string so records with missing fields
 * never throw while being searched.
 *
 * @param {*} value - Field value or user-supplied query.
 * @returns {string} Lower-cased string form of `value`.
 */
function toSearchText(value) {
  return String(value ?? '').toLowerCase();
}

/**
 * In-memory store for HPR metadata (episodes, hosts, comments, series) and
 * episode transcripts, with lookup and search helpers on top.
 *
 * Call `load()` once before using the query methods; until then every
 * collection is empty.
 */
class HPRDataLoader {
  constructor() {
    this.episodes = [];
    this.hosts = [];
    this.comments = [];
    this.series = [];
    this.transcripts = new Map(); // Map of episode id (number) to transcript text
  }

  /**
   * Load all data from JSON files and transcripts.
   *
   * All reads are synchronous; the method is `async` only so callers can
   * `await` it. Progress is logged with `console.error`, i.e. to stderr.
   *
   * @returns {Promise<void>}
   */
  async load() {
    console.error('Loading HPR data...');

    // Load JSON files
    this.episodes = this.loadJSON('hpr_metadata/episodes.json');
    this.hosts = this.loadJSON('hpr_metadata/hosts.json');
    this.comments = this.loadJSON('hpr_metadata/comments.json');
    this.series = this.loadJSON('hpr_metadata/series.json');

    console.error(`Loaded ${this.episodes.length} episodes`);
    console.error(`Loaded ${this.hosts.length} hosts`);
    console.error(`Loaded ${this.comments.length} comments`);
    console.error(`Loaded ${this.series.length} series`);

    // Load transcripts
    this.loadTranscripts();

    console.error(`Loaded ${this.transcripts.size} transcripts`);
    console.error('HPR data loading complete!');
  }

  /**
   * Load a JSON file located relative to this module's directory.
   *
   * Best-effort: a missing or malformed file is logged and yields an empty
   * array, so one bad file does not prevent the rest from loading.
   *
   * @param {string} relativePath - Path relative to this module's directory.
   * @returns {*} The parsed JSON value, or `[]` on any read/parse error.
   */
  loadJSON(relativePath) {
    const filePath = join(__dirname, relativePath);
    try {
      const data = readFileSync(filePath, 'utf-8');
      return JSON.parse(data);
    } catch (error) {
      console.error(`Error loading ${relativePath}:`, error.message);
      return [];
    }
  }

  /**
   * Load all transcript files from the `hpr_transcripts` directory next to
   * this module into `this.transcripts`.
   *
   * Only `.txt` files whose name contains `hpr<digits>.txt` are considered;
   * the digits become the numeric episode id (e.g. hpr0016.txt -> 16).
   * Unreadable files and an unreadable directory are logged and skipped.
   */
  loadTranscripts() {
    const transcriptsDir = join(__dirname, 'hpr_transcripts');

    let files;
    try {
      files = readdirSync(transcriptsDir);
    } catch (error) {
      console.error('Error loading transcripts directory:', error.message);
      return;
    }

    for (const file of files) {
      if (!file.endsWith('.txt')) continue;

      // Extract episode ID from filename (e.g., hpr0016.txt -> 16)
      const match = file.match(/hpr(\d+)\.txt/);
      if (!match) continue;

      const episodeId = parseInt(match[1], 10);
      try {
        const content = readFileSync(join(transcriptsDir, file), 'utf-8');
        this.transcripts.set(episodeId, content);
      } catch (error) {
        console.error(`Error loading transcript ${file}:`, error.message);
      }
    }
  }

  /**
   * Get episode by ID.
   *
   * @param {*} id - Episode id, compared with `===` against `episode.id`.
   * @returns {object|undefined} The first matching episode, if any.
   */
  getEpisode(id) {
    return this.episodes.find(ep => ep.id === id);
  }

  /**
   * Get host by ID.
   *
   * @param {*} id - Host id, compared with `===` against `host.hostid`.
   * @returns {object|undefined} The first matching host, if any.
   */
  getHost(id) {
    return this.hosts.find(host => host.hostid === id);
  }

  /**
   * Get series by ID.
   *
   * @param {*} id - Series id, compared with `===` against `series.id`.
   * @returns {object|undefined} The first matching series, if any.
   */
  getSeries(id) {
    return this.series.find(s => s.id === id);
  }

  /**
   * Get transcript for episode.
   *
   * @param {number} episodeId - Numeric episode id (transcript map key).
   * @returns {string|undefined} Transcript text, or `undefined` if none is loaded.
   */
  getTranscript(episodeId) {
    return this.transcripts.get(episodeId);
  }

  /**
   * Get comments for episode.
   *
   * @param {*} episodeId - Compared with `===` against `comment.eps_id`.
   * @returns {object[]} Matching comments (possibly empty).
   */
  getCommentsForEpisode(episodeId) {
    return this.comments.filter(c => c.eps_id === episodeId);
  }

  /**
   * Get episodes by host.
   *
   * @param {*} hostId - Compared with `===` against `episode.hostid`.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesByHost(hostId) {
    return this.episodes.filter(ep => ep.hostid === hostId);
  }

  /**
   * Get episodes in a series.
   *
   * @param {*} seriesId - Compared with `===` against `episode.series`.
   * @returns {object[]} Matching episodes (possibly empty).
   */
  getEpisodesInSeries(seriesId) {
    return this.episodes.filter(ep => ep.series === seriesId);
  }

  /**
   * Search episodes by keyword in title, summary, tags, or notes.
   *
   * The keyword match is a case-insensitive substring test. A missing or
   * empty `query` matches every episode, and episodes with missing text
   * fields are treated as having empty text instead of throwing.
   *
   * @param {string} [query] - Keyword to look for; empty/nullish disables the keyword filter.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of episodes returned.
   * @param {*} [options.hostId=null] - Keep only episodes whose `hostid === hostId`.
   * @param {*} [options.seriesId=null] - Keep only episodes whose `series === seriesId` (`null` disables).
   * @param {string} [options.tag=null] - Case-insensitive substring that must occur in `tags`.
   * @param {string} [options.fromDate=null] - Inclusive lower bound, compared against `date` with `>=`.
   * @param {string} [options.toDate=null] - Inclusive upper bound, compared against `date` with `<=`.
   * @returns {object[]} Matching episodes, newest first, at most `limit` entries.
   */
  searchEpisodes(query, options = {}) {
    const {
      limit = 20,
      hostId = null,
      seriesId = null,
      tag = null,
      fromDate = null,
      toDate = null
    } = options ?? {};

    const needle = toSearchText(query);
    const tagNeedle = tag ? toSearchText(tag) : '';

    const results = this.episodes.filter(ep => {
      // Basic text search
      const matchesQuery = !needle ||
        [ep.title, ep.summary, ep.tags, ep.notes]
          .some(field => toSearchText(field).includes(needle));

      // Filter by host
      const matchesHost = !hostId || ep.hostid === hostId;

      // Filter by series
      const matchesSeries = seriesId === null || ep.series === seriesId;

      // Filter by tag
      const matchesTag = !tagNeedle || toSearchText(ep.tags).includes(tagNeedle);

      // Filter by date range
      const matchesDateRange = (!fromDate || ep.date >= fromDate) &&
                                (!toDate || ep.date <= toDate);

      return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange;
    });

    // Sort by date (newest first); a missing date sorts as the empty string.
    results.sort((a, b) => String(b.date ?? '').localeCompare(String(a.date ?? '')));

    return results.slice(0, limit);
  }

  /**
   * Search transcripts by keyword.
   *
   * Term resolution:
   *  - `options.terms` (non-empty strings) take precedence over `query`;
   *  - in `'any'`/`'all'` mode a `query` is split on `|`, `,`, `;` or newline;
   *  - otherwise the trimmed `query` is the single term.
   * `matchMode: 'auto'` (or any unknown value) resolves to `'any'` when more
   * than one term was found and to `'phrase'` otherwise. In `'phrase'` mode
   * the trimmed `query` is used when present, else the first term.
   *
   * Matching is done line by line. Every transcript line is scanned, so
   * `matchedTerms` and `termHitCounts` describe the whole transcript, while
   * only the first `maxMatchesPerEpisode` matching lines are returned with
   * context. `'all'` mode therefore recognises a term even when it first
   * appears after the per-episode cap was reached.
   *
   * @param {string} [query] - Search text.
   * @param {object} [options]
   * @param {number} [options.limit=20] - Maximum number of episodes returned.
   * @param {number} [options.contextLines=3] - Lines of context before and after each matching line.
   * @param {Array} [options.terms=[]] - Explicit search terms.
   * @param {string} [options.matchMode='auto'] - `'any'`, `'all'`, `'phrase'` or `'auto'`.
   * @param {*} [options.hostId=null] - Restrict to this host id (coerced with `Number`).
   * @param {string} [options.hostName=null] - Restrict to hosts found by `searchHosts(hostName)`.
   * @param {boolean} [options.caseSensitive=false] - Match case exactly.
   * @param {boolean} [options.wholeWord=false] - Require that the term is not adjacent to a word character.
   * @param {number} [options.maxMatchesPerEpisode=5] - Cap on returned matches per episode.
   * @returns {Array<{episode: object, matches: Array<{lineNumber: number, terms: string[], context: string}>, matchSummary: object}>}
   *   One entry per matching episode, in transcript-map iteration order.
   *   `matchSummary.totalMatches` is the number of returned matches,
   *   `matchSummary.totalMatchingLines` the number of matching lines in the
   *   transcript, and `matchSummary.truncated` is true only when matching
   *   lines were omitted because of the cap.
   */
  searchTranscripts(query, options = {}) {
    const {
      limit = 20,
      contextLines = 3,
      terms = [],
      matchMode = 'auto',
      hostId = null,
      hostName = null,
      caseSensitive = false,
      wholeWord = false,
      maxMatchesPerEpisode = 5,
    } = options ?? {};

    const resolvedHostIds = new Set();
    if (hostId) {
      resolvedHostIds.add(Number(hostId));
    }
    if (hostName) {
      for (const host of this.searchHosts(hostName)) {
        resolvedHostIds.add(host.hostid);
      }
    }
    // A host filter is active whenever one was requested. If the requested
    // host name resolves to nobody, nothing can match; the search must not
    // silently fall back to all hosts.
    const filterByHost = Boolean(hostId || hostName);
    if (filterByHost && resolvedHostIds.size === 0) {
      return [];
    }

    const hasQuery = typeof query === 'string' && query.trim().length > 0;
    const trimmedQuery = hasQuery ? query.trim() : '';

    const explicitTerms = Array.isArray(terms)
      ? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)
      : [];

    const splitQueryTerms = hasQuery && (matchMode === 'any' || matchMode === 'all')
      ? trimmedQuery
          .split(/[|,;\n]/)
          .map(part => part.trim())
          .filter(Boolean)
      : [];

    let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
    if (searchTerms.length === 0 && hasQuery) {
      searchTerms = [trimmedQuery];
    }

    let resolvedMatchMode = matchMode;
    if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
      resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
    }

    const effectiveTerms = resolvedMatchMode === 'phrase'
      ? [(hasQuery ? trimmedQuery : searchTerms[0] || '')].filter(Boolean)
      : searchTerms;

    // De-duplicate: 'all' mode compares the number of distinct matched terms
    // with the number of matchers, so a repeated term could never be satisfied.
    const uniqueTerms = [...new Set(effectiveTerms)];
    if (uniqueTerms.length === 0) {
      return [];
    }

    // No `g` flag: `.test()` on a global regex is stateful via `lastIndex`.
    const regexFlags = caseSensitive ? '' : 'i';
    const matchers = uniqueTerms.map(term => {
      const escaped = escapeRegExp(term);
      // Look-arounds instead of `\b`, so terms that start or end with a
      // non-word character (e.g. "c++") can still match as whole words.
      const pattern = wholeWord ? `(?<!\\w)${escaped}(?!\\w)` : escaped;
      try {
        return {
          term,
          regex: new RegExp(pattern, regexFlags),
        };
      } catch (error) {
        console.error(`Invalid search pattern for term "${term}":`, error.message);
        return null;
      }
    }).filter(Boolean);

    if (matchers.length === 0) {
      return [];
    }

    // Index episodes once instead of a linear lookup per transcript. The
    // first episode wins on duplicate ids, like `Array.prototype.find`.
    const episodesById = new Map();
    for (const ep of this.episodes) {
      if (!episodesById.has(ep.id)) {
        episodesById.set(ep.id, ep);
      }
    }

    const results = [];

    for (const [episodeId, transcript] of this.transcripts) {
      if (results.length >= limit) break;

      const episode = episodesById.get(episodeId);
      if (!episode) continue;

      if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
        continue;
      }

      const lines = transcript.split(/\r?\n/);
      const matches = [];
      const matchedTerms = new Set();
      const termHitCounts = new Map();
      let matchingLineCount = 0;

      for (let index = 0; index < lines.length; index++) {
        const line = lines[index];
        const termsOnLine = matchers
          .filter(matcher => matcher.regex.test(line))
          .map(matcher => matcher.term);

        if (termsOnLine.length === 0) continue;

        matchingLineCount += 1;
        for (const term of termsOnLine) {
          matchedTerms.add(term);
          termHitCounts.set(term, (termHitCounts.get(term) || 0) + 1);
        }

        // Keep counting hits after the cap, but stop collecting context.
        if (matches.length >= maxMatchesPerEpisode) continue;

        const start = Math.max(0, index - contextLines);
        const end = Math.min(lines.length, index + contextLines + 1);

        matches.push({
          lineNumber: index + 1,
          terms: termsOnLine,
          context: lines.slice(start, end).join('\n'),
        });
      }

      if (matches.length === 0) {
        continue;
      }

      if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) {
        continue;
      }

      results.push({
        episode,
        matches,
        matchSummary: {
          matchMode: resolvedMatchMode,
          matchedTerms: [...matchedTerms],
          totalMatches: matches.length,
          totalMatchingLines: matchingLineCount,
          termHitCounts: Object.fromEntries(termHitCounts),
          truncated: matchingLineCount > matches.length,
        },
      });
    }

    return results;
  }

  /**
   * Search hosts by name or email.
   *
   * Case-insensitive substring match against `host.host` and `host.email`.
   * Missing fields are treated as empty text; an empty or nullish query
   * matches every host.
   *
   * @param {string} query - Text to look for.
   * @returns {object[]} Matching hosts (possibly empty).
   */
  searchHosts(query) {
    const needle = toSearchText(query);
    return this.hosts.filter(host =>
      toSearchText(host.host).includes(needle) ||
      toSearchText(host.email).includes(needle)
    );
  }

  /**
   * Get statistics about the loaded data.
   *
   * `dateRange` is computed by plain string comparison of `episode.date`,
   * so it is only meaningful if dates are stored in a lexicographically
   * sortable form (presumably YYYY-MM-DD; confirm against the data).
   * Episodes without a date are ignored; both bounds are `''` when no
   * episode has a date.
   *
   * @returns {{totalEpisodes: number, totalHosts: number, totalComments: number,
   *   totalSeries: number, totalTranscripts: number,
   *   dateRange: {earliest: string, latest: string}}}
   */
  getStats() {
    let earliest = '';
    let latest = '';
    for (const { date } of this.episodes) {
      if (!date) continue;
      if (earliest === '' || date < earliest) earliest = date;
      if (latest === '' || date > latest) latest = date;
    }

    return {
      totalEpisodes: this.episodes.length,
      totalHosts: this.hosts.length,
      totalComments: this.comments.length,
      totalSeries: this.series.length,
      totalTranscripts: this.transcripts.size,
      dateRange: { earliest, latest }
    };
  }
}

export default HPRDataLoader;
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`import { readFileSync, readdirSync } from 'fs';`
			`import { join, dirname } from 'path';`
			`import { fileURLToPath } from 'url';`

			`const __filename = fileURLToPath(import.meta.url);`
			`const __dirname = dirname(__filename);`

Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`function escapeRegExp(string) {`
			`return string.replace(/[.*+?^${}()\|[\]\\]/g, '\\$&');`
			`}`

Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`class HPRDataLoader {`
			`constructor() {`
			`this.episodes = [];`
			`this.hosts = [];`
			`this.comments = [];`
			`this.series = [];`
			`this.transcripts = new Map(); // Map of episode id to transcript text`
			`}`

			`/**`
			`* Load all data from JSON files and transcripts`
			`*/`
			`async load() {`
			`console.error('Loading HPR data...');`

			`// Load JSON files`
			`this.episodes = this.loadJSON('hpr_metadata/episodes.json');`
			`this.hosts = this.loadJSON('hpr_metadata/hosts.json');`
			`this.comments = this.loadJSON('hpr_metadata/comments.json');`
			`this.series = this.loadJSON('hpr_metadata/series.json');`

			console.error(`Loaded ${this.episodes.length} episodes`);
			console.error(`Loaded ${this.hosts.length} hosts`);
			console.error(`Loaded ${this.comments.length} comments`);
			console.error(`Loaded ${this.series.length} series`);

			`// Load transcripts`
			`this.loadTranscripts();`

			console.error(`Loaded ${this.transcripts.size} transcripts`);
			`console.error('HPR data loading complete!');`
			`}`

			`/**`
			`* Load a JSON file`
			`*/`
			`loadJSON(relativePath) {`
			`const filePath = join(__dirname, relativePath);`
			`try {`
			`const data = readFileSync(filePath, 'utf-8');`
			`return JSON.parse(data);`
			`} catch (error) {`
			console.error(`Error loading ${relativePath}:`, error.message);
			`return [];`
			`}`
			`}`

			`/**`
			`* Load all transcript files`
			`*/`
			`loadTranscripts() {`
			`const transcriptsDir = join(__dirname, 'hpr_transcripts');`
			`try {`
			`const files = readdirSync(transcriptsDir);`

			`for (const file of files) {`
			`if (file.endsWith('.txt')) {`
			`// Extract episode ID from filename (e.g., hpr0016.txt -> 16)`
			`const match = file.match(/hpr(\d+)\.txt/);`
			`if (match) {`
			`const episodeId = parseInt(match[1], 10);`
			`const filePath = join(transcriptsDir, file);`
			`try {`
			`const content = readFileSync(filePath, 'utf-8');`
			`this.transcripts.set(episodeId, content);`
			`} catch (error) {`
			console.error(`Error loading transcript ${file}:`, error.message);
			`}`
			`}`
			`}`
			`}`
			`} catch (error) {`
			`console.error('Error loading transcripts directory:', error.message);`
			`}`
			`}`

			`/**`
			`* Get episode by ID`
			`*/`
			`getEpisode(id) {`
			`return this.episodes.find(ep => ep.id === id);`
			`}`

			`/**`
			`* Get host by ID`
			`*/`
			`getHost(id) {`
			`return this.hosts.find(host => host.hostid === id);`
			`}`

			`/**`
			`* Get series by ID`
			`*/`
			`getSeries(id) {`
			`return this.series.find(s => s.id === id);`
			`}`

			`/**`
			`* Get transcript for episode`
			`*/`
			`getTranscript(episodeId) {`
			`return this.transcripts.get(episodeId);`
			`}`

			`/**`
			`* Get comments for episode`
			`*/`
			`getCommentsForEpisode(episodeId) {`
			`return this.comments.filter(c => c.eps_id === episodeId);`
			`}`

			`/**`
			`* Get episodes by host`
			`*/`
			`getEpisodesByHost(hostId) {`
			`return this.episodes.filter(ep => ep.hostid === hostId);`
			`}`

			`/**`
			`* Get episodes in a series`
			`*/`
			`getEpisodesInSeries(seriesId) {`
			`return this.episodes.filter(ep => ep.series === seriesId);`
			`}`

			`/**`
			`* Search episodes by keyword in title, summary, or tags`
			`*/`
			`searchEpisodes(query, options = {}) {`
			`const {`
			`limit = 20,`
			`hostId = null,`
			`seriesId = null,`
			`tag = null,`
			`fromDate = null,`
			`toDate = null`
			`} = options;`

			`const queryLower = query.toLowerCase();`
			`let results = this.episodes.filter(ep => {`
			`// Basic text search`
			`const matchesQuery = !query \|\|`
			`ep.title.toLowerCase().includes(queryLower) \|\|`
			`ep.summary.toLowerCase().includes(queryLower) \|\|`
			`ep.tags.toLowerCase().includes(queryLower) \|\|`
			`ep.notes.toLowerCase().includes(queryLower);`

			`// Filter by host`
			`const matchesHost = !hostId \|\| ep.hostid === hostId;`

			`// Filter by series`
			`const matchesSeries = seriesId === null \|\| ep.series === seriesId;`

			`// Filter by tag`
			`const matchesTag = !tag \|\| ep.tags.toLowerCase().includes(tag.toLowerCase());`

			`// Filter by date range`
			`const matchesDateRange = (!fromDate \|\| ep.date >= fromDate) &&`
			`(!toDate \|\| ep.date <= toDate);`

			`return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange;`
			`});`

			`// Sort by date (newest first)`
			`results.sort((a, b) => b.date.localeCompare(a.date));`

			`return results.slice(0, limit);`
			`}`

			`/**`
			`* Search transcripts by keyword`
			`*/`
			`searchTranscripts(query, options = {}) {`
Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`const {`
			`limit = 20,`
			`contextLines = 3,`
			`terms = [],`
			`matchMode = 'auto',`
			`hostId = null,`
			`hostName = null,`
			`caseSensitive = false,`
			`wholeWord = false,`
			`maxMatchesPerEpisode = 5,`
			`} = options;`

			`const resolvedHostIds = new Set();`
			`if (hostId) {`
			`resolvedHostIds.add(Number(hostId));`
			`}`
			`if (hostName) {`
			`const hostMatches = this.searchHosts(hostName);`
			`hostMatches.forEach(host => resolvedHostIds.add(host.hostid));`
			`}`
			`const filterByHost = resolvedHostIds.size > 0;`

			`const explicitTerms = Array.isArray(terms)`
			`? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)`
			`: [];`

			`const splitQueryTerms = (matchMode === 'any' \|\| matchMode === 'all')`
			`? (query \|\| '')`
			`.split(/[\|,;\n]/)`
			`.map(part => part.trim())`
			`.filter(Boolean)`
			`: [];`

			`const hasQuery = typeof query === 'string' && query.trim().length > 0;`

			`let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;`
			`if (searchTerms.length === 0 && hasQuery) {`
			`searchTerms = [query.trim()];`
			`}`

			`let resolvedMatchMode = matchMode;`
			`if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {`
			`resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';`
			`}`

			`const effectiveTerms = resolvedMatchMode === 'phrase'`
			`? [(hasQuery ? query.trim() : searchTerms[0] \|\| '')].filter(Boolean)`
			`: searchTerms;`

			`if (effectiveTerms.length === 0) {`
			`return [];`
			`}`

			`const regexFlags = caseSensitive ? 'g' : 'gi';`
			`const matchers = effectiveTerms.map(term => {`
			`if (!term) return null;`
			`const escaped = escapeRegExp(term);`
			const pattern = wholeWord ? `\\b${escaped}\\b` : escaped;
			`try {`
			`return {`
			`term,`
			`regex: new RegExp(pattern, regexFlags),`
			`};`
			`} catch (error) {`
			console.error(`Invalid search pattern for term "${term}":`, error.message);
			`return null;`
			`}`
			`}).filter(Boolean);`

			`if (matchers.length === 0) {`
			`return [];`
			`}`

Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`const results = [];`

			`for (const [episodeId, transcript] of this.transcripts) {`
Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`if (results.length >= limit) break;`

			`const episode = this.getEpisode(episodeId);`
			`if (!episode) continue;`

			`if (filterByHost && !resolvedHostIds.has(episode.hostid)) {`
			`continue;`
			`}`

			`const lines = transcript.split(/\r?\n/);`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`const matches = [];`
Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`const matchedTerms = new Set();`
			`const termHitCounts = new Map();`
			`let truncated = false;`

			`for (let index = 0; index < lines.length; index++) {`
			`const line = lines[index];`
			`const matchedOnLine = [];`

			`for (const matcher of matchers) {`
			`matcher.regex.lastIndex = 0;`
			`if (matcher.regex.test(line)) {`
			`matchedOnLine.push(matcher.term);`
			`matchedTerms.add(matcher.term);`
			`termHitCounts.set(matcher.term, (termHitCounts.get(matcher.term) \|\| 0) + 1);`
			`}`
			`}`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00
Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`if (matchedOnLine.length > 0) {`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`const start = Math.max(0, index - contextLines);`
			`const end = Math.min(lines.length, index + contextLines + 1);`
			`const context = lines.slice(start, end).join('\n');`

			`matches.push({`
			`lineNumber: index + 1,`
Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`terms: [...new Set(matchedOnLine)],`
			`context,`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`});`
			`}`

Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`if (matches.length >= maxMatchesPerEpisode) {`
			`truncated = true;`
			`break;`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`}`
			`}`

Enhance transcript search tooling with flexible matching 2025-10-26 14:46:24 +00:00			`if (matches.length === 0) {`
			`continue;`
			`}`

			`if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) {`
			`continue;`
			`}`

			`results.push({`
			`episode,`
			`matches,`
			`matchSummary: {`
			`matchMode: resolvedMatchMode,`
			`matchedTerms: [...matchedTerms],`
			`totalMatches: matches.length,`
			`termHitCounts: Object.fromEntries(termHitCounts),`
			`truncated,`
			`},`
			`});`
Initial commit: HPR Knowledge Base MCP Server - MCP server with stdio transport for local use - Search episodes, transcripts, hosts, and series - 4,511 episodes with metadata and transcripts - Data loader with in-memory JSON storage 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-10-26 10:54:13 +00:00			`}`

			`return results;`
			`}`

			`/**`
			`* Search hosts by name or email`
			`*/`
			`searchHosts(query) {`
			`const queryLower = query.toLowerCase();`
			`return this.hosts.filter(host =>`
			`host.host.toLowerCase().includes(queryLower) \|\|`
			`host.email.toLowerCase().includes(queryLower)`
			`);`
			`}`

			`/**`
			`* Get statistics`
			`*/`
			`getStats() {`
			`return {`
			`totalEpisodes: this.episodes.length,`
			`totalHosts: this.hosts.length,`
			`totalComments: this.comments.length,`
			`totalSeries: this.series.length,`
			`totalTranscripts: this.transcripts.size,`
			`dateRange: {`
			`earliest: this.episodes.reduce((min, ep) => ep.date < min ? ep.date : min, this.episodes[0]?.date \|\| ''),`
			`latest: this.episodes.reduce((max, ep) => ep.date > max ? ep.date : max, this.episodes[0]?.date \|\| '')`
			`}`
			`};`
			`}`
			`}`

			`export default HPRDataLoader;`