Enhance transcript search tooling with flexible matching

2025-10-26 14:46:24 +00:00
parent 36aa12acb5
commit 98177f3fd5
4 changed files with 448 additions and 83 deletions
@@ -138,16 +138,29 @@ Get details for episode 16 including transcript and comments

 ### 3. `search_transcripts`

-Search through episode transcripts for specific keywords.
+Search through episode transcripts for phrases or multiple terms with flexible matching.

 **Parameters:**
- `query` (string, required): Search query
- `limit` (number, optional): Maximum episodes to return (default: 20)
- `contextLines` (number, optional): Lines of context around matches (default: 3)
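+- `query` (string, optional): Phrase to search for. Useful for exact-phrase lookups. When `matchMode` is `'any'` or `'all'` and no `terms` are given, the query is split into terms on `|`, `,`, `;` or newlines. A search with neither `query` nor `terms` returns no results.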
+- `terms` (string[], optional): Explicit list of terms to search for; combine with `matchMode` for logical AND/OR searches.
+- `matchMode` (`'phrase' | 'any' | 'all'`, optional): How to combine `query`/`terms`. When omitted, a single phrase or term is matched as `'phrase'`, while a list of several `terms` is matched as `'any'`. Use `'any'` to match if any term is present, `'all'` to require every term somewhere in the transcript.
+- `limit` (number, optional): Maximum episodes to return (default: 20).
+- `contextLines` (number, optional): Lines of context to include around each match (default: 3).
+- `hostId` (number, optional): Only return matches for this host ID.
+- `hostName` (string, optional): Only return matches for hosts whose name includes this value.
+- `caseSensitive` (boolean, optional): Treat terms as case-sensitive (default: false).
+- `wholeWord` (boolean, optional): Match whole words only (default: false).
+- `maxMatchesPerEpisode` (number, optional): Maximum number of excerpts per episode (default: 5).

-**Example:**
+**Example queries:**
 ```
-Search transcripts for mentions of "virtual machine"
+Find transcripts mentioning "virtual machine"
+```
+```
+Find transcripts where klaatu talks about bash or python
+```
+```
+List episodes where host ID 123 mentions "encryption" and "privacy" (require all terms)
 ```

 ### 4. `get_host_info`
@@ -5,6 +5,10 @@ import { fileURLToPath } from 'url';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);

+/**
+ * Backslash-escape every regular-expression metacharacter in `string` so the
+ * result can be embedded in a RegExp pattern and match the input literally.
+ *
+ * @param {string} string - Raw text to neutralise.
+ * @returns {string} The same text with each metacharacter prefixed by `\`.
+ */
+function escapeRegExp(string) {
+  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
+  return string.replace(metaCharacters, '\\$&');
+}
+
 class HPRDataLoader {
  constructor() {
    this.episodes = [];
@@ -178,42 +182,148 @@ class HPRDataLoader {
   * Search transcripts by keyword
   */
  searchTranscripts(query, options = {}) {
-    const { limit = 20, contextLines = 3 } = options;
-    const queryLower = query.toLowerCase();
+    // Options (all optional):
+    //   limit                - maximum number of episodes returned (default 20)
+    //   contextLines         - transcript lines kept on each side of a hit (default 3)
+    //   terms                - explicit terms; take precedence over terms split from `query`
+    //   matchMode            - 'phrase' | 'any' | 'all'; any other value resolves to 'any'
+    //                          when several terms are in play and to 'phrase' otherwise
+    //   hostId / hostName    - restrict results to these hosts (union of both filters)
+    //   caseSensitive        - match case exactly (default false)
+    //   wholeWord            - reject hits that touch a neighbouring word character (default false)
+    //   maxMatchesPerEpisode - cap on excerpts collected per episode (default 5)
+    // Returns an array of { episode, matches, matchSummary } objects.
+    const {
+      limit = 20,
+      contextLines = 3,
+      terms = [],
+      matchMode = 'auto',
+      hostId = null,
+      hostName = null,
+      caseSensitive = false,
+      wholeWord = false,
+      maxMatchesPerEpisode = 5,
+    } = options;
+
+    const resolvedHostIds = new Set();
+    if (hostId) {
+      resolvedHostIds.add(Number(hostId));
+    }
+    if (hostName) {
+      const hostMatches = this.searchHosts(hostName);
+      hostMatches.forEach(host => resolvedHostIds.add(host.hostid));
+    }
+    const filterByHost = Boolean(hostId) || Boolean(hostName);
+    if (filterByHost && resolvedHostIds.size === 0) {
+      // A host filter was requested but no host matched it. Falling through
+      // would silently search every host, so report "nothing found" instead.
+      return [];
+    }
+
+    // Tolerate a missing or non-string query instead of throwing on .split().
+    const queryText = typeof query === 'string' ? query.trim() : '';
+    const hasQuery = queryText.length > 0;
+
+    const explicitTerms = Array.isArray(terms)
+      ? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)
+      : [];
+
+    // In 'any'/'all' mode a query such as "bash, python" doubles as a term list.
+    const splitQueryTerms = (matchMode === 'any' || matchMode === 'all')
+      ? queryText
+          .split(/[|,;\n]/)
+          .map(part => part.trim())
+          .filter(Boolean)
+      : [];
+
+    let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
+    if (searchTerms.length === 0 && hasQuery) {
+      searchTerms = [queryText];
+    }
+
+    let resolvedMatchMode = matchMode;
+    if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
+      resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
+    }
+
+    const selectedTerms = resolvedMatchMode === 'phrase'
+      ? [hasQuery ? queryText : searchTerms[0] || ''].filter(Boolean)
+      : searchTerms;
+
+    // Duplicate terms would make 'all' unsatisfiable, because the set of
+    // matched terms could never grow as large as the matcher list.
+    const effectiveTerms = [...new Set(selectedTerms)];
+
+    if (effectiveTerms.length === 0) {
+      return [];
+    }
+
+    // No 'g' flag: test() on a global regex is stateful via lastIndex.
+    const regexFlags = caseSensitive ? '' : 'i';
+    const matchers = effectiveTerms.map(term => {
+      const escaped = escapeRegExp(term);
+      // Lookarounds behave like \b for ordinary words but also work for terms
+      // that start or end with a non-word character (e.g. "c++").
+      const pattern = wholeWord ? `(?<!\\w)${escaped}(?!\\w)` : escaped;
+      return { term, regex: new RegExp(pattern, regexFlags) };
+    });
+
    const results = [];

    for (const [episodeId, transcript] of this.transcripts) {
-      const lines = transcript.split('\n');
-      const matches = [];
+      if (results.length >= limit) break;

-      // Find all matching lines
-      lines.forEach((line, index) => {
-        if (line.toLowerCase().includes(queryLower)) {
-          // Get context around the match
+      const episode = this.getEpisode(episodeId);
+      if (!episode) continue;
+
+      if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
+        continue;
+      }
+
+      const lines = transcript.split(/\r?\n/);
+      const matches = [];
+      const matchedTerms = new Set();
+      const termHitCounts = new Map();
+      let truncated = false;
+
+      // The whole transcript is scanned even after the excerpt cap is hit, so
+      // that 'all' mode sees terms occurring late in the episode and the
+      // per-term line counts cover the full transcript.
+      for (let index = 0; index < lines.length; index++) {
+        const line = lines[index];
+        const matchedOnLine = matchers
+          .filter(matcher => matcher.regex.test(line))
+          .map(matcher => matcher.term);
+        if (matchedOnLine.length === 0) continue;
+
+        for (const term of matchedOnLine) {
+          matchedTerms.add(term);
+          termHitCounts.set(term, (termHitCounts.get(term) || 0) + 1);
+        }
+
+        const hasRoom = matches.length < maxMatchesPerEpisode;
+        if (hasRoom) {
          const start = Math.max(0, index - contextLines);
          const end = Math.min(lines.length, index + contextLines + 1);
          const context = lines.slice(start, end).join('\n');

          matches.push({
            lineNumber: index + 1,
-            line: line.trim(),
-            context: context
+            terms: matchedOnLine,
+            context,
          });
        }
-      });

-      if (matches.length > 0) {
-        const episode = this.getEpisode(episodeId);
-        if (episode) {
+        if (!hasRoom) {
+          // A matching line was actually dropped, so the excerpt list is incomplete.
+          truncated = true;
+        }
+      }
+
+      if (matches.length === 0) {
+        continue;
+      }
+
+      if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) {
+        continue;
+      }
+
      results.push({
        episode,
-            matches: matches.slice(0, 5) // Limit matches per episode
+        matches,
+        matchSummary: {
+          matchMode: resolvedMatchMode,
+          matchedTerms: [...matchedTerms],
+          // Number of excerpts captured (bounded by maxMatchesPerEpisode).
+          totalMatches: matches.length,
+          // Matching-line count per term across the whole transcript.
+          termHitCounts: Object.fromEntries(termHitCounts),
+          truncated,
+        },
      });
    }
-      }
-
-      if (results.length >= limit) break;
-    }

    return results;
  }
@@ -69,6 +69,97 @@ ${episode.summary}`;
  return result;
 }

+/**
+ * Render transcript search results as a Markdown report: a header echoing the
+ * search settings, a one-line-per-episode summary, then a section per episode
+ * with its captured excerpts.
+ *
+ * @param {Array<object>} results - Entries carrying `episode`, `matches` and `matchSummary`.
+ * @param {object} args - Tool arguments (query, terms, contextLines, caseSensitive,
+ *   wholeWord, maxMatchesPerEpisode, hostId, hostName) echoed in the header.
+ * @returns {string} The Markdown report, or '' when `results` is empty.
+ */
+function formatTranscriptSearchResults(results, args) {
+  if (results.length === 0) {
+    return '';
+  }
+
+  // Small formatters shared by the summary list and the per-episode sections.
+  const episodeCode = episode => `HPR${String(episode.id).padStart(4, '0')}`;
+  const hostLabel = episode => dataLoader.getHost(episode.hostid)?.host || 'Unknown';
+  const joinedTerms = summary => (
+    summary.matchedTerms.length > 0 ? summary.matchedTerms.join(', ') : 'N/A'
+  );
+  const joinedCounts = summary => {
+    const pairs = Object.entries(summary.termHitCounts || {});
+    return pairs.length > 0
+      ? pairs.map(([term, count]) => `${term}: ${count}`).join(', ')
+      : null;
+  };
+
+  const searchedFor = [];
+  if (args.query) {
+    searchedFor.push(`phrase="${args.query}"`);
+  }
+  if (Array.isArray(args.terms) && args.terms.length > 0) {
+    searchedFor.push(`terms=[${args.terms.join(', ')}]`);
+  }
+  if (searchedFor.length === 0) {
+    searchedFor.push('"no explicit query provided"');
+  }
+
+  const hostFilters = [];
+  if (args.hostId) hostFilters.push(`ID ${args.hostId}`);
+  if (args.hostName) hostFilters.push(`name "${args.hostName}"`);
+
+  // The match mode is identical for every result, so read it from the first one.
+  const { matchMode: modeFromResults } = results[0]?.matchSummary || {};
+  const settings = [
+    `Match mode: ${modeFromResults || 'phrase'}`,
+    `Context lines: ${args.contextLines ?? 3}`,
+    `Case sensitive: ${args.caseSensitive ? 'yes' : 'no'}`,
+    `Whole word: ${args.wholeWord ? 'yes' : 'no'}`,
+  ].join(' | ');
+
+  const header = [
+    `# Transcript Search Results (${results.length} episodes)`,
+    '',
+    `Searching for: ${searchedFor.join(' | ')}`,
+    settings,
+    `Maximum matches per episode: ${args.maxMatchesPerEpisode ?? 5}`,
+  ];
+  if (hostFilters.length > 0) {
+    header.push(`Host filter: ${hostFilters.join(' & ')}`);
+  }
+  header.push('', '## Summary');
+
+  const summaryLines = results.map(({ episode, matchSummary }) => {
+    const host = hostLabel(episode);
+    const terms = joinedTerms(matchSummary);
+    const counts = joinedCounts(matchSummary);
+    const total = matchSummary.totalMatches;
+    const truncatedNote = matchSummary.truncated ? ' (truncated)' : '';
+    const countsNote = counts ? ` (${counts})` : '';
+    return `- ${episodeCode(episode)}: ${episode.title} — ${total} match${total === 1 ? '' : 'es'}${truncatedNote}; terms: ${terms}${countsNote} | Host: ${host} (${episode.date})`;
+  });
+
+  const detailSections = results.map(({ episode, matchSummary, matches }) => {
+    const host = hostLabel(episode);
+    const terms = joinedTerms(matchSummary);
+    const counts = joinedCounts(matchSummary);
+    const overflowNote = matchSummary.truncated
+      ? ' (additional matches omitted after reaching limit)'
+      : '';
+
+    const heading = [
+      `## ${episodeCode(episode)}: ${episode.title}`,
+      `**Host:** ${host} | **Date:** ${episode.date}`,
+      `**Matched terms:** ${terms}`,
+      `**Matches captured:** ${matchSummary.totalMatches}${overflowNote}`,
+    ];
+    if (counts) {
+      heading.push(`**Term counts:** ${counts}`);
+    }
+
+    const excerpts = matches.map((match, index) => {
+      const termInfo = match.terms && match.terms.length > 0
+        ? ` | terms: ${match.terms.join(', ')}`
+        : '';
+      return `### Match ${index + 1} (line ${match.lineNumber}${termInfo})\n\`\`\`\n${match.context}\n\`\`\`\n\n`;
+    });
+
+    return `${heading.join('\n')}\n\n${excerpts.join('')}`;
+  });
+
+  return `${header.join('\n')}\n${summaryLines.join('\n')}\n\n${detailSections.join('')}`;
+}
+
 // List available resources
 server.setRequestHandler(ListResourcesRequestSchema, async () => {
  const stats = dataLoader.getStats();
@@ -258,13 +349,23 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
      },
      {
        name: 'search_transcripts',
-        description: 'Search through episode transcripts for specific keywords or phrases',
+        description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters',
        inputSchema: {
          type: 'object',
          properties: {
            query: {
              type: 'string',
-              description: 'Search query to find in transcripts',
+              description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.',
+            },
+            terms: {
+              type: 'array',
+              items: { type: 'string' },
+              description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".',
+            },
+            matchMode: {
+              type: 'string',
+              enum: ['any', 'all', 'phrase'],
+              description: 'How to interpret the query/terms. "phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.',
            },
            limit: {
              type: 'number',
@@ -274,8 +375,28 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
              type: 'number',
              description: 'Number of lines of context around matches (default: 3)',
            },
+            hostId: {
+              type: 'number',
+              description: 'Restrict matches to a given host ID.',
            },
-          required: ['query'],
+            hostName: {
+              type: 'string',
+              description: 'Restrict matches to hosts whose name contains this value.',
+            },
+            caseSensitive: {
+              type: 'boolean',
+              description: 'Perform a case-sensitive search (default: false).',
+            },
+            wholeWord: {
+              type: 'boolean',
+              description: 'Match whole words only (default: false).',
+            },
+            maxMatchesPerEpisode: {
+              type: 'number',
+              description: 'Maximum number of excerpt matches to include per episode (default: 5).',
+            },
+          },
+          required: [],
        },
      },
      {
@@ -395,50 +516,50 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
    }

    if (name === 'search_transcripts') {
-      const results = dataLoader.searchTranscripts(args.query, {
+      const searchOptions = {
        limit: args.limit || 20,
-        contextLines: args.contextLines || 3,
-      });
+        contextLines: args.contextLines ?? 3,
+        terms: args.terms,
+        matchMode: args.matchMode,
+        hostId: args.hostId,
+        hostName: args.hostName,
+        caseSensitive: args.caseSensitive,
+        wholeWord: args.wholeWord,
+        maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5,
+      };
+
+      const results = dataLoader.searchTranscripts(args.query || '', searchOptions);

      if (results.length === 0) {
+        const descriptorParts = [];
+        if (args.query) descriptorParts.push(`phrase "${args.query}"`);
+        if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`);
+        if (args.hostId || args.hostName) descriptorParts.push('host filter applied');
+        const description = descriptorParts.length > 0 ? descriptorParts.join(', ') : 'the provided criteria';
+
        return {
          content: [
            {
              type: 'text',
-              text: `No transcripts found containing "${args.query}".`,
+              text: `No transcripts found matching ${description}.`,
            },
          ],
        };
      }

-      const text = results.map(result => {
-        const { episode, matches } = result;
-        const host = dataLoader.getHost(episode.hostid);
+      const formatArgs = {
+        ...args,
+        contextLines: searchOptions.contextLines,
+        maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode,
+      };

-        let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
-**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date}
-
-**Matches found:** ${matches.length}
-
-`;
-
-        matches.forEach(match => {
-          episodeText += `### Line ${match.lineNumber}
-\`\`\`
-${match.context}
-\`\`\`
-
-`;
-        });
-
-        return episodeText;
-      }).join('\n---\n\n');
+      const text = formatTranscriptSearchResults(results, formatArgs);

      return {
        content: [
          {
            type: 'text',
-            text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`,
+            text,
          },
        ],
      };
@@ -168,6 +168,97 @@ ${stripHtml(episode.notes)}`;
  return result;
 }

+/**
+ * Render transcript search results as a Markdown report: a header echoing the
+ * search settings, a one-line-per-episode summary, then a section per episode
+ * with its captured excerpts.
+ *
+ * @param {Array<object>} results - Entries carrying `episode`, `matches` and `matchSummary`.
+ * @param {object} args - Tool arguments (query, terms, contextLines, caseSensitive,
+ *   wholeWord, maxMatchesPerEpisode, hostId, hostName) echoed in the header.
+ * @returns {string} The Markdown report, or '' when `results` is empty.
+ */
+function formatTranscriptSearchResults(results, args) {
+  if (results.length === 0) {
+    return '';
+  }
+
+  // Small formatters shared by the summary list and the per-episode sections.
+  const episodeCode = episode => `HPR${String(episode.id).padStart(4, '0')}`;
+  const hostLabel = episode => dataLoader.getHost(episode.hostid)?.host || 'Unknown';
+  const joinedTerms = summary => (
+    summary.matchedTerms.length > 0 ? summary.matchedTerms.join(', ') : 'N/A'
+  );
+  const joinedCounts = summary => {
+    const pairs = Object.entries(summary.termHitCounts || {});
+    return pairs.length > 0
+      ? pairs.map(([term, count]) => `${term}: ${count}`).join(', ')
+      : null;
+  };
+
+  const searchedFor = [];
+  if (args.query) {
+    searchedFor.push(`phrase="${args.query}"`);
+  }
+  if (Array.isArray(args.terms) && args.terms.length > 0) {
+    searchedFor.push(`terms=[${args.terms.join(', ')}]`);
+  }
+  if (searchedFor.length === 0) {
+    searchedFor.push('"no explicit query provided"');
+  }
+
+  const hostFilters = [];
+  if (args.hostId) hostFilters.push(`ID ${args.hostId}`);
+  if (args.hostName) hostFilters.push(`name "${args.hostName}"`);
+
+  // The match mode is identical for every result, so read it from the first one.
+  const { matchMode: modeFromResults } = results[0]?.matchSummary || {};
+  const settings = [
+    `Match mode: ${modeFromResults || 'phrase'}`,
+    `Context lines: ${args.contextLines ?? 3}`,
+    `Case sensitive: ${args.caseSensitive ? 'yes' : 'no'}`,
+    `Whole word: ${args.wholeWord ? 'yes' : 'no'}`,
+  ].join(' | ');
+
+  const header = [
+    `# Transcript Search Results (${results.length} episodes)`,
+    '',
+    `Searching for: ${searchedFor.join(' | ')}`,
+    settings,
+    `Maximum matches per episode: ${args.maxMatchesPerEpisode ?? 5}`,
+  ];
+  if (hostFilters.length > 0) {
+    header.push(`Host filter: ${hostFilters.join(' & ')}`);
+  }
+  header.push('', '## Summary');
+
+  const summaryLines = results.map(({ episode, matchSummary }) => {
+    const host = hostLabel(episode);
+    const terms = joinedTerms(matchSummary);
+    const counts = joinedCounts(matchSummary);
+    const total = matchSummary.totalMatches;
+    const truncatedNote = matchSummary.truncated ? ' (truncated)' : '';
+    const countsNote = counts ? ` (${counts})` : '';
+    return `- ${episodeCode(episode)}: ${episode.title} — ${total} match${total === 1 ? '' : 'es'}${truncatedNote}; terms: ${terms}${countsNote} | Host: ${host} (${episode.date})`;
+  });
+
+  const detailSections = results.map(({ episode, matchSummary, matches }) => {
+    const host = hostLabel(episode);
+    const terms = joinedTerms(matchSummary);
+    const counts = joinedCounts(matchSummary);
+    const overflowNote = matchSummary.truncated
+      ? ' (additional matches omitted after reaching limit)'
+      : '';
+
+    const heading = [
+      `## ${episodeCode(episode)}: ${episode.title}`,
+      `**Host:** ${host} | **Date:** ${episode.date}`,
+      `**Matched terms:** ${terms}`,
+      `**Matches captured:** ${matchSummary.totalMatches}${overflowNote}`,
+    ];
+    if (counts) {
+      heading.push(`**Term counts:** ${counts}`);
+    }
+
+    const excerpts = matches.map((match, index) => {
+      const termInfo = match.terms && match.terms.length > 0
+        ? ` | terms: ${match.terms.join(', ')}`
+        : '';
+      return `### Match ${index + 1} (line ${match.lineNumber}${termInfo})\n\`\`\`\n${match.context}\n\`\`\`\n\n`;
+    });
+
+    return `${heading.join('\n')}\n\n${excerpts.join('')}`;
+  });
+
+  return `${header.join('\n')}\n${summaryLines.join('\n')}\n\n${detailSections.join('')}`;
+}
+
 // Create MCP server factory
 function createMCPServer() {
  const server = new Server(
@@ -370,13 +461,23 @@ All content is contributed by the community, for the community.`,
        },
        {
          name: 'search_transcripts',
-          description: 'Search through episode transcripts for specific keywords or phrases',
+          description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters',
          inputSchema: {
            type: 'object',
            properties: {
              query: {
                type: 'string',
-                description: 'Search query to find in transcripts',
+                description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.',
+              },
+              terms: {
+                type: 'array',
+                items: { type: 'string' },
+                description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".',
+              },
+              matchMode: {
+                type: 'string',
+                enum: ['any', 'all', 'phrase'],
+                description: 'How to interpret the query/terms. "phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.',
              },
              limit: {
                type: 'number',
@@ -386,8 +487,28 @@ All content is contributed by the community, for the community.`,
                type: 'number',
                description: 'Number of lines of context around matches (default: 3)',
              },
+              hostId: {
+                type: 'number',
+                description: 'Restrict matches to a given host ID.',
              },
-            required: ['query'],
+              hostName: {
+                type: 'string',
+                description: 'Restrict matches to hosts whose name contains this value.',
+              },
+              caseSensitive: {
+                type: 'boolean',
+                description: 'Perform a case-sensitive search (default: false).',
+              },
+              wholeWord: {
+                type: 'boolean',
+                description: 'Match whole words only (default: false).',
+              },
+              maxMatchesPerEpisode: {
+                type: 'number',
+                description: 'Maximum number of excerpt matches to include per episode (default: 5).',
+              },
+            },
+            required: [],
          },
        },
        {
@@ -507,50 +628,50 @@ All content is contributed by the community, for the community.`,
      }

      if (name === 'search_transcripts') {
-        const results = dataLoader.searchTranscripts(args.query, {
+        const searchOptions = {
          limit: args.limit || 20,
-          contextLines: args.contextLines || 3,
-        });
+          contextLines: args.contextLines ?? 3,
+          terms: args.terms,
+          matchMode: args.matchMode,
+          hostId: args.hostId,
+          hostName: args.hostName,
+          caseSensitive: args.caseSensitive,
+          wholeWord: args.wholeWord,
+          maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5,
+        };
+
+        const results = dataLoader.searchTranscripts(args.query || '', searchOptions);

        if (results.length === 0) {
+          const descriptorParts = [];
+          if (args.query) descriptorParts.push(`phrase "${args.query}"`);
+          if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`);
+          if (args.hostId || args.hostName) descriptorParts.push('host filter applied');
+          const description = descriptorParts.length > 0 ? descriptorParts.join(', ') : 'the provided criteria';
+
          return {
            content: [
              {
                type: 'text',
-                text: `No transcripts found containing "${args.query}".`,
+                text: `No transcripts found matching ${description}.`,
              },
            ],
          };
        }

-        const text = results.map(result => {
-          const { episode, matches } = result;
-          const host = dataLoader.getHost(episode.hostid);
+        const formatArgs = {
+          ...args,
+          contextLines: searchOptions.contextLines,
+          maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode,
+        };

-          let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
-**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date}
-
-**Matches found:** ${matches.length}
-
-`;
-
-          matches.forEach(match => {
-            episodeText += `### Line ${match.lineNumber}
-\`\`\`
-${match.context}
-\`\`\`
-
-`;
-          });
-
-          return episodeText;
-        }).join('\n---\n\n');
+        const text = formatTranscriptSearchResults(results, formatArgs);

        return {
          content: [
            {
              type: 'text',
-              text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`,
+              text,
            },
          ],
        };