From 98177f3fd5d6c9b184dd33396ae6897969fab561 Mon Sep 17 00:00:00 2001 From: Lee Hanken Date: Sun, 26 Oct 2025 14:46:24 +0000 Subject: [PATCH] Enhance transcript search tooling with flexible matching --- README.md | 25 +++++-- data-loader.js | 148 ++++++++++++++++++++++++++++++++++------ index.js | 179 +++++++++++++++++++++++++++++++++++++++++-------- server-http.js | 179 +++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 448 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 3df8273..ff10d10 100644 --- a/README.md +++ b/README.md @@ -138,16 +138,29 @@ Get details for episode 16 including transcript and comments ### 3. `search_transcripts` -Search through episode transcripts for specific keywords. +Search through episode transcripts for phrases or multiple terms with flexible matching. **Parameters:** -- `query` (string, required): Search query -- `limit` (number, optional): Maximum episodes to return (default: 20) -- `contextLines` (number, optional): Lines of context around matches (default: 3) +- `query` (string, optional): Phrase to search for. Useful for exact-phrase lookups. +- `terms` (string[], optional): Explicit list of terms to search for; combine with `matchMode` for logical AND/OR searches. +- `matchMode` (`'phrase' | 'any' | 'all'`, optional): How to combine `query`/`terms`. When omitted, it resolves to `'phrase'` for a single query or term and to `'any'` when `terms` contains more than one entry. Use `'any'` to match if any term is present, `'all'` to require every term somewhere in the transcript. +- `limit` (number, optional): Maximum episodes to return (default: 20). +- `contextLines` (number, optional): Lines of context to include around each match (default: 3). +- `hostId` (number, optional): Only return matches for this host ID. +- `hostName` (string, optional): Only return matches for hosts whose name includes this value. +- `caseSensitive` (boolean, optional): Treat terms as case-sensitive (default: false). +- `wholeWord` (boolean, optional): Match whole words only (default: false). 
+- `maxMatchesPerEpisode` (number, optional): Maximum number of excerpts per episode (default: 5). -**Example:** +**Example queries:** ``` -Search transcripts for mentions of "virtual machine" +Find transcripts mentioning "virtual machine" +``` +``` +Find transcripts where klaatu talks about bash or python +``` +``` +List episodes where host ID 123 mentions "encryption" and "privacy" (require all terms) ``` ### 4. `get_host_info` diff --git a/data-loader.js b/data-loader.js index 3875954..9368381 100644 --- a/data-loader.js +++ b/data-loader.js @@ -5,6 +5,10 @@ import { fileURLToPath } from 'url'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); +function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + class HPRDataLoader { constructor() { this.episodes = []; @@ -178,41 +182,147 @@ class HPRDataLoader { * Search transcripts by keyword */ searchTranscripts(query, options = {}) { - const { limit = 20, contextLines = 3 } = options; - const queryLower = query.toLowerCase(); + const { + limit = 20, + contextLines = 3, + terms = [], + matchMode = 'auto', + hostId = null, + hostName = null, + caseSensitive = false, + wholeWord = false, + maxMatchesPerEpisode = 5, + } = options; + + const resolvedHostIds = new Set(); + if (hostId) { + resolvedHostIds.add(Number(hostId)); + } + if (hostName) { + const hostMatches = this.searchHosts(hostName); + hostMatches.forEach(host => resolvedHostIds.add(host.hostid)); + } + const filterByHost = resolvedHostIds.size > 0; + + const explicitTerms = Array.isArray(terms) + ? terms.map(t => (t ?? '').toString().trim()).filter(Boolean) + : []; + + const splitQueryTerms = (matchMode === 'any' || matchMode === 'all') + ? (query || '') + .split(/[|,;\n]/) + .map(part => part.trim()) + .filter(Boolean) + : []; + + const hasQuery = typeof query === 'string' && query.trim().length > 0; + + let searchTerms = explicitTerms.length > 0 ? 
explicitTerms : splitQueryTerms; + if (searchTerms.length === 0 && hasQuery) { + searchTerms = [query.trim()]; + } + + let resolvedMatchMode = matchMode; + if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) { + resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase'; + } + + const effectiveTerms = resolvedMatchMode === 'phrase' + ? [(hasQuery ? query.trim() : searchTerms[0] || '')].filter(Boolean) + : searchTerms; + + if (effectiveTerms.length === 0) { + return []; + } + + const regexFlags = caseSensitive ? 'g' : 'gi'; + const matchers = effectiveTerms.map(term => { + if (!term) return null; + const escaped = escapeRegExp(term); + const pattern = wholeWord ? `\\b${escaped}\\b` : escaped; + try { + return { + term, + regex: new RegExp(pattern, regexFlags), + }; + } catch (error) { + console.error(`Invalid search pattern for term "${term}":`, error.message); + return null; + } + }).filter(Boolean); + + if (matchers.length === 0) { + return []; + } + const results = []; for (const [episodeId, transcript] of this.transcripts) { - const lines = transcript.split('\n'); - const matches = []; + if (results.length >= limit) break; - // Find all matching lines - lines.forEach((line, index) => { - if (line.toLowerCase().includes(queryLower)) { - // Get context around the match + const episode = this.getEpisode(episodeId); + if (!episode) continue; + + if (filterByHost && !resolvedHostIds.has(episode.hostid)) { + continue; + } + + const lines = transcript.split(/\r?\n/); + const matches = []; + const matchedTerms = new Set(); + const termHitCounts = new Map(); + let truncated = false; + + for (let index = 0; index < lines.length; index++) { + const line = lines[index]; + const matchedOnLine = []; + + for (const matcher of matchers) { + matcher.regex.lastIndex = 0; + if (matcher.regex.test(line)) { + matchedOnLine.push(matcher.term); + matchedTerms.add(matcher.term); + termHitCounts.set(matcher.term, (termHitCounts.get(matcher.term) || 0) + 1); + } + } + + if 
(matchedOnLine.length > 0) { const start = Math.max(0, index - contextLines); const end = Math.min(lines.length, index + contextLines + 1); const context = lines.slice(start, end).join('\n'); matches.push({ lineNumber: index + 1, - line: line.trim(), - context: context + terms: [...new Set(matchedOnLine)], + context, }); } - }); - if (matches.length > 0) { - const episode = this.getEpisode(episodeId); - if (episode) { - results.push({ - episode, - matches: matches.slice(0, 5) // Limit matches per episode - }); + if (matches.length >= maxMatchesPerEpisode) { + truncated = true; + break; } } - if (results.length >= limit) break; + if (matches.length === 0) { + continue; + } + + if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) { + continue; + } + + results.push({ + episode, + matches, + matchSummary: { + matchMode: resolvedMatchMode, + matchedTerms: [...matchedTerms], + totalMatches: matches.length, + termHitCounts: Object.fromEntries(termHitCounts), + truncated, + }, + }); } return results; diff --git a/index.js b/index.js index 645b6a7..482d474 100755 --- a/index.js +++ b/index.js @@ -69,6 +69,97 @@ ${episode.summary}`; return result; } +function formatTranscriptSearchResults(results, args) { + if (results.length === 0) { + return ''; + } + + const descriptorParts = []; + if (args.query) { + descriptorParts.push(`phrase="${args.query}"`); + } + if (Array.isArray(args.terms) && args.terms.length > 0) { + descriptorParts.push(`terms=[${args.terms.join(', ')}]`); + } + if (descriptorParts.length === 0) { + descriptorParts.push('"no explicit query provided"'); + } + + const firstSummary = results[0]?.matchSummary || {}; + const matchMode = firstSummary.matchMode || 'phrase'; + const contextLines = args.contextLines ?? 3; + const caseSensitive = args.caseSensitive ? 'yes' : 'no'; + const wholeWord = args.wholeWord ? 'yes' : 'no'; + const maxMatches = args.maxMatchesPerEpisode ?? 
5; + const hostFilters = []; + if (args.hostId) hostFilters.push(`ID ${args.hostId}`); + if (args.hostName) hostFilters.push(`name "${args.hostName}"`); + + let text = `# Transcript Search Results (${results.length} episodes)\n\n`; + text += `Searching for: ${descriptorParts.join(' | ')}\n`; + text += `Match mode: ${matchMode} | Context lines: ${contextLines} | Case sensitive: ${caseSensitive} | Whole word: ${wholeWord}\n`; + text += `Maximum matches per episode: ${maxMatches}\n`; + if (hostFilters.length > 0) { + text += `Host filter: ${hostFilters.join(' & ')}\n`; + } + text += '\n## Summary\n'; + + text += results.map(result => { + const host = dataLoader.getHost(result.episode.hostid); + const matchedTerms = result.matchSummary.matchedTerms.length > 0 + ? result.matchSummary.matchedTerms.join(', ') + : 'N/A'; + const termCounts = Object.entries(result.matchSummary.termHitCounts || {}); + const termCountText = termCounts.length > 0 + ? termCounts.map(([term, count]) => `${term}: ${count}`).join(', ') + : null; + const truncatedNote = result.matchSummary.truncated ? ' (truncated)' : ''; + let line = `- HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title} — ${result.matchSummary.totalMatches} match${result.matchSummary.totalMatches === 1 ? '' : 'es'}${truncatedNote}; terms: ${matchedTerms}`; + if (termCountText) { + line += ` (${termCountText})`; + } + line += ` | Host: ${host?.host || 'Unknown'} (${result.episode.date})`; + return line; + }).join('\n'); + + text += '\n\n'; + + results.forEach(result => { + const host = dataLoader.getHost(result.episode.hostid); + const matchedTerms = result.matchSummary.matchedTerms.length > 0 + ? result.matchSummary.matchedTerms.join(', ') + : 'N/A'; + const termCounts = Object.entries(result.matchSummary.termHitCounts || {}); + const termCountText = termCounts.length > 0 + ? 
termCounts.map(([term, count]) => `${term}: ${count}`).join(', ') + : null; + + text += `## HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title} +**Host:** ${host?.host || 'Unknown'} | **Date:** ${result.episode.date} +**Matched terms:** ${matchedTerms} +**Matches captured:** ${result.matchSummary.totalMatches}${result.matchSummary.truncated ? ' (additional matches omitted after reaching limit)' : ''} +`; + if (termCountText) { + text += `**Term counts:** ${termCountText}\n`; + } + text += '\n'; + + result.matches.forEach((match, index) => { + const termInfo = match.terms && match.terms.length > 0 + ? ` | terms: ${match.terms.join(', ')}` + : ''; + text += `### Match ${index + 1} (line ${match.lineNumber}${termInfo}) +\`\`\` +${match.context} +\`\`\` + +`; + }); + }); + + return text; +} + // List available resources server.setRequestHandler(ListResourcesRequestSchema, async () => { const stats = dataLoader.getStats(); @@ -258,13 +349,23 @@ server.setRequestHandler(ListToolsRequestSchema, async () => { }, { name: 'search_transcripts', - description: 'Search through episode transcripts for specific keywords or phrases', + description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters', inputSchema: { type: 'object', properties: { query: { type: 'string', - description: 'Search query to find in transcripts', + description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.', + }, + terms: { + type: 'array', + items: { type: 'string' }, + description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".', + }, + matchMode: { + type: 'string', + enum: ['any', 'all', 'phrase'], + description: 'How to interpret the query/terms. 
"phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.', }, limit: { type: 'number', @@ -274,8 +375,28 @@ server.setRequestHandler(ListToolsRequestSchema, async () => { type: 'number', description: 'Number of lines of context around matches (default: 3)', }, + hostId: { + type: 'number', + description: 'Restrict matches to a given host ID.', + }, + hostName: { + type: 'string', + description: 'Restrict matches to hosts whose name contains this value.', + }, + caseSensitive: { + type: 'boolean', + description: 'Perform a case-sensitive search (default: false).', + }, + wholeWord: { + type: 'boolean', + description: 'Match whole words only (default: false).', + }, + maxMatchesPerEpisode: { + type: 'number', + description: 'Maximum number of excerpt matches to include per episode (default: 5).', + }, }, - required: ['query'], + required: [], }, }, { @@ -395,50 +516,50 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { } if (name === 'search_transcripts') { - const results = dataLoader.searchTranscripts(args.query, { + const searchOptions = { limit: args.limit || 20, - contextLines: args.contextLines || 3, - }); + contextLines: args.contextLines ?? 3, + terms: args.terms, + matchMode: args.matchMode, + hostId: args.hostId, + hostName: args.hostName, + caseSensitive: args.caseSensitive, + wholeWord: args.wholeWord, + maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5, + }; + + const results = dataLoader.searchTranscripts(args.query || '', searchOptions); if (results.length === 0) { + const descriptorParts = []; + if (args.query) descriptorParts.push(`phrase "${args.query}"`); + if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`); + if (args.hostId || args.hostName) descriptorParts.push('host filter applied'); + const description = descriptorParts.length > 0 ? 
descriptorParts.join(', ') : 'the provided criteria'; + return { content: [ { type: 'text', - text: `No transcripts found containing "${args.query}".`, + text: `No transcripts found matching ${description}.`, }, ], }; } - const text = results.map(result => { - const { episode, matches } = result; - const host = dataLoader.getHost(episode.hostid); + const formatArgs = { + ...args, + contextLines: searchOptions.contextLines, + maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode, + }; - let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} -**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date} - -**Matches found:** ${matches.length} - -`; - - matches.forEach(match => { - episodeText += `### Line ${match.lineNumber} -\`\`\` -${match.context} -\`\`\` - -`; - }); - - return episodeText; - }).join('\n---\n\n'); + const text = formatTranscriptSearchResults(results, formatArgs); return { content: [ { type: 'text', - text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`, + text, }, ], }; diff --git a/server-http.js b/server-http.js index 5214441..decaf0d 100644 --- a/server-http.js +++ b/server-http.js @@ -168,6 +168,97 @@ ${stripHtml(episode.notes)}`; return result; } +function formatTranscriptSearchResults(results, args) { + if (results.length === 0) { + return ''; + } + + const descriptorParts = []; + if (args.query) { + descriptorParts.push(`phrase="${args.query}"`); + } + if (Array.isArray(args.terms) && args.terms.length > 0) { + descriptorParts.push(`terms=[${args.terms.join(', ')}]`); + } + if (descriptorParts.length === 0) { + descriptorParts.push('"no explicit query provided"'); + } + + const firstSummary = results[0]?.matchSummary || {}; + const matchMode = firstSummary.matchMode || 'phrase'; + const contextLines = args.contextLines ?? 3; + const caseSensitive = args.caseSensitive ? 'yes' : 'no'; + const wholeWord = args.wholeWord ? 
'yes' : 'no'; + const maxMatches = args.maxMatchesPerEpisode ?? 5; + const hostFilters = []; + if (args.hostId) hostFilters.push(`ID ${args.hostId}`); + if (args.hostName) hostFilters.push(`name "${args.hostName}"`); + + let text = `# Transcript Search Results (${results.length} episodes)\n\n`; + text += `Searching for: ${descriptorParts.join(' | ')}\n`; + text += `Match mode: ${matchMode} | Context lines: ${contextLines} | Case sensitive: ${caseSensitive} | Whole word: ${wholeWord}\n`; + text += `Maximum matches per episode: ${maxMatches}\n`; + if (hostFilters.length > 0) { + text += `Host filter: ${hostFilters.join(' & ')}\n`; + } + text += '\n## Summary\n'; + + text += results.map(result => { + const host = dataLoader.getHost(result.episode.hostid); + const matchedTerms = result.matchSummary.matchedTerms.length > 0 + ? result.matchSummary.matchedTerms.join(', ') + : 'N/A'; + const termCounts = Object.entries(result.matchSummary.termHitCounts || {}); + const termCountText = termCounts.length > 0 + ? termCounts.map(([term, count]) => `${term}: ${count}`).join(', ') + : null; + const truncatedNote = result.matchSummary.truncated ? ' (truncated)' : ''; + let line = `- HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title} — ${result.matchSummary.totalMatches} match${result.matchSummary.totalMatches === 1 ? '' : 'es'}${truncatedNote}; terms: ${matchedTerms}`; + if (termCountText) { + line += ` (${termCountText})`; + } + line += ` | Host: ${host?.host || 'Unknown'} (${result.episode.date})`; + return line; + }).join('\n'); + + text += '\n\n'; + + results.forEach(result => { + const host = dataLoader.getHost(result.episode.hostid); + const matchedTerms = result.matchSummary.matchedTerms.length > 0 + ? result.matchSummary.matchedTerms.join(', ') + : 'N/A'; + const termCounts = Object.entries(result.matchSummary.termHitCounts || {}); + const termCountText = termCounts.length > 0 + ? 
termCounts.map(([term, count]) => `${term}: ${count}`).join(', ') + : null; + + text += `## HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title} +**Host:** ${host?.host || 'Unknown'} | **Date:** ${result.episode.date} +**Matched terms:** ${matchedTerms} +**Matches captured:** ${result.matchSummary.totalMatches}${result.matchSummary.truncated ? ' (additional matches omitted after reaching limit)' : ''} +`; + if (termCountText) { + text += `**Term counts:** ${termCountText}\n`; + } + text += '\n'; + + result.matches.forEach((match, index) => { + const termInfo = match.terms && match.terms.length > 0 + ? ` | terms: ${match.terms.join(', ')}` + : ''; + text += `### Match ${index + 1} (line ${match.lineNumber}${termInfo}) +\`\`\` +${match.context} +\`\`\` + +`; + }); + }); + + return text; +} + // Create MCP server factory function createMCPServer() { const server = new Server( @@ -370,13 +461,23 @@ All content is contributed by the community, for the community.`, }, { name: 'search_transcripts', - description: 'Search through episode transcripts for specific keywords or phrases', + description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters', inputSchema: { type: 'object', properties: { query: { type: 'string', - description: 'Search query to find in transcripts', + description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.', + }, + terms: { + type: 'array', + items: { type: 'string' }, + description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".', + }, + matchMode: { + type: 'string', + enum: ['any', 'all', 'phrase'], + description: 'How to interpret the query/terms. 
"phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.', }, limit: { type: 'number', @@ -386,8 +487,28 @@ All content is contributed by the community, for the community.`, type: 'number', description: 'Number of lines of context around matches (default: 3)', }, + hostId: { + type: 'number', + description: 'Restrict matches to a given host ID.', + }, + hostName: { + type: 'string', + description: 'Restrict matches to hosts whose name contains this value.', + }, + caseSensitive: { + type: 'boolean', + description: 'Perform a case-sensitive search (default: false).', + }, + wholeWord: { + type: 'boolean', + description: 'Match whole words only (default: false).', + }, + maxMatchesPerEpisode: { + type: 'number', + description: 'Maximum number of excerpt matches to include per episode (default: 5).', + }, }, - required: ['query'], + required: [], }, }, { @@ -507,50 +628,50 @@ All content is contributed by the community, for the community.`, } if (name === 'search_transcripts') { - const results = dataLoader.searchTranscripts(args.query, { + const searchOptions = { limit: args.limit || 20, - contextLines: args.contextLines || 3, - }); + contextLines: args.contextLines ?? 3, + terms: args.terms, + matchMode: args.matchMode, + hostId: args.hostId, + hostName: args.hostName, + caseSensitive: args.caseSensitive, + wholeWord: args.wholeWord, + maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5, + }; + + const results = dataLoader.searchTranscripts(args.query || '', searchOptions); if (results.length === 0) { + const descriptorParts = []; + if (args.query) descriptorParts.push(`phrase "${args.query}"`); + if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`); + if (args.hostId || args.hostName) descriptorParts.push('host filter applied'); + const description = descriptorParts.length > 0 ? 
descriptorParts.join(', ') : 'the provided criteria'; + return { content: [ { type: 'text', - text: `No transcripts found containing "${args.query}".`, + text: `No transcripts found matching ${description}.`, }, ], }; } - const text = results.map(result => { - const { episode, matches } = result; - const host = dataLoader.getHost(episode.hostid); + const formatArgs = { + ...args, + contextLines: searchOptions.contextLines, + maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode, + }; - let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} -**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date} - -**Matches found:** ${matches.length} - -`; - - matches.forEach(match => { - episodeText += `### Line ${match.lineNumber} -\`\`\` -${match.context} -\`\`\` - -`; - }); - - return episodeText; - }).join('\n---\n\n'); + const text = formatTranscriptSearchResults(results, formatArgs); return { content: [ { type: 'text', - text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`, + text, }, ], };