Enhance transcript search tooling with flexible matching
This commit is contained in:
25
README.md
25
README.md
@@ -138,16 +138,29 @@ Get details for episode 16 including transcript and comments
|
|||||||
|
|
||||||
### 3. `search_transcripts`
|
### 3. `search_transcripts`
|
||||||
|
|
||||||
Search through episode transcripts for specific keywords.
|
Search through episode transcripts for phrases or multiple terms with flexible matching.
|
||||||
|
|
||||||
**Parameters:**
|
**Parameters:**
|
||||||
- `query` (string, required): Search query
|
- `query` (string, optional): Phrase to search for. Useful for exact-phrase lookups. At least one of `query` or `terms` must be provided; otherwise the search returns no results.
|
||||||
- `limit` (number, optional): Maximum episodes to return (default: 20)
|
- `terms` (string[], optional): Explicit list of terms to search for; combine with `matchMode` for logical AND/OR searches.
|
||||||
- `contextLines` (number, optional): Lines of context around matches (default: 3)
|
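- `matchMode` (`'phrase' | 'any' | 'all'`, optional): How to combine `query`/`terms`. When omitted, it resolves to `'phrase'`, or to `'any'` if more than one entry is supplied in `terms`. Use `'any'` to match if any term is present, `'all'` to require every term somewhere in the transcript. With `'any'`/`'all'` and no `terms`, `query` is split into terms on `|`, `,`, `;` or newlines.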
|
||||||
|
- `limit` (number, optional): Maximum episodes to return (default: 20).
|
||||||
|
- `contextLines` (number, optional): Lines of context to include around each match (default: 3).
|
||||||
|
- `hostId` (number, optional): Only return matches for this host ID.
|
||||||
|
- `hostName` (string, optional): Only return matches for hosts whose name includes this value.
|
||||||
|
- `caseSensitive` (boolean, optional): Treat terms as case-sensitive (default: false).
|
||||||
|
- `wholeWord` (boolean, optional): Match whole words only (default: false).
|
||||||
|
- `maxMatchesPerEpisode` (number, optional): Maximum number of excerpts per episode (default: 5).
|
||||||
|
|
||||||
**Example:**
|
**Example queries:**
|
||||||
```
|
```
|
||||||
Search transcripts for mentions of "virtual machine"
|
Find transcripts mentioning "virtual machine"
|
||||||
|
```
|
||||||
|
```
|
||||||
|
Find transcripts where klaatu talks about bash or python
|
||||||
|
```
|
||||||
|
```
|
||||||
|
List episodes where host ID 123 mentions "encryption" and "privacy" (require all terms)
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. `get_host_info`
|
### 4. `get_host_info`
|
||||||
|
|||||||
148
data-loader.js
148
data-loader.js
@@ -5,6 +5,10 @@ import { fileURLToPath } from 'url';
|
|||||||
const __filename = fileURLToPath(import.meta.url);
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
const __dirname = dirname(__filename);
|
const __dirname = dirname(__filename);
|
||||||
|
|
||||||
|
/**
 * Escape every regular-expression metacharacter in a string so the result
 * can be embedded in a RegExp source and match the input literally.
 *
 * @param {string} string - Raw text to be matched literally.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegExp(string) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  // Prefix each metacharacter with a single backslash.
  return string.replace(metacharacters, (character) => `\\${character}`);
}
|
||||||
|
|
||||||
class HPRDataLoader {
|
class HPRDataLoader {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.episodes = [];
|
this.episodes = [];
|
||||||
@@ -178,41 +182,147 @@ class HPRDataLoader {
|
|||||||
* Search transcripts by keyword
|
* Search transcripts by keyword
|
||||||
*/
|
*/
|
||||||
searchTranscripts(query, options = {}) {
|
searchTranscripts(query, options = {}) {
|
||||||
const { limit = 20, contextLines = 3 } = options;
|
const {
|
||||||
const queryLower = query.toLowerCase();
|
limit = 20,
|
||||||
|
contextLines = 3,
|
||||||
|
terms = [],
|
||||||
|
matchMode = 'auto',
|
||||||
|
hostId = null,
|
||||||
|
hostName = null,
|
||||||
|
caseSensitive = false,
|
||||||
|
wholeWord = false,
|
||||||
|
maxMatchesPerEpisode = 5,
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
const resolvedHostIds = new Set();
|
||||||
|
if (hostId) {
|
||||||
|
resolvedHostIds.add(Number(hostId));
|
||||||
|
}
|
||||||
|
if (hostName) {
|
||||||
|
const hostMatches = this.searchHosts(hostName);
|
||||||
|
hostMatches.forEach(host => resolvedHostIds.add(host.hostid));
|
||||||
|
}
|
||||||
|
const filterByHost = resolvedHostIds.size > 0;
|
||||||
|
|
||||||
|
const explicitTerms = Array.isArray(terms)
|
||||||
|
? terms.map(t => (t ?? '').toString().trim()).filter(Boolean)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
const splitQueryTerms = (matchMode === 'any' || matchMode === 'all')
|
||||||
|
? (query || '')
|
||||||
|
.split(/[|,;\n]/)
|
||||||
|
.map(part => part.trim())
|
||||||
|
.filter(Boolean)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
const hasQuery = typeof query === 'string' && query.trim().length > 0;
|
||||||
|
|
||||||
|
let searchTerms = explicitTerms.length > 0 ? explicitTerms : splitQueryTerms;
|
||||||
|
if (searchTerms.length === 0 && hasQuery) {
|
||||||
|
searchTerms = [query.trim()];
|
||||||
|
}
|
||||||
|
|
||||||
|
let resolvedMatchMode = matchMode;
|
||||||
|
if (!['any', 'all', 'phrase'].includes(resolvedMatchMode)) {
|
||||||
|
resolvedMatchMode = searchTerms.length > 1 ? 'any' : 'phrase';
|
||||||
|
}
|
||||||
|
|
||||||
|
const effectiveTerms = resolvedMatchMode === 'phrase'
|
||||||
|
? [(hasQuery ? query.trim() : searchTerms[0] || '')].filter(Boolean)
|
||||||
|
: searchTerms;
|
||||||
|
|
||||||
|
if (effectiveTerms.length === 0) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const regexFlags = caseSensitive ? 'g' : 'gi';
|
||||||
|
const matchers = effectiveTerms.map(term => {
|
||||||
|
if (!term) return null;
|
||||||
|
const escaped = escapeRegExp(term);
|
||||||
|
const pattern = wholeWord ? `\\b${escaped}\\b` : escaped;
|
||||||
|
try {
|
||||||
|
return {
|
||||||
|
term,
|
||||||
|
regex: new RegExp(pattern, regexFlags),
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Invalid search pattern for term "${term}":`, error.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}).filter(Boolean);
|
||||||
|
|
||||||
|
if (matchers.length === 0) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
|
|
||||||
for (const [episodeId, transcript] of this.transcripts) {
|
for (const [episodeId, transcript] of this.transcripts) {
|
||||||
const lines = transcript.split('\n');
|
if (results.length >= limit) break;
|
||||||
const matches = [];
|
|
||||||
|
|
||||||
// Find all matching lines
|
const episode = this.getEpisode(episodeId);
|
||||||
lines.forEach((line, index) => {
|
if (!episode) continue;
|
||||||
if (line.toLowerCase().includes(queryLower)) {
|
|
||||||
// Get context around the match
|
if (filterByHost && !resolvedHostIds.has(episode.hostid)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const lines = transcript.split(/\r?\n/);
|
||||||
|
const matches = [];
|
||||||
|
const matchedTerms = new Set();
|
||||||
|
const termHitCounts = new Map();
|
||||||
|
let truncated = false;
|
||||||
|
|
||||||
|
for (let index = 0; index < lines.length; index++) {
|
||||||
|
const line = lines[index];
|
||||||
|
const matchedOnLine = [];
|
||||||
|
|
||||||
|
for (const matcher of matchers) {
|
||||||
|
matcher.regex.lastIndex = 0;
|
||||||
|
if (matcher.regex.test(line)) {
|
||||||
|
matchedOnLine.push(matcher.term);
|
||||||
|
matchedTerms.add(matcher.term);
|
||||||
|
termHitCounts.set(matcher.term, (termHitCounts.get(matcher.term) || 0) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matchedOnLine.length > 0) {
|
||||||
const start = Math.max(0, index - contextLines);
|
const start = Math.max(0, index - contextLines);
|
||||||
const end = Math.min(lines.length, index + contextLines + 1);
|
const end = Math.min(lines.length, index + contextLines + 1);
|
||||||
const context = lines.slice(start, end).join('\n');
|
const context = lines.slice(start, end).join('\n');
|
||||||
|
|
||||||
matches.push({
|
matches.push({
|
||||||
lineNumber: index + 1,
|
lineNumber: index + 1,
|
||||||
line: line.trim(),
|
terms: [...new Set(matchedOnLine)],
|
||||||
context: context
|
context,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
|
||||||
|
|
||||||
if (matches.length > 0) {
|
if (matches.length >= maxMatchesPerEpisode) {
|
||||||
const episode = this.getEpisode(episodeId);
|
truncated = true;
|
||||||
if (episode) {
|
break;
|
||||||
results.push({
|
|
||||||
episode,
|
|
||||||
matches: matches.slice(0, 5) // Limit matches per episode
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (results.length >= limit) break;
|
if (matches.length === 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (resolvedMatchMode === 'all' && matchedTerms.size < matchers.length) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
episode,
|
||||||
|
matches,
|
||||||
|
matchSummary: {
|
||||||
|
matchMode: resolvedMatchMode,
|
||||||
|
matchedTerms: [...matchedTerms],
|
||||||
|
totalMatches: matches.length,
|
||||||
|
termHitCounts: Object.fromEntries(termHitCounts),
|
||||||
|
truncated,
|
||||||
|
},
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
|
|||||||
179
index.js
179
index.js
@@ -69,6 +69,97 @@ ${episode.summary}`;
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Render transcript search results as a Markdown report.
 *
 * The report has a header describing the search settings, a one-line-per-episode
 * summary list, and then one section per episode containing each captured
 * excerpt inside a fenced code block.
 *
 * @param {Array<object>} results - Search results; each entry carries `episode`,
 *   `matches` (with `lineNumber`, `terms`, `context`) and `matchSummary`.
 * @param {object} [args] - Tool arguments used for the search (`query`, `terms`,
 *   `contextLines`, `caseSensitive`, `wholeWord`, `maxMatchesPerEpisode`,
 *   `hostId`, `hostName`); only used to describe the search in the header.
 * @returns {string} Markdown text, or an empty string when there are no results.
 */
function formatTranscriptSearchResults(results, args = {}) {
  if (!Array.isArray(results) || results.length === 0) {
    return '';
  }
  const options = args ?? {};

  // Transcript excerpts are arbitrary text: if one contains a run of backticks,
  // a fixed ``` fence would be closed early and corrupt the rest of the report.
  // Use a fence one backtick longer than the longest run (minimum of three).
  const fenceFor = (content) => {
    const backtickRuns = String(content).match(/`+/g) ?? [];
    const longestRun = backtickRuns.reduce((max, run) => Math.max(max, run.length), 0);
    return '`'.repeat(Math.max(3, longestRun + 1));
  };

  // Derive the display fields for one result once, so the summary list and the
  // detail section cannot drift apart and the host is looked up a single time.
  const describeResult = (result) => {
    const summary = result.matchSummary ?? {};
    const matches = result.matches ?? [];
    const matchedTerms = summary.matchedTerms ?? [];
    const termCounts = Object.entries(summary.termHitCounts ?? {});
    const host = dataLoader.getHost(result.episode.hostid);
    return {
      matches,
      date: result.episode.date,
      label: `HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title}`,
      hostName: host?.host || 'Unknown',
      matchedTerms: matchedTerms.length > 0 ? matchedTerms.join(', ') : 'N/A',
      termCountText: termCounts.length > 0
        ? termCounts.map(([term, count]) => `${term}: ${count}`).join(', ')
        : null,
      totalMatches: summary.totalMatches ?? matches.length,
      truncated: Boolean(summary.truncated),
    };
  };

  const descriptorParts = [];
  if (options.query) {
    descriptorParts.push(`phrase="${options.query}"`);
  }
  if (Array.isArray(options.terms) && options.terms.length > 0) {
    descriptorParts.push(`terms=[${options.terms.join(', ')}]`);
  }
  if (descriptorParts.length === 0) {
    descriptorParts.push('"no explicit query provided"');
  }

  // The match mode is taken from the first result's summary.
  const matchMode = results[0]?.matchSummary?.matchMode || 'phrase';
  const contextLines = options.contextLines ?? 3;
  const caseSensitive = options.caseSensitive ? 'yes' : 'no';
  const wholeWord = options.wholeWord ? 'yes' : 'no';
  const maxMatches = options.maxMatchesPerEpisode ?? 5;

  const hostFilters = [];
  if (options.hostId) hostFilters.push(`ID ${options.hostId}`);
  if (options.hostName) hostFilters.push(`name "${options.hostName}"`);

  let text = `# Transcript Search Results (${results.length} episodes)\n\n`;
  text += `Searching for: ${descriptorParts.join(' | ')}\n`;
  text += `Match mode: ${matchMode} | Context lines: ${contextLines} | Case sensitive: ${caseSensitive} | Whole word: ${wholeWord}\n`;
  text += `Maximum matches per episode: ${maxMatches}\n`;
  if (hostFilters.length > 0) {
    text += `Host filter: ${hostFilters.join(' & ')}\n`;
  }
  text += '\n## Summary\n';

  const entries = results.map(describeResult);

  text += entries.map((entry) => {
    const plural = entry.totalMatches === 1 ? '' : 'es';
    const truncatedNote = entry.truncated ? ' (truncated)' : '';
    const counts = entry.termCountText ? ` (${entry.termCountText})` : '';
    return `- ${entry.label} — ${entry.totalMatches} match${plural}${truncatedNote}; terms: ${entry.matchedTerms}${counts} | Host: ${entry.hostName} (${entry.date})`;
  }).join('\n');

  text += '\n\n';

  for (const entry of entries) {
    const omittedNote = entry.truncated ? ' (additional matches omitted after reaching limit)' : '';
    text += `## ${entry.label}\n`;
    text += `**Host:** ${entry.hostName} | **Date:** ${entry.date}\n`;
    text += `**Matched terms:** ${entry.matchedTerms}\n`;
    text += `**Matches captured:** ${entry.totalMatches}${omittedNote}\n`;
    if (entry.termCountText) {
      text += `**Term counts:** ${entry.termCountText}\n`;
    }
    text += '\n';

    entry.matches.forEach((match, index) => {
      const termInfo = match.terms && match.terms.length > 0
        ? ` | terms: ${match.terms.join(', ')}`
        : '';
      const fence = fenceFor(match.context);
      text += `### Match ${index + 1} (line ${match.lineNumber}${termInfo})\n`;
      text += `${fence}\n${match.context}\n${fence}\n\n`;
    });
  }

  return text;
}
|
||||||
|
|
||||||
// List available resources
|
// List available resources
|
||||||
server.setRequestHandler(ListResourcesRequestSchema, async () => {
|
server.setRequestHandler(ListResourcesRequestSchema, async () => {
|
||||||
const stats = dataLoader.getStats();
|
const stats = dataLoader.getStats();
|
||||||
@@ -258,13 +349,23 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'search_transcripts',
|
name: 'search_transcripts',
|
||||||
description: 'Search through episode transcripts for specific keywords or phrases',
|
description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters',
|
||||||
inputSchema: {
|
inputSchema: {
|
||||||
type: 'object',
|
type: 'object',
|
||||||
properties: {
|
properties: {
|
||||||
query: {
|
query: {
|
||||||
type: 'string',
|
type: 'string',
|
||||||
description: 'Search query to find in transcripts',
|
description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.',
|
||||||
|
},
|
||||||
|
terms: {
|
||||||
|
type: 'array',
|
||||||
|
items: { type: 'string' },
|
||||||
|
description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".',
|
||||||
|
},
|
||||||
|
matchMode: {
|
||||||
|
type: 'string',
|
||||||
|
enum: ['any', 'all', 'phrase'],
|
||||||
|
description: 'How to interpret the query/terms. "phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.',
|
||||||
},
|
},
|
||||||
limit: {
|
limit: {
|
||||||
type: 'number',
|
type: 'number',
|
||||||
@@ -274,8 +375,28 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|||||||
type: 'number',
|
type: 'number',
|
||||||
description: 'Number of lines of context around matches (default: 3)',
|
description: 'Number of lines of context around matches (default: 3)',
|
||||||
},
|
},
|
||||||
|
hostId: {
|
||||||
|
type: 'number',
|
||||||
|
description: 'Restrict matches to a given host ID.',
|
||||||
|
},
|
||||||
|
hostName: {
|
||||||
|
type: 'string',
|
||||||
|
description: 'Restrict matches to hosts whose name contains this value.',
|
||||||
|
},
|
||||||
|
caseSensitive: {
|
||||||
|
type: 'boolean',
|
||||||
|
description: 'Perform a case-sensitive search (default: false).',
|
||||||
|
},
|
||||||
|
wholeWord: {
|
||||||
|
type: 'boolean',
|
||||||
|
description: 'Match whole words only (default: false).',
|
||||||
|
},
|
||||||
|
maxMatchesPerEpisode: {
|
||||||
|
type: 'number',
|
||||||
|
description: 'Maximum number of excerpt matches to include per episode (default: 5).',
|
||||||
|
},
|
||||||
},
|
},
|
||||||
required: ['query'],
|
required: [],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -395,50 +516,50 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (name === 'search_transcripts') {
|
if (name === 'search_transcripts') {
|
||||||
const results = dataLoader.searchTranscripts(args.query, {
|
const searchOptions = {
|
||||||
limit: args.limit || 20,
|
limit: args.limit || 20,
|
||||||
contextLines: args.contextLines || 3,
|
contextLines: args.contextLines ?? 3,
|
||||||
});
|
terms: args.terms,
|
||||||
|
matchMode: args.matchMode,
|
||||||
|
hostId: args.hostId,
|
||||||
|
hostName: args.hostName,
|
||||||
|
caseSensitive: args.caseSensitive,
|
||||||
|
wholeWord: args.wholeWord,
|
||||||
|
maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5,
|
||||||
|
};
|
||||||
|
|
||||||
|
const results = dataLoader.searchTranscripts(args.query || '', searchOptions);
|
||||||
|
|
||||||
if (results.length === 0) {
|
if (results.length === 0) {
|
||||||
|
const descriptorParts = [];
|
||||||
|
if (args.query) descriptorParts.push(`phrase "${args.query}"`);
|
||||||
|
if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`);
|
||||||
|
if (args.hostId || args.hostName) descriptorParts.push('host filter applied');
|
||||||
|
const description = descriptorParts.length > 0 ? descriptorParts.join(', ') : 'the provided criteria';
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: [
|
content: [
|
||||||
{
|
{
|
||||||
type: 'text',
|
type: 'text',
|
||||||
text: `No transcripts found containing "${args.query}".`,
|
text: `No transcripts found matching ${description}.`,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const text = results.map(result => {
|
const formatArgs = {
|
||||||
const { episode, matches } = result;
|
...args,
|
||||||
const host = dataLoader.getHost(episode.hostid);
|
contextLines: searchOptions.contextLines,
|
||||||
|
maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode,
|
||||||
|
};
|
||||||
|
|
||||||
let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
|
const text = formatTranscriptSearchResults(results, formatArgs);
|
||||||
**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date}
|
|
||||||
|
|
||||||
**Matches found:** ${matches.length}
|
|
||||||
|
|
||||||
`;
|
|
||||||
|
|
||||||
matches.forEach(match => {
|
|
||||||
episodeText += `### Line ${match.lineNumber}
|
|
||||||
\`\`\`
|
|
||||||
${match.context}
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
`;
|
|
||||||
});
|
|
||||||
|
|
||||||
return episodeText;
|
|
||||||
}).join('\n---\n\n');
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: [
|
content: [
|
||||||
{
|
{
|
||||||
type: 'text',
|
type: 'text',
|
||||||
text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`,
|
text,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|||||||
179
server-http.js
179
server-http.js
@@ -168,6 +168,97 @@ ${stripHtml(episode.notes)}`;
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Render transcript search results as a Markdown report.
 *
 * The report has a header describing the search settings, a one-line-per-episode
 * summary list, and then one section per episode containing each captured
 * excerpt inside a fenced code block.
 *
 * @param {Array<object>} results - Search results; each entry carries `episode`,
 *   `matches` (with `lineNumber`, `terms`, `context`) and `matchSummary`.
 * @param {object} [args] - Tool arguments used for the search (`query`, `terms`,
 *   `contextLines`, `caseSensitive`, `wholeWord`, `maxMatchesPerEpisode`,
 *   `hostId`, `hostName`); only used to describe the search in the header.
 * @returns {string} Markdown text, or an empty string when there are no results.
 */
function formatTranscriptSearchResults(results, args = {}) {
  if (!Array.isArray(results) || results.length === 0) {
    return '';
  }
  const options = args ?? {};

  // Transcript excerpts are arbitrary text: if one contains a run of backticks,
  // a fixed ``` fence would be closed early and corrupt the rest of the report.
  // Use a fence one backtick longer than the longest run (minimum of three).
  const fenceFor = (content) => {
    const backtickRuns = String(content).match(/`+/g) ?? [];
    const longestRun = backtickRuns.reduce((max, run) => Math.max(max, run.length), 0);
    return '`'.repeat(Math.max(3, longestRun + 1));
  };

  // Derive the display fields for one result once, so the summary list and the
  // detail section cannot drift apart and the host is looked up a single time.
  const describeResult = (result) => {
    const summary = result.matchSummary ?? {};
    const matches = result.matches ?? [];
    const matchedTerms = summary.matchedTerms ?? [];
    const termCounts = Object.entries(summary.termHitCounts ?? {});
    const host = dataLoader.getHost(result.episode.hostid);
    return {
      matches,
      date: result.episode.date,
      label: `HPR${String(result.episode.id).padStart(4, '0')}: ${result.episode.title}`,
      hostName: host?.host || 'Unknown',
      matchedTerms: matchedTerms.length > 0 ? matchedTerms.join(', ') : 'N/A',
      termCountText: termCounts.length > 0
        ? termCounts.map(([term, count]) => `${term}: ${count}`).join(', ')
        : null,
      totalMatches: summary.totalMatches ?? matches.length,
      truncated: Boolean(summary.truncated),
    };
  };

  const descriptorParts = [];
  if (options.query) {
    descriptorParts.push(`phrase="${options.query}"`);
  }
  if (Array.isArray(options.terms) && options.terms.length > 0) {
    descriptorParts.push(`terms=[${options.terms.join(', ')}]`);
  }
  if (descriptorParts.length === 0) {
    descriptorParts.push('"no explicit query provided"');
  }

  // The match mode is taken from the first result's summary.
  const matchMode = results[0]?.matchSummary?.matchMode || 'phrase';
  const contextLines = options.contextLines ?? 3;
  const caseSensitive = options.caseSensitive ? 'yes' : 'no';
  const wholeWord = options.wholeWord ? 'yes' : 'no';
  const maxMatches = options.maxMatchesPerEpisode ?? 5;

  const hostFilters = [];
  if (options.hostId) hostFilters.push(`ID ${options.hostId}`);
  if (options.hostName) hostFilters.push(`name "${options.hostName}"`);

  let text = `# Transcript Search Results (${results.length} episodes)\n\n`;
  text += `Searching for: ${descriptorParts.join(' | ')}\n`;
  text += `Match mode: ${matchMode} | Context lines: ${contextLines} | Case sensitive: ${caseSensitive} | Whole word: ${wholeWord}\n`;
  text += `Maximum matches per episode: ${maxMatches}\n`;
  if (hostFilters.length > 0) {
    text += `Host filter: ${hostFilters.join(' & ')}\n`;
  }
  text += '\n## Summary\n';

  const entries = results.map(describeResult);

  text += entries.map((entry) => {
    const plural = entry.totalMatches === 1 ? '' : 'es';
    const truncatedNote = entry.truncated ? ' (truncated)' : '';
    const counts = entry.termCountText ? ` (${entry.termCountText})` : '';
    return `- ${entry.label} — ${entry.totalMatches} match${plural}${truncatedNote}; terms: ${entry.matchedTerms}${counts} | Host: ${entry.hostName} (${entry.date})`;
  }).join('\n');

  text += '\n\n';

  for (const entry of entries) {
    const omittedNote = entry.truncated ? ' (additional matches omitted after reaching limit)' : '';
    text += `## ${entry.label}\n`;
    text += `**Host:** ${entry.hostName} | **Date:** ${entry.date}\n`;
    text += `**Matched terms:** ${entry.matchedTerms}\n`;
    text += `**Matches captured:** ${entry.totalMatches}${omittedNote}\n`;
    if (entry.termCountText) {
      text += `**Term counts:** ${entry.termCountText}\n`;
    }
    text += '\n';

    entry.matches.forEach((match, index) => {
      const termInfo = match.terms && match.terms.length > 0
        ? ` | terms: ${match.terms.join(', ')}`
        : '';
      const fence = fenceFor(match.context);
      text += `### Match ${index + 1} (line ${match.lineNumber}${termInfo})\n`;
      text += `${fence}\n${match.context}\n${fence}\n\n`;
    });
  }

  return text;
}
|
||||||
|
|
||||||
// Create MCP server factory
|
// Create MCP server factory
|
||||||
function createMCPServer() {
|
function createMCPServer() {
|
||||||
const server = new Server(
|
const server = new Server(
|
||||||
@@ -370,13 +461,23 @@ All content is contributed by the community, for the community.`,
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'search_transcripts',
|
name: 'search_transcripts',
|
||||||
description: 'Search through episode transcripts for specific keywords or phrases',
|
description: 'Search through episode transcripts using phrases or multiple terms with AND/OR matching and optional host filters',
|
||||||
inputSchema: {
|
inputSchema: {
|
||||||
type: 'object',
|
type: 'object',
|
||||||
properties: {
|
properties: {
|
||||||
query: {
|
query: {
|
||||||
type: 'string',
|
type: 'string',
|
||||||
description: 'Search query to find in transcripts',
|
description: 'Search phrase to find in transcripts. Combine with terms/matchMode for advanced searches.',
|
||||||
|
},
|
||||||
|
terms: {
|
||||||
|
type: 'array',
|
||||||
|
items: { type: 'string' },
|
||||||
|
description: 'Explicit list of terms to search for; useful when pairing with matchMode "any" or "all".',
|
||||||
|
},
|
||||||
|
matchMode: {
|
||||||
|
type: 'string',
|
||||||
|
enum: ['any', 'all', 'phrase'],
|
||||||
|
description: 'How to interpret the query/terms. "phrase" (default) matches the phrase exactly, "any" matches if any term is present, "all" requires every term.',
|
||||||
},
|
},
|
||||||
limit: {
|
limit: {
|
||||||
type: 'number',
|
type: 'number',
|
||||||
@@ -386,8 +487,28 @@ All content is contributed by the community, for the community.`,
|
|||||||
type: 'number',
|
type: 'number',
|
||||||
description: 'Number of lines of context around matches (default: 3)',
|
description: 'Number of lines of context around matches (default: 3)',
|
||||||
},
|
},
|
||||||
|
hostId: {
|
||||||
|
type: 'number',
|
||||||
|
description: 'Restrict matches to a given host ID.',
|
||||||
|
},
|
||||||
|
hostName: {
|
||||||
|
type: 'string',
|
||||||
|
description: 'Restrict matches to hosts whose name contains this value.',
|
||||||
|
},
|
||||||
|
caseSensitive: {
|
||||||
|
type: 'boolean',
|
||||||
|
description: 'Perform a case-sensitive search (default: false).',
|
||||||
|
},
|
||||||
|
wholeWord: {
|
||||||
|
type: 'boolean',
|
||||||
|
description: 'Match whole words only (default: false).',
|
||||||
|
},
|
||||||
|
maxMatchesPerEpisode: {
|
||||||
|
type: 'number',
|
||||||
|
description: 'Maximum number of excerpt matches to include per episode (default: 5).',
|
||||||
|
},
|
||||||
},
|
},
|
||||||
required: ['query'],
|
required: [],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -507,50 +628,50 @@ All content is contributed by the community, for the community.`,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (name === 'search_transcripts') {
|
if (name === 'search_transcripts') {
|
||||||
const results = dataLoader.searchTranscripts(args.query, {
|
const searchOptions = {
|
||||||
limit: args.limit || 20,
|
limit: args.limit || 20,
|
||||||
contextLines: args.contextLines || 3,
|
contextLines: args.contextLines ?? 3,
|
||||||
});
|
terms: args.terms,
|
||||||
|
matchMode: args.matchMode,
|
||||||
|
hostId: args.hostId,
|
||||||
|
hostName: args.hostName,
|
||||||
|
caseSensitive: args.caseSensitive,
|
||||||
|
wholeWord: args.wholeWord,
|
||||||
|
maxMatchesPerEpisode: args.maxMatchesPerEpisode ?? 5,
|
||||||
|
};
|
||||||
|
|
||||||
|
const results = dataLoader.searchTranscripts(args.query || '', searchOptions);
|
||||||
|
|
||||||
if (results.length === 0) {
|
if (results.length === 0) {
|
||||||
|
const descriptorParts = [];
|
||||||
|
if (args.query) descriptorParts.push(`phrase "${args.query}"`);
|
||||||
|
if (Array.isArray(args.terms) && args.terms.length > 0) descriptorParts.push(`terms [${args.terms.join(', ')}]`);
|
||||||
|
if (args.hostId || args.hostName) descriptorParts.push('host filter applied');
|
||||||
|
const description = descriptorParts.length > 0 ? descriptorParts.join(', ') : 'the provided criteria';
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: [
|
content: [
|
||||||
{
|
{
|
||||||
type: 'text',
|
type: 'text',
|
||||||
text: `No transcripts found containing "${args.query}".`,
|
text: `No transcripts found matching ${description}.`,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const text = results.map(result => {
|
const formatArgs = {
|
||||||
const { episode, matches } = result;
|
...args,
|
||||||
const host = dataLoader.getHost(episode.hostid);
|
contextLines: searchOptions.contextLines,
|
||||||
|
maxMatchesPerEpisode: searchOptions.maxMatchesPerEpisode,
|
||||||
|
};
|
||||||
|
|
||||||
let episodeText = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
|
const text = formatTranscriptSearchResults(results, formatArgs);
|
||||||
**Host:** ${host?.host || 'Unknown'} | **Date:** ${episode.date}
|
|
||||||
|
|
||||||
**Matches found:** ${matches.length}
|
|
||||||
|
|
||||||
`;
|
|
||||||
|
|
||||||
matches.forEach(match => {
|
|
||||||
episodeText += `### Line ${match.lineNumber}
|
|
||||||
\`\`\`
|
|
||||||
${match.context}
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
`;
|
|
||||||
});
|
|
||||||
|
|
||||||
return episodeText;
|
|
||||||
}).join('\n---\n\n');
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: [
|
content: [
|
||||||
{
|
{
|
||||||
type: 'text',
|
type: 'text',
|
||||||
text: `# Transcript Search Results (${results.length} episodes)\n\nSearching for: "${args.query}"\n\n${text}`,
|
text,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user