From 8924bb489f344f5a2c94bbdd910a12837ca2be6c Mon Sep 17 00:00:00 2001 From: Lee Hanken Date: Sun, 2 Nov 2025 12:51:40 +0000 Subject: [PATCH] use fuzzy matching for host and episode searches --- README.md | 45 ++++++++++++- data-loader.js | 156 +++++++++++++++++++++++++++++++++++++------ index.js | 18 ++++- server-http.js | 18 ++++- test-fuzzy-http.js | 113 +++++++++++++++++++++++++++++++ test-fuzzy-search.js | 82 +++++++++++++++++++++++ 6 files changed, 406 insertions(+), 26 deletions(-) create mode 100644 test-fuzzy-http.js create mode 100644 test-fuzzy-search.js diff --git a/README.md b/README.md index ff10d10..9df8465 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,11 @@ Hacker Public Radio is a community-driven podcast where hosts contribute content This MCP server provides: - **Episode Search**: Search through thousands of HPR episodes by title, summary, tags, or host notes -- **Transcript Search**: Full-text search across all episode transcripts + - **Fuzzy Matching**: Automatically handles typos and misspellings (e.g., "linx" finds "linux", "pythoon" finds "python") +- **Transcript Search**: Full-text search across all episode transcripts with flexible matching modes - **Episode Details**: Get complete information about any episode including transcript and comments - **Host Information**: Look up hosts and see all their contributions + - **Fuzzy Matching**: Handles name variations and typos (e.g., "klattu" finds "Klaatu") - **Series Browsing**: Explore mini-series of related episodes - **Statistics**: View overall HPR statistics and recent episodes @@ -189,6 +191,45 @@ Get information about a series and all its episodes. Get information about series 4 (Databases series) ``` +## Fuzzy Matching + +The server includes intelligent fuzzy matching for episode and host searches to handle typos and misspellings. + +### How It Works + +1. **Exact Match First**: The server always tries exact substring matching first for speed +2. 
**Fuzzy Fallback**: If no exact matches are found, it falls back to fuzzy matching using Levenshtein distance +3. **Match Indicators**: Results include indicators showing whether they're exact or fuzzy matches + +### Examples + +**Host Search:** +- Query: `"klattu"` → Finds: **Klaatu** *(fuzzy match, distance: 1)* +- Query: `"ken"` → Finds: **Ken Fallon** *(exact match)* + +**Episode Search:** +- Query: `"pythoon"` → Finds episodes with **python** in the title *(fuzzy match, distance: 1)* +- Query: `"linx"` → Finds episodes with **linux** in the title *(fuzzy match, distance: 1)*, but only when no episode contains `linx` as an exact substring of its title, summary, tags, or notes + +### Distance Thresholds + +- **Hosts**: Maximum edit distance of 2 (handles 1-2 typos), measured against the full host name or email address +- **Episodes**: Maximum edit distance of 3 (more lenient than for hosts), measured against each individual word of the episode title + +### What the AI Agent Sees + +When fuzzy matching is used, results include: +- `matchType: 'exact'` or `matchType: 'fuzzy'` +- `matchDistance: N` (for fuzzy matches, indicating how many character edits were needed) + +This allows AI agents to provide context to users, such as: *"I found results for 'klaatu' (you typed 'klattu')"* + +### Technical Details + +The fuzzy matching uses the **Levenshtein distance algorithm**, which counts the minimum number of single-character edits (insertions, deletions, substitutions) needed to change one string into another. + +**Note**: Transcript search uses regex-based matching and does not use fuzzy matching, as the flexible regex patterns already handle many variations. + ## Available Resources ### `hpr://stats` @@ -314,7 +355,7 @@ The Hacker Public Radio content itself is released under various Creative Common Contributions are welcome!
This server can be extended with: -- Advanced search features (fuzzy matching, relevance ranking) +- Advanced search features (relevance ranking, semantic search) - Tag cloud generation - Episode recommendations - Audio file access diff --git a/data-loader.js b/data-loader.js index 9368381..d498a9b 100644 --- a/data-loader.js +++ b/data-loader.js @@ -9,6 +9,45 @@ function escapeRegExp(string) { return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } +/** + * Calculate Levenshtein distance between two strings + * Returns the minimum number of single-character edits (insertions, deletions, substitutions) + * needed to change one string into the other. + */ +function levenshteinDistance(a, b) { + if (a.length === 0) return b.length; + if (b.length === 0) return a.length; + + const matrix = []; + + // Initialize first column + for (let i = 0; i <= b.length; i++) { + matrix[i] = [i]; + } + + // Initialize first row + for (let j = 0; j <= a.length; j++) { + matrix[0][j] = j; + } + + // Fill in the rest of the matrix + for (let i = 1; i <= b.length; i++) { + for (let j = 1; j <= a.length; j++) { + if (b.charAt(i - 1) === a.charAt(j - 1)) { + matrix[i][j] = matrix[i - 1][j - 1]; + } else { + matrix[i][j] = Math.min( + matrix[i - 1][j - 1] + 1, // substitution + matrix[i][j - 1] + 1, // insertion + matrix[i - 1][j] + 1 // deletion + ); + } + } + } + + return matrix[b.length][a.length]; +} + class HPRDataLoader { constructor() { this.episodes = []; @@ -135,7 +174,8 @@ class HPRDataLoader { } /** - * Search episodes by keyword in title, summary, or tags + * Search episodes by keyword in title, summary, or tags with fuzzy matching fallback + * Returns episodes with matchType indicator ('exact' or 'fuzzy') */ searchEpisodes(query, options = {}) { const { @@ -144,37 +184,78 @@ class HPRDataLoader { seriesId = null, tag = null, fromDate = null, - toDate = null + toDate = null, + maxDistance = 3 // More lenient for longer episode titles } = options; const queryLower = 
query.toLowerCase(); + + // Helper to check if episode matches filters (excluding query) + const matchesFilters = (ep) => { + const matchesHost = !hostId || ep.hostid === hostId; + const matchesSeries = seriesId === null || ep.series === seriesId; + const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase()); + const matchesDateRange = (!fromDate || ep.date >= fromDate) && + (!toDate || ep.date <= toDate); + return matchesHost && matchesSeries && matchesTag && matchesDateRange; + }; + + // Try exact substring match first (fast path) let results = this.episodes.filter(ep => { - // Basic text search const matchesQuery = !query || ep.title.toLowerCase().includes(queryLower) || ep.summary.toLowerCase().includes(queryLower) || ep.tags.toLowerCase().includes(queryLower) || ep.notes.toLowerCase().includes(queryLower); - // Filter by host - const matchesHost = !hostId || ep.hostid === hostId; + return matchesQuery && matchesFilters(ep); + }).map(ep => ({ + ...ep, + matchType: 'exact' + })); - // Filter by series - const matchesSeries = seriesId === null || ep.series === seriesId; + // If no exact matches and we have a query, try fuzzy match on title + if (results.length === 0 && query && query.trim().length > 0) { + const fuzzyResults = this.episodes + .filter(matchesFilters) + .map(ep => { + // Check if any word in the title is close to the query + const titleWords = ep.title.toLowerCase().split(/\s+/); + let minDistance = Infinity; - // Filter by tag - const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase()); + for (const word of titleWords) { + const distance = levenshteinDistance(queryLower, word); + if (distance < minDistance) { + minDistance = distance; + } + } - // Filter by date range - const matchesDateRange = (!fromDate || ep.date >= fromDate) && - (!toDate || ep.date <= toDate); + return { + episode: ep, + distance: minDistance + }; + }) + .filter(result => result.distance <= maxDistance) + .sort((a, b) => a.distance - 
b.distance) + .map(result => ({ + ...result.episode, + matchType: 'fuzzy', + matchDistance: result.distance + })); - return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange; + results = fuzzyResults; + } + + // Sort by date (newest first), maintaining match quality + results.sort((a, b) => { + // If both are fuzzy matches, sort by distance first, then date + if (a.matchType === 'fuzzy' && b.matchType === 'fuzzy') { + const distDiff = (a.matchDistance || 0) - (b.matchDistance || 0); + if (distDiff !== 0) return distDiff; + } + return b.date.localeCompare(a.date); }); - // Sort by date (newest first) - results.sort((a, b) => b.date.localeCompare(a.date)); - return results.slice(0, limit); } @@ -329,14 +410,49 @@ class HPRDataLoader { } /** - * Search hosts by name or email + * Search hosts by name or email with fuzzy matching fallback + * Returns hosts with matchType indicator ('exact' or 'fuzzy') */ - searchHosts(query) { + searchHosts(query, options = {}) { + const { maxDistance = 2 } = options; const queryLower = query.toLowerCase(); - return this.hosts.filter(host => + + // Try exact substring match first (fast path) + const exactMatches = this.hosts.filter(host => host.host.toLowerCase().includes(queryLower) || host.email.toLowerCase().includes(queryLower) - ); + ).map(host => ({ + ...host, + matchType: 'exact' + })); + + if (exactMatches.length > 0) { + return exactMatches; + } + + // Fall back to fuzzy match if no exact matches + const fuzzyMatches = this.hosts + .map(host => { + const hostLower = host.host.toLowerCase(); + const emailLower = host.email.toLowerCase(); + const hostDistance = levenshteinDistance(queryLower, hostLower); + const emailDistance = levenshteinDistance(queryLower, emailLower); + const minDistance = Math.min(hostDistance, emailDistance); + + return { + host, + distance: minDistance + }; + }) + .filter(result => result.distance <= maxDistance) + .sort((a, b) => a.distance - b.distance) + .map(result => ({ + 
...result.host, + matchType: 'fuzzy', + matchDistance: result.distance + })); + + return fuzzyMatches; } /** diff --git a/index.js b/index.js index 26085f2..3feaa84 100755 --- a/index.js +++ b/index.js @@ -45,7 +45,14 @@ function formatEpisode(episode, includeNotes = false) { const host = dataLoader.getHost(episode.hostid); const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null; - let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} + let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`; + + // Add match type indicator for fuzzy matches + if (episode.matchType === 'fuzzy') { + result += ` *(fuzzy match, distance: ${episode.matchDistance})*`; + } + + result += ` **Date:** ${episode.date} **Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid}) @@ -606,7 +613,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { }; } - let text = `# ${host.host} + let text = `# ${host.host}`; + + // Add match type indicator for fuzzy matches + if (host.matchType === 'fuzzy') { + text += ` *(fuzzy match, distance: ${host.matchDistance})*`; + } + + text += ` **Host ID:** ${host.hostid} **Email:** ${host.email} diff --git a/server-http.js b/server-http.js index b53ab88..e319324 100644 --- a/server-http.js +++ b/server-http.js @@ -139,7 +139,14 @@ function formatEpisode(episode, includeNotes = false) { const host = dataLoader.getHost(episode.hostid); const seriesInfo = episode.series !== 0 ? 
dataLoader.getSeries(episode.series) : null; - let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} + let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`; + + // Add match type indicator for fuzzy matches + if (episode.matchType === 'fuzzy') { + result += ` *(fuzzy match, distance: ${episode.matchDistance})*`; + } + + result += ` **Date:** ${episode.date} **Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid}) @@ -718,7 +725,14 @@ All content is contributed by the community, for the community.`, }; } - let text = `# ${host.host} + let text = `# ${host.host}`; + + // Add match type indicator for fuzzy matches + if (host.matchType === 'fuzzy') { + text += ` *(fuzzy match, distance: ${host.matchDistance})*`; + } + + text += ` **Host ID:** ${host.hostid} **Email:** ${host.email} diff --git a/test-fuzzy-http.js b/test-fuzzy-http.js new file mode 100644 index 0000000..ca98187 --- /dev/null +++ b/test-fuzzy-http.js @@ -0,0 +1,113 @@ +#!/usr/bin/env node + +/** + * Test fuzzy search via HTTP/SSE MCP Server + */ + +import EventSource from 'eventsource'; +import fetch from 'node-fetch'; + +const SERVER_URL = 'http://localhost:3000'; +const SSE_ENDPOINT = `${SERVER_URL}/sse`; +const MESSAGE_ENDPOINT = `${SERVER_URL}/message`; + +let requestId = 1; +let sse; +let connectionId = null; + +async function sendMessage(method, params = {}) { + const message = { + jsonrpc: '2.0', + id: requestId++, + method, + params + }; + + return new Promise(async (resolve) => { + const handler = (event) => { + try { + const data = JSON.parse(event.data); + if (data.id === message.id) { + sse.removeEventListener('message', handler); + resolve(data.result); + } + } catch (e) { + // Ignore parse errors + } + }; + + sse.addEventListener('message', handler); + + await fetch(MESSAGE_ENDPOINT, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-connection-id': connectionId + }, + body: JSON.stringify(message) + }); + }); +} + 
+async function test() { + console.log('Testing fuzzy search via HTTP/SSE MCP\n'); + + // Connect to SSE + sse = new EventSource(SSE_ENDPOINT); + + await new Promise((resolve) => { + sse.addEventListener('endpoint', (event) => { + const url = new URL(event.data, SERVER_URL); + connectionId = url.searchParams.get('sessionId'); + console.log(`Connected with session ID: ${connectionId}\n`); + resolve(); + }); + }); + + await new Promise(resolve => setTimeout(resolve, 500)); + + // Test 1: Search for host with typo + console.log('=== Test 1: Fuzzy Host Search ==='); + console.log('Searching for host: "klattu" (typo for Klaatu)\n'); + + const hostResult = await sendMessage('tools/call', { + name: 'get_host_info', + arguments: { + hostName: 'klattu' + } + }); + + const hostText = hostResult.content[0].text; + const hostLines = hostText.split('\n').slice(0, 8); + console.log(hostLines.join('\n')); + console.log(''); + + // Test 2: Search episodes with typo + console.log('=== Test 2: Fuzzy Episode Search ==='); + console.log('Searching for episodes: "pythoon" (typo for python)\n'); + + const episodeResult = await sendMessage('tools/call', { + name: 'search_episodes', + arguments: { + query: 'pythoon', + limit: 2 + } + }); + + const episodeText = episodeResult.content[0].text; + // Extract just the first episode header + const firstEpisode = episodeText.split('\n---\n')[0]; + const episodeLines = firstEpisode.split('\n').slice(0, 10); + console.log(episodeLines.join('\n')); + console.log(''); + + console.log('✅ HTTP/SSE fuzzy search tests completed!\n'); + + sse.close(); + process.exit(0); +} + +test().catch(err => { + console.error('Error:', err); + process.exit(1); +}); diff --git a/test-fuzzy-search.js b/test-fuzzy-search.js new file mode 100644 index 0000000..8200f82 --- /dev/null +++ b/test-fuzzy-search.js @@ -0,0 +1,82 @@ +#!/usr/bin/env node + +/** + * Test script for fuzzy search functionality + * Tests both episode and host fuzzy matching + */ + +import 
HPRDataLoader from './data-loader.js'; + +console.log('Loading HPR data...\n'); +const dataLoader = new HPRDataLoader(); +await dataLoader.load(); +console.log('Data loaded!\n'); + +// Test 1: Exact host match (should use exact matching) +console.log('=== Test 1: Exact Host Match ==='); +console.log('Query: "ken"\n'); +const exactHosts = dataLoader.searchHosts('ken'); +console.log(`Found ${exactHosts.length} results (exact match)`); +exactHosts.slice(0, 3).forEach(host => { + console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}]`); +}); +console.log(''); + +// Test 2: Fuzzy host match with typo +console.log('=== Test 2: Fuzzy Host Match (typo) ==='); +console.log('Query: "klattu" (should match "klaatu")\n'); +const fuzzyHosts = dataLoader.searchHosts('klattu'); +console.log(`Found ${fuzzyHosts.length} results`); +fuzzyHosts.forEach(host => { + console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}, distance: ${host.matchDistance}]`); +}); +console.log(''); + +// Test 3: Another fuzzy host match +console.log('=== Test 3: Fuzzy Host Match (another typo) ==='); +console.log('Query: "dav" (should find hosts like "Dave")\n'); +const fuzzyHosts2 = dataLoader.searchHosts('dav'); +console.log(`Found ${fuzzyHosts2.length} results`); +fuzzyHosts2.slice(0, 5).forEach(host => { + console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}${host.matchDistance ? 
', distance: ' + host.matchDistance : ''}]`); +}); +console.log(''); + +// Test 4: Exact episode search +console.log('=== Test 4: Exact Episode Match ==='); +console.log('Query: "linux" (exact match in title/summary)\n'); +const exactEpisodes = dataLoader.searchEpisodes('linux', { limit: 3 }); +console.log(`Found ${exactEpisodes.length} results`); +exactEpisodes.forEach(ep => { + console.log(` - HPR${String(ep.id).padStart(4, '0')}: ${ep.title} [matchType: ${ep.matchType}]`); +}); +console.log(''); + +// Test 5: Fuzzy episode search with typo +console.log('=== Test 5: Fuzzy Episode Match (typo) ==='); +console.log('Query: "linx" (should match episodes with "linux" in title)\n'); +const fuzzyEpisodes = dataLoader.searchEpisodes('linx', { limit: 3 }); +console.log(`Found ${fuzzyEpisodes.length} results`); +fuzzyEpisodes.forEach(ep => { + console.log(` - HPR${String(ep.id).padStart(4, '0')}: ${ep.title.substring(0, 60)}... [matchType: ${ep.matchType}${ep.matchDistance ? ', distance: ' + ep.matchDistance : ''}]`); +}); +console.log(''); + +// Test 6: Another fuzzy episode search +console.log('=== Test 6: Fuzzy Episode Match (misspelling) ==='); +console.log('Query: "pythoon" (should match "python")\n'); +const fuzzyEpisodes2 = dataLoader.searchEpisodes('pythoon', { limit: 3 }); +console.log(`Found ${fuzzyEpisodes2.length} results`); +fuzzyEpisodes2.forEach(ep => { + console.log(` - HPR${String(ep.id).padStart(4, '0')}: ${ep.title.substring(0, 60)}... [matchType: ${ep.matchType}${ep.matchDistance ? ', distance: ' + ep.matchDistance : ''}]`); +}); +console.log(''); + +// Test 7: No match (distance too large) +console.log('=== Test 7: No Match (distance too large) ==='); +console.log('Query: "xyzabc" (should find nothing)\n'); +const noMatch = dataLoader.searchHosts('xyzabc'); +console.log(`Found ${noMatch.length} results`); +console.log(''); + +console.log('✅ All fuzzy search tests completed!');