use fuzzy matching for host and episode searches

This commit is contained in:
Lee Hanken
2025-11-02 12:51:40 +00:00
parent 020d324edb
commit 8924bb489f
6 changed files with 406 additions and 26 deletions

View File

@@ -11,9 +11,11 @@ Hacker Public Radio is a community-driven podcast where hosts contribute content
This MCP server provides: This MCP server provides:
- **Episode Search**: Search through thousands of HPR episodes by title, summary, tags, or host notes - **Episode Search**: Search through thousands of HPR episodes by title, summary, tags, or host notes
- **Transcript Search**: Full-text search across all episode transcripts - **Fuzzy Matching**: Automatically handles typos and misspellings (e.g., "linx" finds "linux", "pythoon" finds "python")
- **Transcript Search**: Full-text search across all episode transcripts with flexible matching modes
- **Episode Details**: Get complete information about any episode including transcript and comments - **Episode Details**: Get complete information about any episode including transcript and comments
- **Host Information**: Look up hosts and see all their contributions - **Host Information**: Look up hosts and see all their contributions
- **Fuzzy Matching**: Handles name variations and typos (e.g., "klattu" finds "Klaatu")
- **Series Browsing**: Explore mini-series of related episodes - **Series Browsing**: Explore mini-series of related episodes
- **Statistics**: View overall HPR statistics and recent episodes - **Statistics**: View overall HPR statistics and recent episodes
@@ -189,6 +191,45 @@ Get information about a series and all its episodes.
Get information about series 4 (Databases series) Get information about series 4 (Databases series)
``` ```
## Fuzzy Matching
The server includes intelligent fuzzy matching for episode and host searches to handle typos and misspellings.
### How It Works
1. **Exact Match First**: The server always tries exact substring matching first for speed
2. **Fuzzy Fallback**: If no exact matches are found, it falls back to fuzzy matching using Levenshtein distance
3. **Match Indicators**: Results include indicators showing whether they're exact or fuzzy matches
### Examples
**Host Search:**
- Query: `"klattu"` → Finds: **Klaatu** *(fuzzy match, distance: 1)*
- Query: `"ken"` → Finds: **Ken Fallon** *(exact match)*
**Episode Search:**
- Query: `"pythoon"` → Finds episodes with **python** in the title *(fuzzy match, distance: 1)*
- Query: `"linx"` → Finds episodes with **linux** *(may match exactly in summary/tags, or fuzzy in title)*
### Distance Thresholds
- **Hosts**: Maximum edit distance of 2 (handles 1-2 typos), measured against the full host name or email address
- **Episodes**: Maximum edit distance of 3 (more lenient than for hosts), measured against each individual word of the episode title
### What the AI Agent Sees
When fuzzy matching is used, results include:
- `matchType: 'exact'` or `matchType: 'fuzzy'`
- `matchDistance: N` (for fuzzy matches, indicating how many character edits were needed)
This allows AI agents to provide context to users, such as: *"I found results for 'klaatu' (you typed 'klattu')"*
### Technical Details
The fuzzy matching uses the **Levenshtein distance algorithm**, which counts the minimum number of single-character edits (insertions, deletions, substitutions) needed to change one string into another.
**Note**: Transcript search uses regex-based matching and does not use fuzzy matching, as the flexible regex patterns already handle many variations.
## Available Resources ## Available Resources
### `hpr://stats` ### `hpr://stats`
@@ -314,7 +355,7 @@ The Hacker Public Radio content itself is released under various Creative Common
Contributions are welcome! This server can be extended with: Contributions are welcome! This server can be extended with:
- Advanced search features (fuzzy matching, relevance ranking) - Advanced search features (relevance ranking, semantic search)
- Tag cloud generation - Tag cloud generation
- Episode recommendations - Episode recommendations
- Audio file access - Audio file access

View File

@@ -9,6 +9,45 @@ function escapeRegExp(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
} }
/**
 * Levenshtein (edit) distance between two strings.
 *
 * The result is the smallest number of single-character insertions,
 * deletions and substitutions that turns `a` into `b`. Comparison is
 * case-sensitive and works on UTF-16 code units (via `charAt`).
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(a, b) {
  const aLen = a.length;
  const bLen = b.length;

  // Against an empty string, the distance is simply the other string's length.
  if (aLen === 0) return bLen;
  if (bLen === 0) return aLen;

  // Only the previous row of the DP table is needed to compute the next one.
  // previousRow[j] = distance between the first j chars of `a` and the
  // already-processed prefix of `b` (initially the empty prefix).
  let previousRow = [];
  for (let col = 0; col <= aLen; col++) {
    previousRow.push(col);
  }

  for (let row = 1; row <= bLen; row++) {
    // Column 0: turning an empty prefix of `a` into `row` chars of `b`.
    const currentRow = [row];
    const bChar = b.charAt(row - 1);

    for (let col = 1; col <= aLen; col++) {
      const diagonal = previousRow[col - 1];

      if (bChar === a.charAt(col - 1)) {
        // Matching characters cost nothing: carry the diagonal forward.
        currentRow[col] = diagonal;
        continue;
      }

      const substitution = diagonal + 1;
      const insertion = currentRow[col - 1] + 1;
      const deletion = previousRow[col] + 1;
      currentRow[col] = Math.min(substitution, insertion, deletion);
    }

    previousRow = currentRow;
  }

  return previousRow[aLen];
}
class HPRDataLoader { class HPRDataLoader {
constructor() { constructor() {
this.episodes = []; this.episodes = [];
@@ -135,7 +174,8 @@ class HPRDataLoader {
} }
/** /**
* Search episodes by keyword in title, summary, or tags * Search episodes by keyword in title, summary, or tags with fuzzy matching fallback
* Returns episodes with matchType indicator ('exact' or 'fuzzy')
*/ */
searchEpisodes(query, options = {}) { searchEpisodes(query, options = {}) {
const { const {
@@ -144,37 +184,78 @@ class HPRDataLoader {
seriesId = null, seriesId = null,
tag = null, tag = null,
fromDate = null, fromDate = null,
toDate = null toDate = null,
maxDistance = 3 // More lenient for longer episode titles
} = options; } = options;
const queryLower = query.toLowerCase(); const queryLower = query.toLowerCase();
// Helper to check if episode matches filters (excluding query)
const matchesFilters = (ep) => {
const matchesHost = !hostId || ep.hostid === hostId;
const matchesSeries = seriesId === null || ep.series === seriesId;
const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase());
const matchesDateRange = (!fromDate || ep.date >= fromDate) &&
(!toDate || ep.date <= toDate);
return matchesHost && matchesSeries && matchesTag && matchesDateRange;
};
// Try exact substring match first (fast path)
let results = this.episodes.filter(ep => { let results = this.episodes.filter(ep => {
// Basic text search
const matchesQuery = !query || const matchesQuery = !query ||
ep.title.toLowerCase().includes(queryLower) || ep.title.toLowerCase().includes(queryLower) ||
ep.summary.toLowerCase().includes(queryLower) || ep.summary.toLowerCase().includes(queryLower) ||
ep.tags.toLowerCase().includes(queryLower) || ep.tags.toLowerCase().includes(queryLower) ||
ep.notes.toLowerCase().includes(queryLower); ep.notes.toLowerCase().includes(queryLower);
// Filter by host return matchesQuery && matchesFilters(ep);
const matchesHost = !hostId || ep.hostid === hostId; }).map(ep => ({
...ep,
matchType: 'exact'
}));
// Filter by series // If no exact matches and we have a query, try fuzzy match on title
const matchesSeries = seriesId === null || ep.series === seriesId; if (results.length === 0 && query && query.trim().length > 0) {
const fuzzyResults = this.episodes
.filter(matchesFilters)
.map(ep => {
// Check if any word in the title is close to the query
const titleWords = ep.title.toLowerCase().split(/\s+/);
let minDistance = Infinity;
// Filter by tag for (const word of titleWords) {
const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase()); const distance = levenshteinDistance(queryLower, word);
if (distance < minDistance) {
minDistance = distance;
}
}
// Filter by date range return {
const matchesDateRange = (!fromDate || ep.date >= fromDate) && episode: ep,
(!toDate || ep.date <= toDate); distance: minDistance
};
})
.filter(result => result.distance <= maxDistance)
.sort((a, b) => a.distance - b.distance)
.map(result => ({
...result.episode,
matchType: 'fuzzy',
matchDistance: result.distance
}));
return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange; results = fuzzyResults;
}
// Sort by date (newest first), maintaining match quality
results.sort((a, b) => {
// If both are fuzzy matches, sort by distance first, then date
if (a.matchType === 'fuzzy' && b.matchType === 'fuzzy') {
const distDiff = (a.matchDistance || 0) - (b.matchDistance || 0);
if (distDiff !== 0) return distDiff;
}
return b.date.localeCompare(a.date);
}); });
// Sort by date (newest first)
results.sort((a, b) => b.date.localeCompare(a.date));
return results.slice(0, limit); return results.slice(0, limit);
} }
@@ -329,14 +410,49 @@ class HPRDataLoader {
} }
/** /**
* Search hosts by name or email * Search hosts by name or email with fuzzy matching fallback
* Returns hosts with matchType indicator ('exact' or 'fuzzy')
*/ */
searchHosts(query) { searchHosts(query, options = {}) {
const { maxDistance = 2 } = options;
const queryLower = query.toLowerCase(); const queryLower = query.toLowerCase();
return this.hosts.filter(host =>
// Try exact substring match first (fast path)
const exactMatches = this.hosts.filter(host =>
host.host.toLowerCase().includes(queryLower) || host.host.toLowerCase().includes(queryLower) ||
host.email.toLowerCase().includes(queryLower) host.email.toLowerCase().includes(queryLower)
); ).map(host => ({
...host,
matchType: 'exact'
}));
if (exactMatches.length > 0) {
return exactMatches;
}
// Fall back to fuzzy match if no exact matches
const fuzzyMatches = this.hosts
.map(host => {
const hostLower = host.host.toLowerCase();
const emailLower = host.email.toLowerCase();
const hostDistance = levenshteinDistance(queryLower, hostLower);
const emailDistance = levenshteinDistance(queryLower, emailLower);
const minDistance = Math.min(hostDistance, emailDistance);
return {
host,
distance: minDistance
};
})
.filter(result => result.distance <= maxDistance)
.sort((a, b) => a.distance - b.distance)
.map(result => ({
...result.host,
matchType: 'fuzzy',
matchDistance: result.distance
}));
return fuzzyMatches;
} }
/** /**

View File

@@ -45,7 +45,14 @@ function formatEpisode(episode, includeNotes = false) {
const host = dataLoader.getHost(episode.hostid); const host = dataLoader.getHost(episode.hostid);
const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null; const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null;
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`;
// Add match type indicator for fuzzy matches
if (episode.matchType === 'fuzzy') {
result += ` *(fuzzy match, distance: ${episode.matchDistance})*`;
}
result += `
**Date:** ${episode.date} **Date:** ${episode.date}
**Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid}) **Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid})
@@ -606,7 +613,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
}; };
} }
let text = `# ${host.host} let text = `# ${host.host}`;
// Add match type indicator for fuzzy matches
if (host.matchType === 'fuzzy') {
text += ` *(fuzzy match, distance: ${host.matchDistance})*`;
}
text += `
**Host ID:** ${host.hostid} **Host ID:** ${host.hostid}
**Email:** ${host.email} **Email:** ${host.email}

View File

@@ -139,7 +139,14 @@ function formatEpisode(episode, includeNotes = false) {
const host = dataLoader.getHost(episode.hostid); const host = dataLoader.getHost(episode.hostid);
const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null; const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null;
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title} let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`;
// Add match type indicator for fuzzy matches
if (episode.matchType === 'fuzzy') {
result += ` *(fuzzy match, distance: ${episode.matchDistance})*`;
}
result += `
**Date:** ${episode.date} **Date:** ${episode.date}
**Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid}) **Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid})
@@ -718,7 +725,14 @@ All content is contributed by the community, for the community.`,
}; };
} }
let text = `# ${host.host} let text = `# ${host.host}`;
// Add match type indicator for fuzzy matches
if (host.matchType === 'fuzzy') {
text += ` *(fuzzy match, distance: ${host.matchDistance})*`;
}
text += `
**Host ID:** ${host.hostid} **Host ID:** ${host.hostid}
**Email:** ${host.email} **Email:** ${host.email}

113
test-fuzzy-http.js Normal file
View File

@@ -0,0 +1,113 @@
#!/usr/bin/env node
/**
* Test fuzzy search via HTTP/SSE MCP Server
*/
import EventSource from 'eventsource';
import fetch from 'node-fetch';
// Base URL of the MCP server under test; both endpoints below are derived from it.
const SERVER_URL = 'http://localhost:3000';
// Endpoint the EventSource connects to in test().
const SSE_ENDPOINT = `${SERVER_URL}/sse`;
// Endpoint sendMessage() POSTs JSON-RPC requests to.
const MESSAGE_ENDPOINT = `${SERVER_URL}/message`;
// Next JSON-RPC request id; sendMessage() post-increments it for every request.
let requestId = 1;
// Shared EventSource instance; assigned in test() and listened on by sendMessage().
let sse;
// Session id read from the 'endpoint' SSE event; sent as the x-connection-id header.
let connectionId = null;
/**
 * Send a JSON-RPC 2.0 request to the message endpoint and wait for the
 * response with the same id to arrive on the shared SSE stream.
 *
 * The request is POSTed to MESSAGE_ENDPOINT with the current connection id in
 * the `x-connection-id` header; the answer is read from `message` events on
 * the module-level `sse` EventSource.
 *
 * @param {string} method - JSON-RPC method name (e.g. 'tools/call').
 * @param {object} [params={}] - JSON-RPC params object.
 * @param {number} [timeoutMs=10000] - How long to wait for the response
 *   before giving up.
 * @returns {Promise<*>} Resolves with the `result` member of the response.
 * @throws {Error} Rejects when the POST fails or returns a non-2xx status,
 *   when the server answers with a JSON-RPC `error`, or when no matching
 *   response arrives within `timeoutMs`.
 */
async function sendMessage(method, params = {}, timeoutMs = 10000) {
  const message = {
    jsonrpc: '2.0',
    id: requestId++,
    method,
    params
  };

  return new Promise((resolve, reject) => {
    let timer = null;

    // Always detach the listener and timer so a finished (or failed) request
    // leaves nothing behind on the shared EventSource.
    const settle = (finish, value) => {
      clearTimeout(timer);
      sse.removeEventListener('message', handler);
      finish(value);
    };

    const handler = (event) => {
      let data;
      try {
        data = JSON.parse(event.data);
      } catch {
        // Non-JSON events on the stream are not responses; ignore them.
        return;
      }

      // Events for other requests (or non-object payloads) are not ours.
      if (data?.id !== message.id) return;

      if (data.error) {
        const { code, message: detail } = data.error;
        const codeText = code === undefined ? '' : ` ${code}`;
        settle(
          reject,
          new Error(`JSON-RPC error${codeText} for "${method}": ${detail ?? 'unknown error'}`)
        );
        return;
      }

      settle(resolve, data.result);
    };

    // Listen before posting so a fast response cannot be missed.
    sse.addEventListener('message', handler);

    timer = setTimeout(() => {
      settle(
        reject,
        new Error(`Timed out after ${timeoutMs} ms waiting for response to "${method}" (id ${message.id})`)
      );
    }, timeoutMs);

    fetch(MESSAGE_ENDPOINT, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-connection-id': connectionId
      },
      body: JSON.stringify(message)
    })
      .then((response) => {
        if (!response.ok) {
          settle(
            reject,
            new Error(`POST ${MESSAGE_ENDPOINT} failed with HTTP ${response.status}`)
          );
        }
      })
      // Network-level failures must reject the request instead of leaving it pending.
      .catch((err) => settle(reject, err));
  });
}
/**
 * Smoke-test fuzzy search through a running HTTP/SSE MCP server.
 *
 * Opens the SSE stream, waits for the 'endpoint' event to learn the session
 * id, then issues two tool calls with deliberately misspelled queries
 * ('klattu' via get_host_info and 'pythoon' via search_episodes) and prints
 * the first lines of each response. Exits the process with code 0 when done.
 *
 * @returns {Promise<void>}
 * @throws {Error} Rejects if the SSE stream reports an error before the
 *   'endpoint' event arrives, or if a tool call fails.
 */
async function test() {
  console.log('Testing fuzzy search via HTTP/SSE MCP\n');

  // Connect to SSE
  sse = new EventSource(SSE_ENDPOINT);

  await new Promise((resolve, reject) => {
    // Without an error path this promise would stay pending forever when the
    // server is down or unreachable, hanging the whole script.
    const onConnectError = (event) => {
      sse.close();
      reject(new Error(`Could not open SSE stream at ${SSE_ENDPOINT}: ${event?.message ?? 'connection error'}`));
    };
    sse.addEventListener('error', onConnectError);

    sse.addEventListener('endpoint', (event) => {
      // Connected: later stream errors are no longer connection failures.
      sse.removeEventListener('error', onConnectError);
      // The event data is a (possibly relative) URL carrying the session id.
      const url = new URL(event.data, SERVER_URL);
      connectionId = url.searchParams.get('sessionId');
      console.log(`Connected with session ID: ${connectionId}\n`);
      resolve();
    });
  });

  // Short pause before the first request — presumably lets the server finish
  // registering the session; TODO confirm whether it is still needed.
  await new Promise(resolve => setTimeout(resolve, 500));

  // Test 1: Search for host with typo
  console.log('=== Test 1: Fuzzy Host Search ===');
  console.log('Searching for host: "klattu" (typo for Klaatu)\n');

  const hostResult = await sendMessage('tools/call', {
    name: 'get_host_info',
    arguments: {
      hostName: 'klattu'
    }
  });

  // Only the first 8 lines are shown to keep the output short.
  const hostText = hostResult.content[0].text;
  const hostLines = hostText.split('\n').slice(0, 8);
  console.log(hostLines.join('\n'));
  console.log('');

  // Test 2: Search episodes with typo
  console.log('=== Test 2: Fuzzy Episode Search ===');
  console.log('Searching for episodes: "pythoon" (typo for python)\n');

  const episodeResult = await sendMessage('tools/call', {
    name: 'search_episodes',
    arguments: {
      query: 'pythoon',
      limit: 2
    }
  });

  const episodeText = episodeResult.content[0].text;
  // Extract just the first episode header
  const firstEpisode = episodeText.split('\n---\n')[0];
  const episodeLines = firstEpisode.split('\n').slice(0, 10);
  console.log(episodeLines.join('\n'));
  console.log('');

  console.log('✅ HTTP/SSE fuzzy search tests completed!\n');

  sse.close();
  process.exit(0);
}
// Script entry point: run the smoke test; any failure is reported on stderr
// and turned into a non-zero exit code.
test().catch((failure) => {
  console.error('Error:', failure);
  process.exit(1);
});

82
test-fuzzy-search.js Normal file
View File

@@ -0,0 +1,82 @@
#!/usr/bin/env node
/**
* Test script for fuzzy search functionality
* Tests both episode and host fuzzy matching
*/
import HPRDataLoader from './data-loader.js';
// Load the HPR dataset once up front; every scenario below queries this loader.
console.log('Loading HPR data...\n');
const dataLoader = new HPRDataLoader();
await dataLoader.load();
console.log('Data loaded!\n');

/** Print the banner and the query description that open a scenario. */
const openScenario = (banner, queryLine) => {
  console.log(banner);
  console.log(queryLine);
};

/** Print the blank line that closes a scenario. */
const closeScenario = () => {
  console.log('');
};

/** Zero-padded episode label, e.g. "HPR0042". */
const episodeLabel = (episode) => `HPR${String(episode.id).padStart(4, '0')}`;

/** ", distance: N" when the result has a truthy matchDistance, otherwise "". */
const distanceSuffix = (match) =>
  match.matchDistance ? ', distance: ' + match.matchDistance : '';

/** Print an episode with its title cut to 60 characters plus match details. */
const printTruncatedEpisode = (episode) => {
  const shortTitle = episode.title.substring(0, 60);
  console.log(` - ${episodeLabel(episode)}: ${shortTitle}... [matchType: ${episode.matchType}${distanceSuffix(episode)}]`);
};

// Test 1: Exact host match (should use exact matching)
openScenario('=== Test 1: Exact Host Match ===', 'Query: "ken"\n');
const kenHosts = dataLoader.searchHosts('ken');
console.log(`Found ${kenHosts.length} results (exact match)`);
for (const host of kenHosts.slice(0, 3)) {
  console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}]`);
}
closeScenario();

// Test 2: Fuzzy host match with typo
openScenario('=== Test 2: Fuzzy Host Match (typo) ===', 'Query: "klattu" (should match "klaatu")\n');
const klattuHosts = dataLoader.searchHosts('klattu');
console.log(`Found ${klattuHosts.length} results`);
for (const host of klattuHosts) {
  console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}, distance: ${host.matchDistance}]`);
}
closeScenario();

// Test 3: Another fuzzy host match
openScenario('=== Test 3: Fuzzy Host Match (another typo) ===', 'Query: "dav" (should find hosts like "Dave")\n');
const davHosts = dataLoader.searchHosts('dav');
console.log(`Found ${davHosts.length} results`);
for (const host of davHosts.slice(0, 5)) {
  console.log(` - ${host.host} (${host.hostid}) [matchType: ${host.matchType}${distanceSuffix(host)}]`);
}
closeScenario();

// Test 4: Exact episode search
openScenario('=== Test 4: Exact Episode Match ===', 'Query: "linux" (exact match in title/summary)\n');
const linuxEpisodes = dataLoader.searchEpisodes('linux', { limit: 3 });
console.log(`Found ${linuxEpisodes.length} results`);
for (const episode of linuxEpisodes) {
  console.log(` - ${episodeLabel(episode)}: ${episode.title} [matchType: ${episode.matchType}]`);
}
closeScenario();

// Test 5: Fuzzy episode search with typo
openScenario('=== Test 5: Fuzzy Episode Match (typo) ===', 'Query: "linx" (should match episodes with "linux" in title)\n');
const linxEpisodes = dataLoader.searchEpisodes('linx', { limit: 3 });
console.log(`Found ${linxEpisodes.length} results`);
for (const episode of linxEpisodes) {
  printTruncatedEpisode(episode);
}
closeScenario();

// Test 6: Another fuzzy episode search
openScenario('=== Test 6: Fuzzy Episode Match (misspelling) ===', 'Query: "pythoon" (should match "python")\n');
const pythoonEpisodes = dataLoader.searchEpisodes('pythoon', { limit: 3 });
console.log(`Found ${pythoonEpisodes.length} results`);
for (const episode of pythoonEpisodes) {
  printTruncatedEpisode(episode);
}
closeScenario();

// Test 7: No match (distance too large)
openScenario('=== Test 7: No Match (distance too large) ===', 'Query: "xyzabc" (should find nothing)\n');
const unmatchedHosts = dataLoader.searchHosts('xyzabc');
console.log(`Found ${unmatchedHosts.length} results`);
closeScenario();

console.log('✅ All fuzzy search tests completed!');