use fuzzy matching for host and episode searches
This commit is contained in:
45
README.md
45
README.md
@@ -11,9 +11,11 @@ Hacker Public Radio is a community-driven podcast where hosts contribute content
|
||||
This MCP server provides:
|
||||
|
||||
- **Episode Search**: Search through thousands of HPR episodes by title, summary, tags, or host notes
|
||||
- **Transcript Search**: Full-text search across all episode transcripts
|
||||
- **Fuzzy Episode Matching**: Automatically handles typos and misspellings in episode searches (e.g., "linx" finds "linux", "pythoon" finds "python")
|
||||
- **Transcript Search**: Full-text search across all episode transcripts with flexible matching modes
|
||||
- **Episode Details**: Get complete information about any episode including transcript and comments
|
||||
- **Host Information**: Look up hosts and see all their contributions
|
||||
- **Fuzzy Host Matching**: Handles host name variations and typos (e.g., "klattu" finds "Klaatu")
|
||||
- **Series Browsing**: Explore mini-series of related episodes
|
||||
- **Statistics**: View overall HPR statistics and recent episodes
|
||||
|
||||
@@ -189,6 +191,45 @@ Get information about a series and all its episodes.
|
||||
Get information about series 4 (Databases series)
|
||||
```
|
||||
|
||||
## Fuzzy Matching
|
||||
|
||||
The server includes intelligent fuzzy matching for episode and host searches to handle typos and misspellings.
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Exact Match First**: The server always tries exact substring matching first for speed
|
||||
2. **Fuzzy Fallback**: If no exact matches are found, it falls back to fuzzy matching using Levenshtein distance
|
||||
3. **Match Indicators**: Results include indicators showing whether they're exact or fuzzy matches
|
||||
|
||||
### Examples
|
||||
|
||||
**Host Search:**
|
||||
- Query: `"klattu"` → Finds: **Klaatu** *(fuzzy match, distance: 1)*
|
||||
- Query: `"ken"` → Finds: **Ken Fallon** *(exact match)*
|
||||
|
||||
**Episode Search:**
|
||||
- Query: `"pythoon"` → Finds episodes with **python** in the title *(fuzzy match, distance: 1)*
|
||||
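- Query: `"linx"` → Finds episodes with **linux** in the title *(fuzzy match, distance: 1)* — but only when no episode contains the literal text `linx` in its title, summary, tags, or notes; if one does, those exact matches are returned instead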
|
||||
|
||||
### Distance Thresholds
|
||||
|
||||
- **Hosts**: Maximum edit distance of 2 (handles 1-2 typos), measured between the query and the host's full name or email address
|
||||
- **Episodes**: Maximum edit distance of 3, measured between the query and each individual word of the episode title (more lenient than for hosts)
|
||||
|
||||
### What the AI Agent Sees
|
||||
|
||||
When fuzzy matching is used, results include:
|
||||
- `matchType: 'exact'` or `matchType: 'fuzzy'`
|
||||
- `matchDistance: N` (for fuzzy matches, indicating how many character edits were needed)
|
||||
|
||||
This allows AI agents to provide context to users, such as: *"I found results for 'klaatu' (you typed 'klattu')"*
|
||||
|
||||
### Technical Details
|
||||
|
||||
The fuzzy matching uses the **Levenshtein distance algorithm**, which counts the minimum number of single-character edits (insertions, deletions, substitutions) needed to change one string into another.
|
||||
|
||||
**Note**: Transcript search uses regex-based matching and does not use fuzzy matching, as the flexible regex patterns already handle many variations.
|
||||
|
||||
## Available Resources
|
||||
|
||||
### `hpr://stats`
|
||||
@@ -314,7 +355,7 @@ The Hacker Public Radio content itself is released under various Creative Common
|
||||
|
||||
Contributions are welcome! This server can be extended with:
|
||||
|
||||
- Advanced search features (fuzzy matching, relevance ranking)
|
||||
- Advanced search features (relevance ranking, semantic search)
|
||||
- Tag cloud generation
|
||||
- Episode recommendations
|
||||
- Audio file access
|
||||
|
||||
156
data-loader.js
156
data-loader.js
@@ -9,6 +9,45 @@ function escapeRegExp(string) {
|
||||
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||
}
|
||||
|
||||
/**
 * Calculate the Levenshtein (edit) distance between two strings.
 *
 * The distance is the minimum number of single-character edits (insertions,
 * deletions, substitutions) needed to change one string into the other.
 * Characters are compared by Unicode code point, so a character outside the
 * Basic Multilingual Plane (for example an emoji) counts as one edit rather
 * than as two UTF-16 code units.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The edit distance; 0 when the strings are identical.
 */
function levenshteinDistance(a, b) {
  // Spreading a string iterates by code point, keeping surrogate pairs intact.
  const source = [...a];
  const target = [...b];

  if (source.length === 0) return target.length;
  if (target.length === 0) return source.length;

  // Each row depends only on the row above it, so a single previous row is
  // kept instead of the full (target + 1) x (source + 1) matrix.
  let previous = Array.from({ length: source.length + 1 }, (_, j) => j);

  for (let i = 1; i <= target.length; i++) {
    const current = [i];

    for (let j = 1; j <= source.length; j++) {
      current[j] =
        target[i - 1] === source[j - 1]
          ? previous[j - 1]
          : Math.min(
              previous[j - 1], // substitution
              current[j - 1], // insertion
              previous[j] // deletion
            ) + 1;
    }

    previous = current;
  }

  return previous[source.length];
}
|
||||
|
||||
class HPRDataLoader {
|
||||
constructor() {
|
||||
this.episodes = [];
|
||||
@@ -135,7 +174,8 @@ class HPRDataLoader {
|
||||
}
|
||||
|
||||
/**
|
||||
* Search episodes by keyword in title, summary, or tags
|
||||
* Search episodes by keyword in title, summary, or tags with fuzzy matching fallback
|
||||
* Returns episodes with matchType indicator ('exact' or 'fuzzy')
|
||||
*/
|
||||
searchEpisodes(query, options = {}) {
|
||||
const {
|
||||
@@ -144,37 +184,78 @@ class HPRDataLoader {
|
||||
seriesId = null,
|
||||
tag = null,
|
||||
fromDate = null,
|
||||
toDate = null
|
||||
toDate = null,
|
||||
maxDistance = 3 // More lenient for longer episode titles
|
||||
} = options;
|
||||
|
||||
const queryLower = query.toLowerCase();
|
||||
|
||||
// Helper to check if episode matches filters (excluding query)
|
||||
const matchesFilters = (ep) => {
|
||||
const matchesHost = !hostId || ep.hostid === hostId;
|
||||
const matchesSeries = seriesId === null || ep.series === seriesId;
|
||||
const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase());
|
||||
const matchesDateRange = (!fromDate || ep.date >= fromDate) &&
|
||||
(!toDate || ep.date <= toDate);
|
||||
return matchesHost && matchesSeries && matchesTag && matchesDateRange;
|
||||
};
|
||||
|
||||
// Try exact substring match first (fast path)
|
||||
let results = this.episodes.filter(ep => {
|
||||
// Basic text search
|
||||
const matchesQuery = !query ||
|
||||
ep.title.toLowerCase().includes(queryLower) ||
|
||||
ep.summary.toLowerCase().includes(queryLower) ||
|
||||
ep.tags.toLowerCase().includes(queryLower) ||
|
||||
ep.notes.toLowerCase().includes(queryLower);
|
||||
|
||||
// Filter by host
|
||||
const matchesHost = !hostId || ep.hostid === hostId;
|
||||
return matchesQuery && matchesFilters(ep);
|
||||
}).map(ep => ({
|
||||
...ep,
|
||||
matchType: 'exact'
|
||||
}));
|
||||
|
||||
// Filter by series
|
||||
const matchesSeries = seriesId === null || ep.series === seriesId;
|
||||
// If no exact matches and we have a query, try fuzzy match on title
|
||||
if (results.length === 0 && query && query.trim().length > 0) {
|
||||
const fuzzyResults = this.episodes
|
||||
.filter(matchesFilters)
|
||||
.map(ep => {
|
||||
// Check if any word in the title is close to the query
|
||||
const titleWords = ep.title.toLowerCase().split(/\s+/);
|
||||
let minDistance = Infinity;
|
||||
|
||||
// Filter by tag
|
||||
const matchesTag = !tag || ep.tags.toLowerCase().includes(tag.toLowerCase());
|
||||
for (const word of titleWords) {
|
||||
const distance = levenshteinDistance(queryLower, word);
|
||||
if (distance < minDistance) {
|
||||
minDistance = distance;
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by date range
|
||||
const matchesDateRange = (!fromDate || ep.date >= fromDate) &&
|
||||
(!toDate || ep.date <= toDate);
|
||||
return {
|
||||
episode: ep,
|
||||
distance: minDistance
|
||||
};
|
||||
})
|
||||
.filter(result => result.distance <= maxDistance)
|
||||
.sort((a, b) => a.distance - b.distance)
|
||||
.map(result => ({
|
||||
...result.episode,
|
||||
matchType: 'fuzzy',
|
||||
matchDistance: result.distance
|
||||
}));
|
||||
|
||||
return matchesQuery && matchesHost && matchesSeries && matchesTag && matchesDateRange;
|
||||
results = fuzzyResults;
|
||||
}
|
||||
|
||||
// Sort by date (newest first), maintaining match quality
|
||||
results.sort((a, b) => {
|
||||
// If both are fuzzy matches, sort by distance first, then date
|
||||
if (a.matchType === 'fuzzy' && b.matchType === 'fuzzy') {
|
||||
const distDiff = (a.matchDistance || 0) - (b.matchDistance || 0);
|
||||
if (distDiff !== 0) return distDiff;
|
||||
}
|
||||
return b.date.localeCompare(a.date);
|
||||
});
|
||||
|
||||
// Sort by date (newest first)
|
||||
results.sort((a, b) => b.date.localeCompare(a.date));
|
||||
|
||||
return results.slice(0, limit);
|
||||
}
|
||||
|
||||
@@ -329,14 +410,49 @@ class HPRDataLoader {
|
||||
}
|
||||
|
||||
/**
|
||||
* Search hosts by name or email
|
||||
* Search hosts by name or email with fuzzy matching fallback
|
||||
* Returns hosts with matchType indicator ('exact' or 'fuzzy')
|
||||
*/
|
||||
searchHosts(query) {
|
||||
searchHosts(query, options = {}) {
|
||||
const { maxDistance = 2 } = options;
|
||||
const queryLower = query.toLowerCase();
|
||||
return this.hosts.filter(host =>
|
||||
|
||||
// Try exact substring match first (fast path)
|
||||
const exactMatches = this.hosts.filter(host =>
|
||||
host.host.toLowerCase().includes(queryLower) ||
|
||||
host.email.toLowerCase().includes(queryLower)
|
||||
);
|
||||
).map(host => ({
|
||||
...host,
|
||||
matchType: 'exact'
|
||||
}));
|
||||
|
||||
if (exactMatches.length > 0) {
|
||||
return exactMatches;
|
||||
}
|
||||
|
||||
// Fall back to fuzzy match if no exact matches
|
||||
const fuzzyMatches = this.hosts
|
||||
.map(host => {
|
||||
const hostLower = host.host.toLowerCase();
|
||||
const emailLower = host.email.toLowerCase();
|
||||
const hostDistance = levenshteinDistance(queryLower, hostLower);
|
||||
const emailDistance = levenshteinDistance(queryLower, emailLower);
|
||||
const minDistance = Math.min(hostDistance, emailDistance);
|
||||
|
||||
return {
|
||||
host,
|
||||
distance: minDistance
|
||||
};
|
||||
})
|
||||
.filter(result => result.distance <= maxDistance)
|
||||
.sort((a, b) => a.distance - b.distance)
|
||||
.map(result => ({
|
||||
...result.host,
|
||||
matchType: 'fuzzy',
|
||||
matchDistance: result.distance
|
||||
}));
|
||||
|
||||
return fuzzyMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
18
index.js
18
index.js
@@ -45,7 +45,14 @@ function formatEpisode(episode, includeNotes = false) {
|
||||
const host = dataLoader.getHost(episode.hostid);
|
||||
const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null;
|
||||
|
||||
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
|
||||
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`;
|
||||
|
||||
// Add match type indicator for fuzzy matches
|
||||
if (episode.matchType === 'fuzzy') {
|
||||
result += ` *(fuzzy match, distance: ${episode.matchDistance})*`;
|
||||
}
|
||||
|
||||
result += `
|
||||
|
||||
**Date:** ${episode.date}
|
||||
**Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid})
|
||||
@@ -606,7 +613,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
||||
};
|
||||
}
|
||||
|
||||
let text = `# ${host.host}
|
||||
let text = `# ${host.host}`;
|
||||
|
||||
// Add match type indicator for fuzzy matches
|
||||
if (host.matchType === 'fuzzy') {
|
||||
text += ` *(fuzzy match, distance: ${host.matchDistance})*`;
|
||||
}
|
||||
|
||||
text += `
|
||||
|
||||
**Host ID:** ${host.hostid}
|
||||
**Email:** ${host.email}
|
||||
|
||||
@@ -139,7 +139,14 @@ function formatEpisode(episode, includeNotes = false) {
|
||||
const host = dataLoader.getHost(episode.hostid);
|
||||
const seriesInfo = episode.series !== 0 ? dataLoader.getSeries(episode.series) : null;
|
||||
|
||||
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}
|
||||
let result = `# HPR${String(episode.id).padStart(4, '0')}: ${episode.title}`;
|
||||
|
||||
// Add match type indicator for fuzzy matches
|
||||
if (episode.matchType === 'fuzzy') {
|
||||
result += ` *(fuzzy match, distance: ${episode.matchDistance})*`;
|
||||
}
|
||||
|
||||
result += `
|
||||
|
||||
**Date:** ${episode.date}
|
||||
**Host:** ${host?.host || 'Unknown'} (ID: ${episode.hostid})
|
||||
@@ -718,7 +725,14 @@ All content is contributed by the community, for the community.`,
|
||||
};
|
||||
}
|
||||
|
||||
let text = `# ${host.host}
|
||||
let text = `# ${host.host}`;
|
||||
|
||||
// Add match type indicator for fuzzy matches
|
||||
if (host.matchType === 'fuzzy') {
|
||||
text += ` *(fuzzy match, distance: ${host.matchDistance})*`;
|
||||
}
|
||||
|
||||
text += `
|
||||
|
||||
**Host ID:** ${host.hostid}
|
||||
**Email:** ${host.email}
|
||||
|
||||
113
test-fuzzy-http.js
Normal file
113
test-fuzzy-http.js
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Test fuzzy search via HTTP/SSE MCP Server
|
||||
*/
|
||||
|
||||
import EventSource from 'eventsource';
|
||||
import fetch from 'node-fetch';
|
||||
|
||||
const SERVER_URL = 'http://localhost:3000';
|
||||
const SSE_ENDPOINT = `${SERVER_URL}/sse`;
|
||||
const MESSAGE_ENDPOINT = `${SERVER_URL}/message`;
|
||||
|
||||
let requestId = 1;
|
||||
let sse;
|
||||
let connectionId = null;
|
||||
|
||||
/**
 * Send a JSON-RPC 2.0 request to the message endpoint and wait for the
 * response with the same id to arrive on the open SSE stream.
 *
 * @param {string} method - JSON-RPC method name.
 * @param {Object} [params={}] - JSON-RPC params object.
 * @returns {Promise<*>} Resolves with the `result` member of the matching
 *   response. Rejects if the POST fails, returns a non-2xx status, or the
 *   matching response carries a JSON-RPC `error`.
 */
async function sendMessage(method, params = {}) {
  const message = {
    jsonrpc: '2.0',
    id: requestId++,
    method,
    params
  };

  // A plain (non-async) executor: errors are routed through reject() rather
  // than being lost inside an async executor's own discarded promise.
  return new Promise((resolve, reject) => {
    const handler = (event) => {
      let data;
      try {
        data = JSON.parse(event.data);
      } catch {
        // Not JSON, so it cannot be the response we are waiting for.
        return;
      }

      if (data?.id !== message.id) {
        return;
      }

      sse.removeEventListener('message', handler);

      if (data.error) {
        reject(new Error(`JSON-RPC error ${data.error.code}: ${data.error.message}`));
      } else {
        resolve(data.result);
      }
    };

    // Listen before posting so a fast response cannot be missed.
    sse.addEventListener('message', handler);

    fetch(MESSAGE_ENDPOINT, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-connection-id': connectionId
      },
      body: JSON.stringify(message)
    })
      .then((response) => {
        if (!response.ok) {
          throw new Error(`POST ${MESSAGE_ENDPOINT} failed with HTTP ${response.status}`);
        }
      })
      .catch((err) => {
        // The request never reached the server (or was refused), so no
        // response will arrive: stop listening and surface the failure.
        sse.removeEventListener('message', handler);
        reject(err);
      });
  });
}
|
||||
|
||||
/**
 * Run the fuzzy-search smoke tests against a running HTTP/SSE MCP server.
 *
 * Opens the SSE stream, waits for the server's `endpoint` event to learn the
 * session id, then calls the `get_host_info` and `search_episodes` tools with
 * misspelled queries and prints the first lines of each response.
 * Exits the process with code 0 on completion.
 *
 * @returns {Promise<void>} Rejects if the SSE connection reports an error
 *   before the `endpoint` event arrives, or if a tool call fails.
 */
async function test() {
  console.log('Testing fuzzy search via HTTP/SSE MCP\n');

  // Connect to SSE
  sse = new EventSource(SSE_ENDPOINT);

  await new Promise((resolve, reject) => {
    // Without this, an unreachable server would leave the script waiting
    // forever for an `endpoint` event that never comes.
    const onError = (err) => {
      sse.close();
      reject(new Error(`Could not connect to ${SSE_ENDPOINT}: ${err?.message ?? 'connection error'}`));
    };
    sse.addEventListener('error', onError);

    sse.addEventListener('endpoint', (event) => {
      sse.removeEventListener('error', onError);
      const url = new URL(event.data, SERVER_URL);
      connectionId = url.searchParams.get('sessionId');
      console.log(`Connected with session ID: ${connectionId}\n`);
      resolve();
    });
  });

  // Short pause after connecting, before the first request is sent.
  await new Promise(resolve => setTimeout(resolve, 500));

  // Test 1: Search for host with typo
  console.log('=== Test 1: Fuzzy Host Search ===');
  console.log('Searching for host: "klattu" (typo for Klaatu)\n');

  const hostResult = await sendMessage('tools/call', {
    name: 'get_host_info',
    arguments: {
      hostName: 'klattu'
    }
  });

  const hostText = hostResult.content[0].text;
  const hostLines = hostText.split('\n').slice(0, 8);
  console.log(hostLines.join('\n'));
  console.log('');

  // Test 2: Search episodes with typo
  console.log('=== Test 2: Fuzzy Episode Search ===');
  console.log('Searching for episodes: "pythoon" (typo for python)\n');

  const episodeResult = await sendMessage('tools/call', {
    name: 'search_episodes',
    arguments: {
      query: 'pythoon',
      limit: 2
    }
  });

  const episodeText = episodeResult.content[0].text;
  // Extract just the first episode header
  const firstEpisode = episodeText.split('\n---\n')[0];
  const episodeLines = firstEpisode.split('\n').slice(0, 10);
  console.log(episodeLines.join('\n'));
  console.log('');

  console.log('✅ HTTP/SSE fuzzy search tests completed!\n');

  sse.close();
  process.exit(0);
}
|
||||
|
||||
test().catch(err => {
|
||||
console.error('Error:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
82
test-fuzzy-search.js
Normal file
82
test-fuzzy-search.js
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Test script for fuzzy search functionality
|
||||
* Tests both episode and host fuzzy matching
|
||||
*/
|
||||
|
||||
import HPRDataLoader from './data-loader.js';
|
||||
|
||||
console.log('Loading HPR data...\n');
const dataLoader = new HPRDataLoader();
await dataLoader.load();
console.log('Data loaded!\n');

// ---- Output helpers shared by the tests below ----

/** Print a test banner followed by the line describing the query. */
const announce = (banner, queryLine) => {
  console.log(banner);
  console.log(queryLine);
};

/** Bullet prefix for a host: name and id. */
const hostBullet = (host) => ` - ${host.host} (${host.hostid})`;

/** Bullet prefix for an episode: zero-padded HPR number. */
const episodeBullet = (ep) => ` - HPR${String(ep.id).padStart(4, '0')}`;

/** ", distance: N" when the result has a truthy matchDistance, else "". */
const distanceSuffix = (item) =>
  item.matchDistance ? `, distance: ${item.matchDistance}` : '';

/** Print episodes with titles cut to 60 characters and an optional distance. */
const printTruncatedEpisodes = (episodes) => {
  for (const ep of episodes) {
    console.log(`${episodeBullet(ep)}: ${ep.title.substring(0, 60)}... [matchType: ${ep.matchType}${distanceSuffix(ep)}]`);
  }
};

// Test 1: Exact host match (should use exact matching)
announce('=== Test 1: Exact Host Match ===', 'Query: "ken"\n');
const exactHosts = dataLoader.searchHosts('ken');
console.log(`Found ${exactHosts.length} results (exact match)`);
for (const host of exactHosts.slice(0, 3)) {
  console.log(`${hostBullet(host)} [matchType: ${host.matchType}]`);
}
console.log('');

// Test 2: Fuzzy host match with typo
announce('=== Test 2: Fuzzy Host Match (typo) ===', 'Query: "klattu" (should match "klaatu")\n');
const fuzzyHosts = dataLoader.searchHosts('klattu');
console.log(`Found ${fuzzyHosts.length} results`);
for (const host of fuzzyHosts) {
  console.log(`${hostBullet(host)} [matchType: ${host.matchType}, distance: ${host.matchDistance}]`);
}
console.log('');

// Test 3: Another fuzzy host match
announce('=== Test 3: Fuzzy Host Match (another typo) ===', 'Query: "dav" (should find hosts like "Dave")\n');
const fuzzyHosts2 = dataLoader.searchHosts('dav');
console.log(`Found ${fuzzyHosts2.length} results`);
for (const host of fuzzyHosts2.slice(0, 5)) {
  console.log(`${hostBullet(host)} [matchType: ${host.matchType}${distanceSuffix(host)}]`);
}
console.log('');

// Test 4: Exact episode search
announce('=== Test 4: Exact Episode Match ===', 'Query: "linux" (exact match in title/summary)\n');
const exactEpisodes = dataLoader.searchEpisodes('linux', { limit: 3 });
console.log(`Found ${exactEpisodes.length} results`);
for (const ep of exactEpisodes) {
  console.log(`${episodeBullet(ep)}: ${ep.title} [matchType: ${ep.matchType}]`);
}
console.log('');

// Test 5: Fuzzy episode search with typo
announce('=== Test 5: Fuzzy Episode Match (typo) ===', 'Query: "linx" (should match episodes with "linux" in title)\n');
const fuzzyEpisodes = dataLoader.searchEpisodes('linx', { limit: 3 });
console.log(`Found ${fuzzyEpisodes.length} results`);
printTruncatedEpisodes(fuzzyEpisodes);
console.log('');

// Test 6: Another fuzzy episode search
announce('=== Test 6: Fuzzy Episode Match (misspelling) ===', 'Query: "pythoon" (should match "python")\n');
const fuzzyEpisodes2 = dataLoader.searchEpisodes('pythoon', { limit: 3 });
console.log(`Found ${fuzzyEpisodes2.length} results`);
printTruncatedEpisodes(fuzzyEpisodes2);
console.log('');

// Test 7: No match (distance too large)
announce('=== Test 7: No Match (distance too large) ===', 'Query: "xyzabc" (should find nothing)\n');
const noMatch = dataLoader.searchHosts('xyzabc');
console.log(`Found ${noMatch.length} results`);
console.log('');

console.log('✅ All fuzzy search tests completed!');
|
||||
Reference in New Issue
Block a user