From 3d5acde487867e59de48abded26d42e963ba07d9 Mon Sep 17 00:00:00 2001
From: Josh Knapp
Date: Wed, 16 Apr 2025 19:50:33 -0700
Subject: [PATCH] Updating for a new version and adding the updated README

---
 README.md            | 162 ++++++++++++++++++++++++++--
 owui-site-crawler.py | 248 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 376 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 35311ea..a1bb4a7 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,153 @@
-owui-site-crawler.py --token "your_api_token" \
-  --base-url "http://localhost:3000" \
-  --website-url "https://example.com" \
-  --kb-name "Example Website KB" \
-  --kb-purpose "Documentation and information from example.com" \
-  --depth 3 \
-  --delay 1.5 \
-  --exclude "/login" \
-  --exclude "/admin"
\ No newline at end of file
+
+# Web to Knowledge Base for Open WebUI
+
+A Python utility script that crawls websites, converts pages to Markdown or preserves JSON data, and uploads them to an Open WebUI knowledge base.
+
+## Features
+
+- Crawls websites to a specified depth while respecting domain boundaries
+- Converts HTML content to Markdown using MarkItDown
+- Preserves JSON content in its original format
+- Creates or updates knowledge bases in Open WebUI
+- Handles existing files through update or skip options
+- Customizable crawling with exclude patterns
+- Detailed logging of the process
+
+## Installation
+
+### Prerequisites
+
+- Python 3.10+
+- Open WebUI instance with API access
+
+### Dependencies
+
+Install the required packages:
+
+```bash
+pip install requests beautifulsoup4 markitdown
+```
+
+### Getting the Script
+
+Download the script and make it executable:
+
+```bash
+curl -O https://raw.githubusercontent.com/yourusername/open-webui-site-crawler/main/owui-site-crawler.py
+chmod +x owui-site-crawler.py
+```
+
+## Usage
+
+Basic usage:
+
+```bash
+python owui-site-crawler.py --token "YOUR_API_TOKEN" \
+    --base-url "https://your-openwebui-instance.com" \
+    --website-url "https://website-to-crawl.com" \
+    --kb-name "My Website Knowledge Base"
+```
+
+### Command Line Arguments
+
+| Argument | Short | Description | Required | Default |
+|----------|-------|-------------|----------|---------|
+| `--token` | `-t` | Your Open WebUI API token | Yes | - |
+| `--base-url` | `-u` | Base URL of your Open WebUI instance | Yes | - |
+| `--website-url` | `-w` | URL of the website to crawl | Yes | - |
+| `--kb-name` | `-n` | Name for the knowledge base | Yes | - |
+| `--kb-purpose` | `-p` | Purpose description for the knowledge base | No | None |
+| `--depth` | `-d` | Maximum depth to crawl | No | 2 |
+| `--delay` | | Delay between requests in seconds | No | 1.0 |
+| `--exclude` | `-e` | URL patterns to exclude from crawling (can be specified multiple times) | No | None |
+| `--include-json` | `-j` | Include JSON files and API endpoints | No | False |
+| `--update` | | Update existing files in the knowledge base | No | False |
+| `--skip-existing` | | Skip existing files in the knowledge base | No | False |
+
+## Examples
+
+### Basic Crawl with Limited Depth
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://docs.example.com" \
+    -n "Example Docs KB" \
+    -d 3
+```
+
+### Excluding Certain URL Patterns
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://blog.example.com" \
+    -n "Example Blog KB" \
+    -e "/tags/" \
+    -e "/author/" \
+    -e "/search/"
+```
+
+### Including JSON Content
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://api-docs.example.com" \
+    -n "Example API Documentation" \
+    -j
+```
+
+### Updating an Existing Knowledge Base
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://knowledge-center.example.com" \
+    -n "Knowledge Center" \
+    --update
+```
+
+### Skipping Existing Files
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://docs.example.com" \
+    -n "Documentation KB" \
+    --skip-existing
+```
+
+## How It Works
+
+1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain.
+
+2. **Content Processing**:
+   - HTML content is converted to Markdown using MarkItDown
+   - JSON content is preserved in its native format (when `--include-json` is used)
+
+3. **Knowledge Base Management**:
+   - Checks if a knowledge base with the specified name already exists
+   - Creates a new knowledge base if none exists
+
+4. **File Upload**:
+   - Manages existing files based on the `--update` or `--skip-existing` flags
+   - Uploads new files to the knowledge base
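+
+The script's upload path can also be exercised directly against the same `/api/v1` endpoints it calls. Below is a minimal, illustrative sketch (not part of the script) that converts one page and attaches it to an existing knowledge base; the base URL, token, knowledge base ID, page URL, and filename are placeholders:
+
+```python
+import requests
+from markitdown import MarkItDown
+
+BASE_URL = "https://your-openwebui-instance.com"  # placeholder
+TOKEN = "YOUR_API_TOKEN"                          # placeholder
+KB_ID = "your-knowledge-base-id"                  # placeholder
+PAGE_URL = "https://docs.example.com/getting-started"
+
+headers = {"Authorization": f"Bearer {TOKEN}"}
+
+# Convert the page to Markdown (MarkItDown can fetch a URL directly)
+markdown = MarkItDown().convert(PAGE_URL).text_content
+
+# Upload the file content, then attach the new file to the knowledge base
+upload = requests.post(
+    f"{BASE_URL}/api/v1/files/",
+    headers=headers,
+    files={"file": ("docs_example_com_getting-started.md", markdown.encode(), "text/markdown")},
+)
+upload.raise_for_status()
+
+attach = requests.post(
+    f"{BASE_URL}/api/v1/knowledge/{KB_ID}/file/add",
+    headers=headers,
+    json={"file_id": upload.json()["id"]},
+)
+attach.raise_for_status()
+```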
-t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://api-docs.example.com" \ + -n "Example API Documentation" \ + -j +``` + +### Updating an Existing Knowledge Base + +```bash +python web_to_kb.py -t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://knowledge-center.example.com" \ + -n "Knowledge Center" \ + --update +``` + +### Skipping Existing Files + +```bash +python web_to_kb.py -t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://docs.example.com" \ + -n "Documentation KB" \ + --skip-existing +``` + +## How It Works + +1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain. + +2. **Content Processing**: + - HTML content is converted to Markdown using MarkItDown + - JSON content is preserved in its native format (when `--include-json` is used) + +3. **Knowledge Base Management**: + - Checks if a knowledge base with the specified name already exists + - Creates a new knowledge base if none exists + +4. **File Upload**: + - Manages existing files based on the `--update` or `--skip-existing` flags + - Uploads new files to the knowledge base + +## Notes + +- The script respects domain boundaries and will not crawl external links +- URLs are used to generate filenames, with special characters replaced +- Add a delay between requests to be respectful of websites' resources +- File updates are performed by uploading a new file and removing the old one + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Acknowledgments + +- [MarkItDown](https://github.com/microsoft/markitdown) for HTML to Markdown conversion [1] +- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing +- [Requests](https://requests.readthedocs.io/) for HTTP requests +- [Open WebUI](https://github.com/open-webui/open-webui) for the knowledge base API \ No newline at end of file diff --git a/owui-site-crawler.py b/owui-site-crawler.py index 8d1e398..937be55 100644 --- a/owui-site-crawler.py +++ b/owui-site-crawler.py @@ -9,7 +9,6 @@ from bs4 import BeautifulSoup from markitdown import MarkItDown import json import logging -import contextlib # Configure logging logging.basicConfig(level=logging.INFO, @@ -102,6 +101,51 @@ class OpenWebUIUploader: "Accept": "application/json" }) + def get_knowledge_bases(self): + """Get a list of all knowledge bases.""" + endpoint = f"{self.base_url}/api/v1/knowledge/list" + + try: + response = self.session.get(endpoint) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + logger.error(f"Error getting knowledge bases: {e}") + raise + + def get_knowledge_base_by_name(self, name): + """Check if a knowledge base with the given name exists, and return its details if it does.""" + try: + kbs = self.get_knowledge_bases() + for kb in kbs: + if kb.get('name') == name: + return kb + return None + except Exception as e: + logger.error(f"Error checking for existing knowledge base: {e}") + return None + + def get_knowledge_base_files(self, kb_id): + """Get all files in a knowledge base.""" + endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}" + + try: + response = self.session.get(endpoint) + response.raise_for_status() + kb_data = response.json() + return kb_data.get('files', []) + except requests.exceptions.RequestException as e: + logger.error(f"Error getting knowledge base files: 
{e}") + return [] + + def file_exists_in_kb(self, kb_id, filename): + """Check if a file with the given name exists in the knowledge base.""" + files = self.get_knowledge_base_files(kb_id) + for file in files: + if 'meta' in file and 'name' in file['meta'] and file['meta']['name'] == filename: + return file['id'] + return None + def create_knowledge_base(self, name, purpose=None): """Create a new knowledge base in OpenWebUI.""" endpoint = f"{self.base_url}/api/v1/knowledge/create" @@ -119,7 +163,7 @@ class OpenWebUIUploader: logger.error(f"Error creating knowledge base: {e}") raise - def upload_file(self, kb_id, content, filename): + def upload_file(self, kb_id, content, filename, content_type="text/markdown"): """Upload a file to the knowledge base.""" upload_endpoint = f"{self.base_url}/api/v1/files/" @@ -131,7 +175,7 @@ class OpenWebUIUploader: try: # Use context manager for file upload request with open(temp_file_path, 'rb') as f: - files = {'file': (filename, f, 'text/markdown')} + files = {'file': (filename, f, content_type)} with self.session.post( upload_endpoint, headers={"Authorization": f"Bearer {self.api_token}"}, @@ -161,6 +205,61 @@ class OpenWebUIUploader: if os.path.exists(temp_file_path): os.unlink(temp_file_path) + def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"): + """Update an existing file in the knowledge base.""" + # First upload the new version of the file + upload_endpoint = f"{self.base_url}/api/v1/files/" + + # Create a temporary file for the upload + temp_file_path = f"/tmp/{filename}" + with open(temp_file_path, 'w') as f: + f.write(content) + + try: + # Upload the new file + with open(temp_file_path, 'rb') as f: + files = {'file': (filename, f, content_type)} + with self.session.post( + upload_endpoint, + headers={"Authorization": f"Bearer {self.api_token}"}, + files=files + ) as upload_response: + upload_response.raise_for_status() + new_file_id = upload_response.json().get('id') + + # Remove the old file from the knowledge base + remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove" + with self.session.post( + remove_endpoint, + headers={ + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json" + }, + json={'file_id': existing_file_id} + ) as remove_response: + remove_response.raise_for_status() + + # Add the new file to the knowledge base + add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add" + with self.session.post( + add_endpoint, + headers={ + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json" + }, + json={'file_id': new_file_id} + ) as add_response: + add_response.raise_for_status() + return add_response.json() + + except requests.exceptions.RequestException as e: + logger.error(f"Error updating file: {e}") + raise + finally: + # Clean up the temporary file + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + def close(self): """Close the requests session.""" if hasattr(self, 'session') and self.session: @@ -187,6 +286,15 @@ def convert_to_markdown(html_content, url): return f"# {url}\n\nError converting content: {str(e)}" +def is_valid_json(content): + """Check if content is valid JSON.""" + try: + json.loads(content) + return True + except (ValueError, TypeError): + return False + + def main(): parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base') parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token') @@ -197,9 +305,17 
     def close(self):
         """Close the requests session."""
         if hasattr(self, 'session') and self.session:
@@ -187,6 +286,15 @@
         return f"# {url}\n\nError converting content: {str(e)}"
 
 
+def is_valid_json(content):
+    """Check if content is valid JSON."""
+    try:
+        json.loads(content)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
 def main():
     parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
     parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
@@ -197,9 +305,17 @@ def main():
     parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
     parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
     parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
+    parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
+    parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
+    parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
 
     args = parser.parse_args()
 
+    # Check for conflicting options
+    if args.update and args.skip_existing:
+        logger.error("Cannot use both --update and --skip-existing flags at the same time")
+        return 1
+
     # Initialize resources that need to be closed
     scraper = None
     uploader = None
@@ -222,46 +338,128 @@ def main():
         logger.error("No pages were crawled. Exiting.")
         return 1
 
-        # 2. Convert HTML pages to Markdown
-        logger.info("Converting HTML pages to Markdown")
-        markdown_pages = {}
-        for url, html in crawled_pages.items():
-            markdown_content = convert_to_markdown(html, url)
-            # Create a safe filename from the URL
+        # 2. Process content (convert HTML to Markdown or handle JSON)
+        logger.info("Processing crawled content")
+        processed_files = []
+
+        for url, html_content in crawled_pages.items():
+            # For JSON content, preserve it as JSON
+            if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
+                if is_valid_json(html_content):
+                    try:
+                        json_obj = json.loads(html_content)
+                        pretty_json = json.dumps(json_obj, indent=2)
+
+                        # Create filename for JSON file
+                        parsed_url = urlparse(url)
+                        filename = f"{parsed_url.netloc}{parsed_url.path}"
+                        filename = filename.replace('/', '_').replace('.', '_')
+                        if not filename.endswith('.json'):
+                            filename = f"{filename}.json"
+
+                        processed_files.append({
+                            'content': pretty_json,
+                            'content_type': 'application/json',
+                            'filename': filename,
+                            'url': url
+                        })
+                        logger.info(f"Processed JSON content from {url}")
+                        continue
+                    except ValueError:
+                        # Not valid JSON despite the extension, fall back to Markdown
+                        pass
+
+            # For all other content, convert to Markdown
+            markdown_content = convert_to_markdown(html_content, url)
+
+            # Create a safe filename
             parsed_url = urlparse(url)
             filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
             if not filename.endswith('.md'):
                 filename = f"{filename}.md"
-            markdown_pages[filename] = markdown_content
+
+            processed_files.append({
+                'content': markdown_content,
+                'content_type': 'text/markdown',
+                'filename': filename,
+                'url': url
+            })
+
+        logger.info(f"Processed {len(processed_files)} files")
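+
+        # Filename derivation, traced through the replacements above
+        # (illustrative URLs):
+        #   https://docs.example.com/guide/intro  -> docs_example_com_guide_intro.md
+        #   https://api.example.com/v1/users.json -> api_example_com_v1_users_json.json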
 
         # 3. Upload to Open WebUI
-        logger.info(f"Creating knowledge base '{args.kb_name}' in Open WebUI")
+        # First check if a knowledge base with the specified name already exists
         uploader = OpenWebUIUploader(args.base_url, args.token)
-        kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
-        kb_id = kb.get('id')
-        if not kb_id:
-            logger.error("Failed to get knowledge base ID")
-            return 1
+        existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
 
-        logger.info(f"Created knowledge base with ID: {kb_id}")
-
-        # 4. Upload each markdown page
+        if existing_kb:
+            kb_id = existing_kb.get('id')
+            logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
+        else:
+            # Create a new knowledge base if none exists with that name
+            logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
+            kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
+            kb_id = kb.get('id')
+            if not kb_id:
+                logger.error("Failed to get knowledge base ID")
+                return 1
+            logger.info(f"Created knowledge base with ID: {kb_id}")
+
+        # 4. Upload each file
         success_count = 0
+        skip_count = 0
+        update_count = 0
         error_count = 0
 
-        for filename, content in markdown_pages.items():
+        for file_info in processed_files:
             try:
-                logger.info(f"Uploading {filename}")
-                uploader.upload_file(kb_id, content, filename)
-                success_count += 1
+                filename = file_info['filename']
+                existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
+
+                # Handle existing files based on options
+                if existing_file_id:
+                    if args.skip_existing:
+                        logger.info(f"Skipping existing file: {filename}")
+                        skip_count += 1
+                        continue
+                    elif args.update:
+                        logger.info(f"Updating existing file: {filename}")
+                        uploader.update_file(
+                            kb_id,
+                            existing_file_id,
+                            file_info['content'],
+                            filename,
+                            file_info['content_type']
+                        )
+                        update_count += 1
+                    else:
+                        # Default behavior: add as new file
+                        logger.info(f"Adding duplicate file (existing file will remain): {filename}")
+                        uploader.upload_file(
+                            kb_id,
+                            file_info['content'],
+                            filename,
+                            file_info['content_type']
+                        )
+                        success_count += 1
+                else:
+                    # New file
+                    logger.info(f"Uploading new file: {filename}")
+                    uploader.upload_file(
+                        kb_id,
+                        file_info['content'],
+                        filename,
+                        file_info['content_type']
+                    )
+                    success_count += 1
+
                 # Add a small delay between uploads
                 time.sleep(0.5)
             except Exception as e:
-                logger.error(f"Failed to upload {filename}: {e}")
+                logger.error(f"Failed to process {file_info['filename']}: {e}")
                 error_count += 1
 
-        logger.info(f"Upload complete: {success_count} files uploaded successfully, {error_count} errors")
+        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
 
         return 0