From 3d5acde487867e59de48abded26d42e963ba07d9 Mon Sep 17 00:00:00 2001
From: Josh Knapp
Date: Wed, 16 Apr 2025 19:50:33 -0700
Subject: [PATCH] Updating for a new version and adding the updated README

---
 README.md            | 162 ++++++++++++++++++++++++++--
 owui-site-crawler.py | 248 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 376 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 35311ea..a1bb4a7 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,153 @@
-owui-site-crawler.py --token "your_api_token" \
-  --base-url "http://localhost:3000" \
-  --website-url "https://example.com" \
-  --kb-name "Example Website KB" \
-  --kb-purpose "Documentation and information from example.com" \
-  --depth 3 \
-  --delay 1.5 \
-  --exclude "/login" \
-  --exclude "/admin"
\ No newline at end of file
+
+# Web to Knowledge Base for Open WebUI
+
+A Python utility script that crawls websites, converts pages to Markdown or preserves JSON data, and uploads them to an Open WebUI knowledge base.
+
+## Features
+
+- Crawls websites to a specified depth while respecting domain boundaries
+- Converts HTML content to Markdown using MarkItDown
+- Preserves JSON content in its original format
+- Creates or updates knowledge bases in Open WebUI
+- Handles existing files through update or skip options
+- Customizable crawling with exclude patterns
+- Detailed logging of the process
+
+## Installation
+
+### Prerequisites
+
+- Python 3.10+
+- Open WebUI instance with API access
+
+### Dependencies
+
+Install the required packages:
+
+```bash
+pip install requests beautifulsoup4 markitdown
+```
+
+### Getting the Script
+
+Download the script and make it executable:
+
+```bash
+curl -O https://raw.githubusercontent.com/yourusername/open-webui-site-crawler/main/owui-site-crawler.py
+chmod +x owui-site-crawler.py
+```
+
+## Usage
+
+Basic usage:
+
+```bash
+python owui-site-crawler.py --token "YOUR_API_TOKEN" \
+    --base-url "https://your-openwebui-instance.com" \
+    --website-url "https://website-to-crawl.com" \
+    --kb-name "My Website Knowledge Base"
+```
+
+### Command Line Arguments
+
+| Argument | Short | Description | Required | Default |
+|----------|-------|-------------|----------|---------|
+| `--token` | `-t` | Your Open WebUI API token | Yes | - |
+| `--base-url` | `-u` | Base URL of your Open WebUI instance | Yes | - |
+| `--website-url` | `-w` | URL of the website to crawl | Yes | - |
+| `--kb-name` | `-n` | Name for the knowledge base | Yes | - |
+| `--kb-purpose` | `-p` | Purpose description for the knowledge base | No | None |
+| `--depth` | `-d` | Maximum depth to crawl | No | 2 |
+| `--delay` | | Delay between requests in seconds | No | 1.0 |
+| `--exclude` | `-e` | URL patterns to exclude from crawling (can be specified multiple times) | No | None |
+| `--include-json` | `-j` | Include JSON files and API endpoints | No | False |
+| `--update` | | Update existing files in the knowledge base | No | False |
+| `--skip-existing` | | Skip existing files in the knowledge base | No | False |
+
+## Examples
+
+### Basic Crawl with Limited Depth
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://docs.example.com" \
+    -n "Example Docs KB" \
+    -d 3
+```
+
+### Excluding Certain URL Patterns
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://blog.example.com" \
+    -n "Example Blog KB" \
+    -e "/tags/" \
+    -e "/author/" \
+    -e "/search/"
+```
+
+### Including JSON Content
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://api-docs.example.com" \
+    -n "Example API Documentation" \
+    -j
+```
+
+### Updating an Existing Knowledge Base
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://knowledge-center.example.com" \
+    -n "Knowledge Center" \
+    --update
+```
+
+### Skipping Existing Files
+
+```bash
+python owui-site-crawler.py -t "YOUR_API_TOKEN" \
+    -u "https://your-openwebui-instance.com" \
+    -w "https://docs.example.com" \
+    -n "Documentation KB" \
+    --skip-existing
+```
+
+## How It Works
+
+1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain.
+
+2. **Content Processing**:
+   - HTML content is converted to Markdown using MarkItDown
+   - JSON content is preserved in its native format (when `--include-json` is used)
+
+3. **Knowledge Base Management**:
+   - Checks if a knowledge base with the specified name already exists
+   - Creates a new knowledge base if none exists
+
+4. **File Upload**:
+   - Manages existing files based on the `--update` or `--skip-existing` flags
+   - Uploads new files to the knowledge base
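+
+The script's upload path can also be exercised directly against the same `/api/v1` endpoints it calls. Below is a minimal, illustrative sketch (not part of the script) that converts one page and attaches it to an existing knowledge base; the base URL, token, knowledge base ID, page URL, and filename are placeholders:
+
+```python
+import requests
+from markitdown import MarkItDown
+
+BASE_URL = "https://your-openwebui-instance.com"  # placeholder
+TOKEN = "YOUR_API_TOKEN"                          # placeholder
+KB_ID = "your-knowledge-base-id"                  # placeholder
+PAGE_URL = "https://docs.example.com/getting-started"
+
+headers = {"Authorization": f"Bearer {TOKEN}"}
+
+# Convert the page to Markdown (MarkItDown can fetch a URL directly)
+markdown = MarkItDown().convert(PAGE_URL).text_content
+
+# Upload the file content, then attach the new file to the knowledge base
+upload = requests.post(
+    f"{BASE_URL}/api/v1/files/",
+    headers=headers,
+    files={"file": ("docs_example_com_getting-started.md", markdown.encode(), "text/markdown")},
+)
+upload.raise_for_status()
+
+attach = requests.post(
+    f"{BASE_URL}/api/v1/knowledge/{KB_ID}/file/add",
+    headers=headers,
+    json={"file_id": upload.json()["id"]},
+)
+attach.raise_for_status()
+```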
-t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://api-docs.example.com" \ + -n "Example API Documentation" \ + -j +``` + +### Updating an Existing Knowledge Base + +```bash +python web_to_kb.py -t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://knowledge-center.example.com" \ + -n "Knowledge Center" \ + --update +``` + +### Skipping Existing Files + +```bash +python web_to_kb.py -t "YOUR_API_TOKEN" \ + -u "https://your-openwebui-instance.com" \ + -w "https://docs.example.com" \ + -n "Documentation KB" \ + --skip-existing +``` + +## How It Works + +1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain. + +2. **Content Processing**: + - HTML content is converted to Markdown using MarkItDown + - JSON content is preserved in its native format (when `--include-json` is used) + +3. **Knowledge Base Management**: + - Checks if a knowledge base with the specified name already exists + - Creates a new knowledge base if none exists + +4. **File Upload**: + - Manages existing files based on the `--update` or `--skip-existing` flags + - Uploads new files to the knowledge base + +## Notes + +- The script respects domain boundaries and will not crawl external links +- URLs are used to generate filenames, with special characters replaced +- Add a delay between requests to be respectful of websites' resources +- File updates are performed by uploading a new file and removing the old one + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Acknowledgments + +- [MarkItDown](https://github.com/microsoft/markitdown) for HTML to Markdown conversion [1] +- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing +- [Requests](https://requests.readthedocs.io/) for HTTP requests +- [Open WebUI](https://github.com/open-webui/open-webui) for the knowledge base API \ No newline at end of file diff --git a/owui-site-crawler.py b/owui-site-crawler.py index 8d1e398..937be55 100644 --- a/owui-site-crawler.py +++ b/owui-site-crawler.py @@ -9,7 +9,6 @@ from bs4 import BeautifulSoup from markitdown import MarkItDown import json import logging -import contextlib # Configure logging logging.basicConfig(level=logging.INFO, @@ -102,6 +101,51 @@ class OpenWebUIUploader: "Accept": "application/json" }) + def get_knowledge_bases(self): + """Get a list of all knowledge bases.""" + endpoint = f"{self.base_url}/api/v1/knowledge/list" + + try: + response = self.session.get(endpoint) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + logger.error(f"Error getting knowledge bases: {e}") + raise + + def get_knowledge_base_by_name(self, name): + """Check if a knowledge base with the given name exists, and return its details if it does.""" + try: + kbs = self.get_knowledge_bases() + for kb in kbs: + if kb.get('name') == name: + return kb + return None + except Exception as e: + logger.error(f"Error checking for existing knowledge base: {e}") + return None + + def get_knowledge_base_files(self, kb_id): + """Get all files in a knowledge base.""" + endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}" + + try: + response = self.session.get(endpoint) + response.raise_for_status() + kb_data = response.json() + return kb_data.get('files', []) + except requests.exceptions.RequestException as e: + logger.error(f"Error getting knowledge base files: 
{e}") + return [] + + def file_exists_in_kb(self, kb_id, filename): + """Check if a file with the given name exists in the knowledge base.""" + files = self.get_knowledge_base_files(kb_id) + for file in files: + if 'meta' in file and 'name' in file['meta'] and file['meta']['name'] == filename: + return file['id'] + return None + def create_knowledge_base(self, name, purpose=None): """Create a new knowledge base in OpenWebUI.""" endpoint = f"{self.base_url}/api/v1/knowledge/create" @@ -119,7 +163,7 @@ class OpenWebUIUploader: logger.error(f"Error creating knowledge base: {e}") raise - def upload_file(self, kb_id, content, filename): + def upload_file(self, kb_id, content, filename, content_type="text/markdown"): """Upload a file to the knowledge base.""" upload_endpoint = f"{self.base_url}/api/v1/files/" @@ -131,7 +175,7 @@ class OpenWebUIUploader: try: # Use context manager for file upload request with open(temp_file_path, 'rb') as f: - files = {'file': (filename, f, 'text/markdown')} + files = {'file': (filename, f, content_type)} with self.session.post( upload_endpoint, headers={"Authorization": f"Bearer {self.api_token}"}, @@ -161,6 +205,61 @@ class OpenWebUIUploader: if os.path.exists(temp_file_path): os.unlink(temp_file_path) + def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"): + """Update an existing file in the knowledge base.""" + # First upload the new version of the file + upload_endpoint = f"{self.base_url}/api/v1/files/" + + # Create a temporary file for the upload + temp_file_path = f"/tmp/{filename}" + with open(temp_file_path, 'w') as f: + f.write(content) + + try: + # Upload the new file + with open(temp_file_path, 'rb') as f: + files = {'file': (filename, f, content_type)} + with self.session.post( + upload_endpoint, + headers={"Authorization": f"Bearer {self.api_token}"}, + files=files + ) as upload_response: + upload_response.raise_for_status() + new_file_id = upload_response.json().get('id') + + # Remove the old file from the knowledge base + remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove" + with self.session.post( + remove_endpoint, + headers={ + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json" + }, + json={'file_id': existing_file_id} + ) as remove_response: + remove_response.raise_for_status() + + # Add the new file to the knowledge base + add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add" + with self.session.post( + add_endpoint, + headers={ + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json" + }, + json={'file_id': new_file_id} + ) as add_response: + add_response.raise_for_status() + return add_response.json() + + except requests.exceptions.RequestException as e: + logger.error(f"Error updating file: {e}") + raise + finally: + # Clean up the temporary file + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + def close(self): """Close the requests session.""" if hasattr(self, 'session') and self.session: @@ -187,6 +286,15 @@ def convert_to_markdown(html_content, url): return f"# {url}\n\nError converting content: {str(e)}" +def is_valid_json(content): + """Check if content is valid JSON.""" + try: + json.loads(content) + return True + except (ValueError, TypeError): + return False + + def main(): parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base') parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token') @@ -197,9 +305,17 
     def close(self):
         """Close the requests session."""
         if hasattr(self, 'session') and self.session:
@@ -187,6 +286,15 @@
         return f"# {url}\n\nError converting content: {str(e)}"
 
 
+def is_valid_json(content):
+    """Check if content is valid JSON."""
+    try:
+        json.loads(content)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
 def main():
     parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
     parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
@@ -197,9 +305,17 @@ def main():
     parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
     parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
     parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
+    parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
+    parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
+    parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
 
     args = parser.parse_args()
 
+    # Check for conflicting options
+    if args.update and args.skip_existing:
+        logger.error("Cannot use both --update and --skip-existing flags at the same time")
+        return 1
+
     # Initialize resources that need to be closed
     scraper = None
     uploader = None
@@ -222,46 +338,128 @@ def main():
         logger.error("No pages were crawled. Exiting.")
         return 1
 
-        # 2. Convert HTML pages to Markdown
-        logger.info("Converting HTML pages to Markdown")
-        markdown_pages = {}
-        for url, html in crawled_pages.items():
-            markdown_content = convert_to_markdown(html, url)
-            # Create a safe filename from the URL
+        # 2. Process content (convert HTML to Markdown or handle JSON)
+        logger.info("Processing crawled content")
+        processed_files = []
+
+        for url, html_content in crawled_pages.items():
+            # For JSON content, preserve it as JSON
+            if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
+                if is_valid_json(html_content):
+                    try:
+                        json_obj = json.loads(html_content)
+                        pretty_json = json.dumps(json_obj, indent=2)
+
+                        # Create filename for JSON file
+                        parsed_url = urlparse(url)
+                        filename = f"{parsed_url.netloc}{parsed_url.path}"
+                        filename = filename.replace('/', '_').replace('.', '_')
+                        if not filename.endswith('.json'):
+                            filename = f"{filename}.json"
+
+                        processed_files.append({
+                            'content': pretty_json,
+                            'content_type': 'application/json',
+                            'filename': filename,
+                            'url': url
+                        })
+                        logger.info(f"Processed JSON content from {url}")
+                        continue
+                    except ValueError:
+                        # Not valid JSON despite the extension, fall back to Markdown
+                        pass
+
+            # For all other content, convert to Markdown
+            markdown_content = convert_to_markdown(html_content, url)
+
+            # Create a safe filename
             parsed_url = urlparse(url)
             filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
             if not filename.endswith('.md'):
                 filename = f"{filename}.md"
-            markdown_pages[filename] = markdown_content
+
+            processed_files.append({
+                'content': markdown_content,
+                'content_type': 'text/markdown',
+                'filename': filename,
+                'url': url
+            })
+
+        logger.info(f"Processed {len(processed_files)} files")
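+
+        # Filename derivation, traced through the replacements above
+        # (illustrative URLs):
+        #   https://docs.example.com/guide/intro  -> docs_example_com_guide_intro.md
+        #   https://api.example.com/v1/users.json -> api_example_com_v1_users_json.json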
 
         # 3. Upload to Open WebUI
-        logger.info(f"Creating knowledge base '{args.kb_name}' in Open WebUI")
+        # First check if a knowledge base with the specified name already exists
         uploader = OpenWebUIUploader(args.base_url, args.token)
-        kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
-        kb_id = kb.get('id')
-        if not kb_id:
-            logger.error("Failed to get knowledge base ID")
-            return 1
+        existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
 
-        logger.info(f"Created knowledge base with ID: {kb_id}")
-
-        # 4. Upload each markdown page
+        if existing_kb:
+            kb_id = existing_kb.get('id')
+            logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
+        else:
+            # Create a new knowledge base if none exists with that name
+            logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
+            kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
+            kb_id = kb.get('id')
+            if not kb_id:
+                logger.error("Failed to get knowledge base ID")
+                return 1
+            logger.info(f"Created knowledge base with ID: {kb_id}")
+
+        # 4. Upload each file
         success_count = 0
+        skip_count = 0
+        update_count = 0
         error_count = 0
 
-        for filename, content in markdown_pages.items():
+        for file_info in processed_files:
             try:
-                logger.info(f"Uploading {filename}")
-                uploader.upload_file(kb_id, content, filename)
-                success_count += 1
+                filename = file_info['filename']
+                existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
+
+                # Handle existing files based on options
+                if existing_file_id:
+                    if args.skip_existing:
+                        logger.info(f"Skipping existing file: {filename}")
+                        skip_count += 1
+                        continue
+                    elif args.update:
+                        logger.info(f"Updating existing file: {filename}")
+                        uploader.update_file(
+                            kb_id,
+                            existing_file_id,
+                            file_info['content'],
+                            filename,
+                            file_info['content_type']
+                        )
+                        update_count += 1
+                    else:
+                        # Default behavior: add as new file
+                        logger.info(f"Adding duplicate file (existing file will remain): {filename}")
+                        uploader.upload_file(
+                            kb_id,
+                            file_info['content'],
+                            filename,
+                            file_info['content_type']
+                        )
+                        success_count += 1
+                else:
+                    # New file
+                    logger.info(f"Uploading new file: {filename}")
+                    uploader.upload_file(
+                        kb_id,
+                        file_info['content'],
+                        filename,
+                        file_info['content_type']
+                    )
+                    success_count += 1
+
                 # Add a small delay between uploads
                 time.sleep(0.5)
             except Exception as e:
-                logger.error(f"Failed to upload {filename}: {e}")
+                logger.error(f"Failed to process {file_info['filename']}: {e}")
                 error_count += 1
 
-        logger.info(f"Upload complete: {success_count} files uploaded successfully, {error_count} errors")
+        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
 
         return 0