Updating for a new version and adding the updated README
This commit is contained in:
		
							
								
								
									
										162
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										162
									
								
								README.md
									
									
									
									
									
								
							@@ -1,9 +1,153 @@
 | 
				
			|||||||
owui-site-crawler.py --token "your_api_token" \
 | 
					
 | 
				
			||||||
    --base-url "http://localhost:3000" \
 | 
					# Web to Knowledge Base for Open WebUI
 | 
				
			||||||
    --website-url "https://example.com" \
 | 
					
 | 
				
			||||||
    --kb-name "Example Website KB" \
 | 
					A Python utility script that crawls websites, converts pages to Markdown or preserves JSON data, and uploads them to an Open WebUI knowledge base.
 | 
				
			||||||
    --kb-purpose "Documentation and information from example.com" \
 | 
					
 | 
				
			||||||
    --depth 3 \
 | 
					## Features
 | 
				
			||||||
    --delay 1.5 \
 | 
					
 | 
				
			||||||
    --exclude "/login" \
 | 
					- Crawls websites to a specified depth while respecting domain boundaries
 | 
				
			||||||
    --exclude "/admin"
 | 
					- Converts HTML content to Markdown using MarkItDown
 | 
				
			||||||
 | 
					- Preserves JSON content in its original format
 | 
				
			||||||
 | 
					- Creates or updates knowledge bases in Open WebUI
 | 
				
			||||||
 | 
					- Handles existing files through update or skip options
 | 
				
			||||||
 | 
					- Customizable crawling with exclude patterns
 | 
				
			||||||
 | 
					- Detailed logging of the process
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Installation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Prerequisites
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- Python 3.10+
 | 
				
			||||||
 | 
					- Open WebUI instance with API access
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Dependencies
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Install the required packages:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					pip install requests beautifulsoup4 markitdown
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Getting the Script
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Download the script and make it executable:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					curl -O https://raw.githubusercontent.com/yourusername/open-webui-site-crawler/main/web_to_kb.py
 | 
				
			||||||
 | 
					chmod +x web_to_kb.py
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Usage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Basic usage:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py --token "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   --base-url "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   --website-url "https://website-to-crawl.com" \
 | 
				
			||||||
 | 
					                   --kb-name "My Website Knowledge Base"
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Command Line Arguments
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Argument | Short | Description | Required | Default |
 | 
				
			||||||
 | 
					|----------|-------|-------------|----------|---------|
 | 
				
			||||||
 | 
					| `--token` | `-t` | Your Open WebUI API token | Yes | - |
 | 
				
			||||||
 | 
					| `--base-url` | `-u` | Base URL of your Open WebUI instance | Yes | - |
 | 
				
			||||||
 | 
					| `--website-url` | `-w` | URL of the website to crawl | Yes | - |
 | 
				
			||||||
 | 
					| `--kb-name` | `-n` | Name for the knowledge base | Yes | - |
 | 
				
			||||||
 | 
					| `--kb-purpose` | `-p` | Purpose description for the knowledge base | No | None |
 | 
				
			||||||
 | 
					| `--depth` | `-d` | Maximum depth to crawl | No | 2 |
 | 
				
			||||||
 | 
					| `--delay` | | Delay between requests in seconds | No | 1.0 |
 | 
				
			||||||
 | 
					| `--exclude` | `-e` | URL patterns to exclude from crawling (can be specified multiple times) | No | None |
 | 
				
			||||||
 | 
					| `--include-json` | `-j` | Include JSON files and API endpoints | No | False |
 | 
				
			||||||
 | 
					| `--update` | | Update existing files in the knowledge base | No | False |
 | 
				
			||||||
 | 
					| `--skip-existing` | | Skip existing files in the knowledge base (cannot be combined with `--update`) | No | False |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Examples
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Basic Crawl with Limited Depth
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py -t "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   -u "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   -w "https://docs.example.com" \
 | 
				
			||||||
 | 
					                   -n "Example Docs KB" \
 | 
				
			||||||
 | 
					                   -d 3
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Excluding Certain URL Patterns
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py -t "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   -u "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   -w "https://blog.example.com" \
 | 
				
			||||||
 | 
					                   -n "Example Blog KB" \
 | 
				
			||||||
 | 
					                   -e "/tags/" \
 | 
				
			||||||
 | 
					                   -e "/author/" \
 | 
				
			||||||
 | 
					                   -e "/search/"
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Including JSON Content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py -t "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   -u "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   -w "https://api-docs.example.com" \
 | 
				
			||||||
 | 
					                   -n "Example API Documentation" \
 | 
				
			||||||
 | 
					                   -j
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Updating an Existing Knowledge Base
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py -t "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   -u "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   -w "https://knowledge-center.example.com" \
 | 
				
			||||||
 | 
					                   -n "Knowledge Center" \
 | 
				
			||||||
 | 
					                   --update
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Skipping Existing Files
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					python web_to_kb.py -t "YOUR_API_TOKEN" \
 | 
				
			||||||
 | 
					                   -u "https://your-openwebui-instance.com" \
 | 
				
			||||||
 | 
					                   -w "https://docs.example.com" \
 | 
				
			||||||
 | 
					                   -n "Documentation KB" \
 | 
				
			||||||
 | 
					                   --skip-existing
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## How It Works
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2. **Content Processing**: 
 | 
				
			||||||
 | 
					   - HTML content is converted to Markdown using MarkItDown
 | 
				
			||||||
 | 
					   - JSON content is preserved in its native format (when `--include-json` is used)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					3. **Knowledge Base Management**:
 | 
				
			||||||
 | 
					   - Checks if a knowledge base with the specified name already exists
 | 
				
			||||||
 | 
					   - Creates a new knowledge base if none exists
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					4. **File Upload**:
 | 
				
			||||||
 | 
					   - Manages existing files based on the `--update` or `--skip-existing` flags; if neither flag is given, the file is uploaded again as a duplicate and the existing file remains
 | 
				
			||||||
 | 
					   - Uploads new files to the knowledge base
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Notes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- The script respects domain boundaries and will not crawl external links
 | 
				
			||||||
 | 
					- Filenames are generated from each URL's host and path, with `/` and `.` replaced by `_` and a `.md` or `.json` extension appended
 | 
				
			||||||
 | 
					- Use `--delay` to adjust the pause between requests (default: 1.0 seconds) to be respectful of websites' resources
 | 
				
			||||||
 | 
					- File updates are performed by uploading a new file and removing the old one
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## License
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This project is licensed under the MIT License - see the LICENSE file for details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Acknowledgments
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- [MarkItDown](https://github.com/microsoft/markitdown) for HTML to Markdown conversion
 | 
				
			||||||
 | 
					- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
 | 
				
			||||||
 | 
					- [Requests](https://requests.readthedocs.io/) for HTTP requests
 | 
				
			||||||
 | 
					- [Open WebUI](https://github.com/open-webui/open-webui) for the knowledge base API
 | 
				
			||||||
@@ -9,7 +9,6 @@ from bs4 import BeautifulSoup
 | 
				
			|||||||
from markitdown import MarkItDown
 | 
					from markitdown import MarkItDown
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
import contextlib
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Configure logging
 | 
					# Configure logging
 | 
				
			||||||
logging.basicConfig(level=logging.INFO, 
 | 
					logging.basicConfig(level=logging.INFO, 
 | 
				
			||||||
@@ -102,6 +101,51 @@ class OpenWebUIUploader:
 | 
				
			|||||||
            "Accept": "application/json"
 | 
					            "Accept": "application/json"
 | 
				
			||||||
        })
 | 
					        })
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    def get_knowledge_bases(self):
 | 
				
			||||||
 | 
					        """Get a list of all knowledge bases."""
 | 
				
			||||||
 | 
					        endpoint = f"{self.base_url}/api/v1/knowledge/list"
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            response = self.session.get(endpoint)
 | 
				
			||||||
 | 
					            response.raise_for_status()
 | 
				
			||||||
 | 
					            return response.json()
 | 
				
			||||||
 | 
					        except requests.exceptions.RequestException as e:
 | 
				
			||||||
 | 
					            logger.error(f"Error getting knowledge bases: {e}")
 | 
				
			||||||
 | 
					            raise
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def get_knowledge_base_by_name(self, name):
 | 
				
			||||||
 | 
					        """Check if a knowledge base with the given name exists, and return its details if it does."""
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            kbs = self.get_knowledge_bases()
 | 
				
			||||||
 | 
					            for kb in kbs:
 | 
				
			||||||
 | 
					                if kb.get('name') == name:
 | 
				
			||||||
 | 
					                    return kb
 | 
				
			||||||
 | 
					            return None
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.error(f"Error checking for existing knowledge base: {e}")
 | 
				
			||||||
 | 
					            return None
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def get_knowledge_base_files(self, kb_id):
 | 
				
			||||||
 | 
					        """Get all files in a knowledge base."""
 | 
				
			||||||
 | 
					        endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}"
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            response = self.session.get(endpoint)
 | 
				
			||||||
 | 
					            response.raise_for_status()
 | 
				
			||||||
 | 
					            kb_data = response.json()
 | 
				
			||||||
 | 
					            return kb_data.get('files', [])
 | 
				
			||||||
 | 
					        except requests.exceptions.RequestException as e:
 | 
				
			||||||
 | 
					            logger.error(f"Error getting knowledge base files: {e}")
 | 
				
			||||||
 | 
					            return []
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def file_exists_in_kb(self, kb_id, filename):
 | 
				
			||||||
 | 
					        """Check if a file with the given name exists in the knowledge base."""
 | 
				
			||||||
 | 
					        files = self.get_knowledge_base_files(kb_id)
 | 
				
			||||||
 | 
					        for file in files:
 | 
				
			||||||
 | 
					            if 'meta' in file and 'name' in file['meta'] and file['meta']['name'] == filename:
 | 
				
			||||||
 | 
					                return file['id']
 | 
				
			||||||
 | 
					        return None
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    def create_knowledge_base(self, name, purpose=None):
 | 
					    def create_knowledge_base(self, name, purpose=None):
 | 
				
			||||||
        """Create a new knowledge base in OpenWebUI."""
 | 
					        """Create a new knowledge base in OpenWebUI."""
 | 
				
			||||||
        endpoint = f"{self.base_url}/api/v1/knowledge/create"
 | 
					        endpoint = f"{self.base_url}/api/v1/knowledge/create"
 | 
				
			||||||
@@ -119,7 +163,7 @@ class OpenWebUIUploader:
 | 
				
			|||||||
            logger.error(f"Error creating knowledge base: {e}")
 | 
					            logger.error(f"Error creating knowledge base: {e}")
 | 
				
			||||||
            raise
 | 
					            raise
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    def upload_file(self, kb_id, content, filename):
 | 
					    def upload_file(self, kb_id, content, filename, content_type="text/markdown"):
 | 
				
			||||||
        """Upload a file to the knowledge base."""
 | 
					        """Upload a file to the knowledge base."""
 | 
				
			||||||
        upload_endpoint = f"{self.base_url}/api/v1/files/"
 | 
					        upload_endpoint = f"{self.base_url}/api/v1/files/"
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
@@ -131,7 +175,7 @@ class OpenWebUIUploader:
 | 
				
			|||||||
        try:
 | 
					        try:
 | 
				
			||||||
            # Use context manager for file upload request
 | 
					            # Use context manager for file upload request
 | 
				
			||||||
            with open(temp_file_path, 'rb') as f:
 | 
					            with open(temp_file_path, 'rb') as f:
 | 
				
			||||||
                files = {'file': (filename, f, 'text/markdown')}
 | 
					                files = {'file': (filename, f, content_type)}
 | 
				
			||||||
                with self.session.post(
 | 
					                with self.session.post(
 | 
				
			||||||
                    upload_endpoint,
 | 
					                    upload_endpoint,
 | 
				
			||||||
                    headers={"Authorization": f"Bearer {self.api_token}"},
 | 
					                    headers={"Authorization": f"Bearer {self.api_token}"},
 | 
				
			||||||
@@ -161,6 +205,61 @@ class OpenWebUIUploader:
 | 
				
			|||||||
            if os.path.exists(temp_file_path):
 | 
					            if os.path.exists(temp_file_path):
 | 
				
			||||||
                os.unlink(temp_file_path)
 | 
					                os.unlink(temp_file_path)
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"):
 | 
				
			||||||
 | 
					        """Update an existing file in the knowledge base."""
 | 
				
			||||||
 | 
					        # First upload the new version of the file
 | 
				
			||||||
 | 
					        upload_endpoint = f"{self.base_url}/api/v1/files/"
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        # Create a temporary file for the upload
 | 
				
			||||||
 | 
					        temp_file_path = f"/tmp/{filename}"
 | 
				
			||||||
 | 
					        with open(temp_file_path, 'w') as f:
 | 
				
			||||||
 | 
					            f.write(content)
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            # Upload the new file
 | 
				
			||||||
 | 
					            with open(temp_file_path, 'rb') as f:
 | 
				
			||||||
 | 
					                files = {'file': (filename, f, content_type)}
 | 
				
			||||||
 | 
					                with self.session.post(
 | 
				
			||||||
 | 
					                    upload_endpoint,
 | 
				
			||||||
 | 
					                    headers={"Authorization": f"Bearer {self.api_token}"},
 | 
				
			||||||
 | 
					                    files=files
 | 
				
			||||||
 | 
					                ) as upload_response:
 | 
				
			||||||
 | 
					                    upload_response.raise_for_status()
 | 
				
			||||||
 | 
					                    new_file_id = upload_response.json().get('id')
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # Remove the old file from the knowledge base
 | 
				
			||||||
 | 
					            remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove"
 | 
				
			||||||
 | 
					            with self.session.post(
 | 
				
			||||||
 | 
					                remove_endpoint,
 | 
				
			||||||
 | 
					                headers={
 | 
				
			||||||
 | 
					                    "Authorization": f"Bearer {self.api_token}",
 | 
				
			||||||
 | 
					                    "Content-Type": "application/json"
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					                json={'file_id': existing_file_id}
 | 
				
			||||||
 | 
					            ) as remove_response:
 | 
				
			||||||
 | 
					                remove_response.raise_for_status()
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # Add the new file to the knowledge base
 | 
				
			||||||
 | 
					            add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
 | 
				
			||||||
 | 
					            with self.session.post(
 | 
				
			||||||
 | 
					                add_endpoint,
 | 
				
			||||||
 | 
					                headers={
 | 
				
			||||||
 | 
					                    "Authorization": f"Bearer {self.api_token}",
 | 
				
			||||||
 | 
					                    "Content-Type": "application/json"
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					                json={'file_id': new_file_id}
 | 
				
			||||||
 | 
					            ) as add_response:
 | 
				
			||||||
 | 
					                add_response.raise_for_status()
 | 
				
			||||||
 | 
					                return add_response.json()
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					        except requests.exceptions.RequestException as e:
 | 
				
			||||||
 | 
					            logger.error(f"Error updating file: {e}")
 | 
				
			||||||
 | 
					            raise
 | 
				
			||||||
 | 
					        finally:
 | 
				
			||||||
 | 
					            # Clean up the temporary file
 | 
				
			||||||
 | 
					            if os.path.exists(temp_file_path):
 | 
				
			||||||
 | 
					                os.unlink(temp_file_path)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    def close(self):
 | 
					    def close(self):
 | 
				
			||||||
        """Close the requests session."""
 | 
					        """Close the requests session."""
 | 
				
			||||||
        if hasattr(self, 'session') and self.session:
 | 
					        if hasattr(self, 'session') and self.session:
 | 
				
			||||||
@@ -187,6 +286,15 @@ def convert_to_markdown(html_content, url):
 | 
				
			|||||||
        return f"# {url}\n\nError converting content: {str(e)}"
 | 
					        return f"# {url}\n\nError converting content: {str(e)}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_valid_json(content):
 | 
				
			||||||
 | 
					    """Check if content is valid JSON."""
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        json.loads(content)
 | 
				
			||||||
 | 
					        return True
 | 
				
			||||||
 | 
					    except (ValueError, TypeError):
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main():
 | 
					def main():
 | 
				
			||||||
    parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
 | 
					    parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
 | 
				
			||||||
    parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
 | 
					    parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
 | 
				
			||||||
@@ -197,9 +305,17 @@ def main():
 | 
				
			|||||||
    parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
 | 
					    parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
 | 
				
			||||||
    parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
 | 
					    parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
 | 
				
			||||||
    parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
 | 
					    parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
 | 
				
			||||||
 | 
					    parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
 | 
				
			||||||
 | 
					    parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
 | 
				
			||||||
 | 
					    parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    args = parser.parse_args()
 | 
					    args = parser.parse_args()
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    # Check for conflicting options
 | 
				
			||||||
 | 
					    if args.update and args.skip_existing:
 | 
				
			||||||
 | 
					        logger.error("Cannot use both --update and --skip-existing flags at the same time")
 | 
				
			||||||
 | 
					        return 1
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    # Initialize resources that need to be closed
 | 
					    # Initialize resources that need to be closed
 | 
				
			||||||
    scraper = None
 | 
					    scraper = None
 | 
				
			||||||
    uploader = None
 | 
					    uploader = None
 | 
				
			||||||
@@ -222,46 +338,128 @@ def main():
 | 
				
			|||||||
            logger.error("No pages were crawled. Exiting.")
 | 
					            logger.error("No pages were crawled. Exiting.")
 | 
				
			||||||
            return 1
 | 
					            return 1
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        # 2. Convert HTML pages to Markdown
 | 
					        # 2. Process content (convert HTML to Markdown or handle JSON)
 | 
				
			||||||
        logger.info("Converting HTML pages to Markdown")
 | 
					        logger.info("Processing crawled content")
 | 
				
			||||||
        markdown_pages = {}
 | 
					        processed_files = []
 | 
				
			||||||
        for url, html in crawled_pages.items():
 | 
					        
 | 
				
			||||||
            markdown_content = convert_to_markdown(html, url)
 | 
					        for url, html_content in crawled_pages.items():
 | 
				
			||||||
            # Create a safe filename from the URL
 | 
					            # For JSON content, preserve it as JSON
 | 
				
			||||||
 | 
					            if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
 | 
				
			||||||
 | 
					                if is_valid_json(html_content):
 | 
				
			||||||
 | 
					                    try:
 | 
				
			||||||
 | 
					                        json_obj = json.loads(html_content)
 | 
				
			||||||
 | 
					                        pretty_json = json.dumps(json_obj, indent=2)
 | 
				
			||||||
 | 
					                        
 | 
				
			||||||
 | 
					                        # Create filename for JSON file
 | 
				
			||||||
 | 
					                        parsed_url = urlparse(url)
 | 
				
			||||||
 | 
					                        filename = f"{parsed_url.netloc}{parsed_url.path}"
 | 
				
			||||||
 | 
					                        filename = filename.replace('/', '_').replace('.', '_')
 | 
				
			||||||
 | 
					                        if not filename.endswith('.json'):
 | 
				
			||||||
 | 
					                            filename = f"{filename}.json"
 | 
				
			||||||
 | 
					                            
 | 
				
			||||||
 | 
					                        processed_files.append({
 | 
				
			||||||
 | 
					                            'content': pretty_json,
 | 
				
			||||||
 | 
					                            'content_type': 'application/json',
 | 
				
			||||||
 | 
					                            'filename': filename,
 | 
				
			||||||
 | 
					                            'url': url
 | 
				
			||||||
 | 
					                        })
 | 
				
			||||||
 | 
					                        logger.info(f"Processed JSON content from {url}")
 | 
				
			||||||
 | 
					                        continue
 | 
				
			||||||
 | 
					                    except ValueError:
 | 
				
			||||||
 | 
					                        # Not valid JSON despite the extension, fall back to Markdown
 | 
				
			||||||
 | 
					                        pass
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # For all other content, convert to Markdown
 | 
				
			||||||
 | 
					            markdown_content = convert_to_markdown(html_content, url)
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # Create a safe filename
 | 
				
			||||||
            parsed_url = urlparse(url)
 | 
					            parsed_url = urlparse(url)
 | 
				
			||||||
            filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
 | 
					            filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
 | 
				
			||||||
            if not filename.endswith('.md'):
 | 
					            if not filename.endswith('.md'):
 | 
				
			||||||
                filename = f"{filename}.md"
 | 
					                filename = f"{filename}.md"
 | 
				
			||||||
            markdown_pages[filename] = markdown_content
 | 
					                
 | 
				
			||||||
 | 
					            processed_files.append({
 | 
				
			||||||
 | 
					                'content': markdown_content,
 | 
				
			||||||
 | 
					                'content_type': 'text/markdown',
 | 
				
			||||||
 | 
					                'filename': filename,
 | 
				
			||||||
 | 
					                'url': url
 | 
				
			||||||
 | 
					            })
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        logger.info(f"Processed {len(processed_files)} files")
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        # 3. Upload to Open WebUI
 | 
					        # 3. Upload to Open WebUI
 | 
				
			||||||
        logger.info(f"Creating knowledge base '{args.kb_name}' in Open WebUI")
 | 
					        # First check if a knowledge base with the specified name already exists
 | 
				
			||||||
        uploader = OpenWebUIUploader(args.base_url, args.token)
 | 
					        uploader = OpenWebUIUploader(args.base_url, args.token)
 | 
				
			||||||
        kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
 | 
					 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        kb_id = kb.get('id')
 | 
					        existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
 | 
				
			||||||
        if not kb_id:
 | 
					        if existing_kb:
 | 
				
			||||||
            logger.error("Failed to get knowledge base ID")
 | 
					            kb_id = existing_kb.get('id')
 | 
				
			||||||
            return 1
 | 
					            logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            # Create a new knowledge base if none exists with that name
 | 
				
			||||||
 | 
					            logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
 | 
				
			||||||
 | 
					            kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
 | 
				
			||||||
 | 
					            kb_id = kb.get('id')
 | 
				
			||||||
 | 
					            if not kb_id:
 | 
				
			||||||
 | 
					                logger.error("Failed to get knowledge base ID")
 | 
				
			||||||
 | 
					                return 1
 | 
				
			||||||
 | 
					            logger.info(f"Created knowledge base with ID: {kb_id}")
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        logger.info(f"Created knowledge base with ID: {kb_id}")
 | 
					        # 4. Upload each file
 | 
				
			||||||
        
 | 
					 | 
				
			||||||
        # 4. Upload each markdown page
 | 
					 | 
				
			||||||
        success_count = 0
 | 
					        success_count = 0
 | 
				
			||||||
 | 
					        skip_count = 0
 | 
				
			||||||
 | 
					        update_count = 0
 | 
				
			||||||
        error_count = 0
 | 
					        error_count = 0
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        for filename, content in markdown_pages.items():
 | 
					        for file_info in processed_files:
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                logger.info(f"Uploading {filename}")
 | 
					                filename = file_info['filename']
 | 
				
			||||||
                uploader.upload_file(kb_id, content, filename)
 | 
					                existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
 | 
				
			||||||
                success_count += 1
 | 
					                
 | 
				
			||||||
 | 
					                # Handle existing files based on options
 | 
				
			||||||
 | 
					                if existing_file_id:
 | 
				
			||||||
 | 
					                    if args.skip_existing:
 | 
				
			||||||
 | 
					                        logger.info(f"Skipping existing file: {filename}")
 | 
				
			||||||
 | 
					                        skip_count += 1
 | 
				
			||||||
 | 
					                        continue
 | 
				
			||||||
 | 
					                    elif args.update:
 | 
				
			||||||
 | 
					                        logger.info(f"Updating existing file: {filename}")
 | 
				
			||||||
 | 
					                        uploader.update_file(
 | 
				
			||||||
 | 
					                            kb_id, 
 | 
				
			||||||
 | 
					                            existing_file_id, 
 | 
				
			||||||
 | 
					                            file_info['content'], 
 | 
				
			||||||
 | 
					                            filename, 
 | 
				
			||||||
 | 
					                            file_info['content_type']
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                        update_count += 1
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        # Default behavior: add as new file
 | 
				
			||||||
 | 
					                        logger.info(f"Adding duplicate file (existing file will remain): {filename}")
 | 
				
			||||||
 | 
					                        uploader.upload_file(
 | 
				
			||||||
 | 
					                            kb_id, 
 | 
				
			||||||
 | 
					                            file_info['content'], 
 | 
				
			||||||
 | 
					                            filename, 
 | 
				
			||||||
 | 
					                            file_info['content_type']
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                        success_count += 1
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    # New file
 | 
				
			||||||
 | 
					                    logger.info(f"Uploading new file: {filename}")
 | 
				
			||||||
 | 
					                    uploader.upload_file(
 | 
				
			||||||
 | 
					                        kb_id, 
 | 
				
			||||||
 | 
					                        file_info['content'], 
 | 
				
			||||||
 | 
					                        filename, 
 | 
				
			||||||
 | 
					                        file_info['content_type']
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                    success_count += 1
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
                # Add a small delay between uploads
 | 
					                # Add a small delay between uploads
 | 
				
			||||||
                time.sleep(0.5)
 | 
					                time.sleep(0.5)
 | 
				
			||||||
            except Exception as e:
 | 
					            except Exception as e:
 | 
				
			||||||
                logger.error(f"Failed to upload {filename}: {e}")
 | 
					                logger.error(f"Failed to process {file_info['filename']}: {e}")
 | 
				
			||||||
                error_count += 1
 | 
					                error_count += 1
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        logger.info(f"Upload complete: {success_count} files uploaded successfully, {error_count} errors")
 | 
					        logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user