Updating for a new version and adding the updated README
This commit is contained in:
parent
2082dba946
commit
3d5acde487
162
README.md
162
README.md
@ -1,9 +1,153 @@
|
|||||||
owui-site-crawler.py --token "your_api_token" \
|
|
||||||
--base-url "http://localhost:3000" \
|
# Web to Knowledge Base for Open WebUI
|
||||||
--website-url "https://example.com" \
|
|
||||||
--kb-name "Example Website KB" \
|
A Python utility script that crawls websites, converts pages to Markdown or preserves JSON data, and uploads them to an Open WebUI knowledge base.
|
||||||
--kb-purpose "Documentation and information from example.com" \
|
|
||||||
--depth 3 \
|
## Features
|
||||||
--delay 1.5 \
|
|
||||||
--exclude "/login" \
|
- Crawls websites to a specified depth while respecting domain boundaries
|
||||||
--exclude "/admin"
|
- Converts HTML content to Markdown using MarkItDown
|
||||||
|
- Preserves JSON content in its original format
|
||||||
|
- Creates or updates knowledge bases in Open WebUI
|
||||||
|
- Handles existing files through update or skip options
|
||||||
|
- Customizable crawling with exclude patterns
|
||||||
|
- Detailed logging of the process
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- Open WebUI instance with API access
|
||||||
|
|
||||||
|
### Dependencies
|
||||||
|
|
||||||
|
Install the required packages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install requests beautifulsoup4 markitdown
|
||||||
|
```
|
||||||
|
|
||||||
|
### Getting the Script
|
||||||
|
|
||||||
|
Download the script and make it executable:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -O https://raw.githubusercontent.com/yourusername/open-webui-site-crawler/main/web_to_kb.py
|
||||||
|
chmod +x web_to_kb.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Basic usage:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py --token "YOUR_API_TOKEN" \
|
||||||
|
--base-url "https://your-openwebui-instance.com" \
|
||||||
|
--website-url "https://website-to-crawl.com" \
|
||||||
|
--kb-name "My Website Knowledge Base"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Command Line Arguments
|
||||||
|
|
||||||
|
| Argument | Short | Description | Required | Default |
|
||||||
|
|----------|-------|-------------|----------|---------|
|
||||||
|
| `--token` | `-t` | Your Open WebUI API token | Yes | - |
|
||||||
|
| `--base-url` | `-u` | Base URL of your Open WebUI instance | Yes | - |
|
||||||
|
| `--website-url` | `-w` | URL of the website to crawl | Yes | - |
|
||||||
|
| `--kb-name` | `-n` | Name for the knowledge base | Yes | - |
|
||||||
|
| `--kb-purpose` | `-p` | Purpose description for the knowledge base | No | None |
|
||||||
|
| `--depth` | `-d` | Maximum depth to crawl | No | 2 |
|
||||||
|
| `--delay` | | Delay between requests in seconds | No | 1.0 |
|
||||||
|
| `--exclude` | `-e` | URL patterns to exclude from crawling (can be specified multiple times) | No | None |
|
||||||
|
| `--include-json` | `-j` | Include JSON files and API endpoints | No | False |
|
||||||
|
| `--update` | | Update existing files in the knowledge base | No | False |
|
||||||
|
| `--skip-existing` | | Skip existing files in the knowledge base | No | False |
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Basic Crawl with Limited Depth
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py -t "YOUR_API_TOKEN" \
|
||||||
|
-u "https://your-openwebui-instance.com" \
|
||||||
|
-w "https://docs.example.com" \
|
||||||
|
-n "Example Docs KB" \
|
||||||
|
-d 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Excluding Certain URL Patterns
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py -t "YOUR_API_TOKEN" \
|
||||||
|
-u "https://your-openwebui-instance.com" \
|
||||||
|
-w "https://blog.example.com" \
|
||||||
|
-n "Example Blog KB" \
|
||||||
|
-e "/tags/" \
|
||||||
|
-e "/author/" \
|
||||||
|
-e "/search/"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Including JSON Content
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py -t "YOUR_API_TOKEN" \
|
||||||
|
-u "https://your-openwebui-instance.com" \
|
||||||
|
-w "https://api-docs.example.com" \
|
||||||
|
-n "Example API Documentation" \
|
||||||
|
-j
|
||||||
|
```
|
||||||
|
|
||||||
|
### Updating an Existing Knowledge Base
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py -t "YOUR_API_TOKEN" \
|
||||||
|
-u "https://your-openwebui-instance.com" \
|
||||||
|
-w "https://knowledge-center.example.com" \
|
||||||
|
-n "Knowledge Center" \
|
||||||
|
--update
|
||||||
|
```
|
||||||
|
|
||||||
|
### Skipping Existing Files
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python web_to_kb.py -t "YOUR_API_TOKEN" \
|
||||||
|
-u "https://your-openwebui-instance.com" \
|
||||||
|
-w "https://docs.example.com" \
|
||||||
|
-n "Documentation KB" \
|
||||||
|
--skip-existing
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
1. **Website Crawling**: The script starts crawling from the specified website URL, following links up to the specified depth while staying within the same domain.
|
||||||
|
|
||||||
|
2. **Content Processing**:
|
||||||
|
- HTML content is converted to Markdown using MarkItDown
|
||||||
|
- JSON content is preserved in its native format (when `--include-json` is used)
|
||||||
|
|
||||||
|
3. **Knowledge Base Management**:
|
||||||
|
- Checks if a knowledge base with the specified name already exists
|
||||||
|
- Creates a new knowledge base if none exists
|
||||||
|
|
||||||
|
4. **File Upload**:
|
||||||
|
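- Manages existing files based on the mutually exclusive `--update` and `--skip-existing` flags; if neither is given, a file whose name already exists is uploaded again as a duplicate and the existing copy is kept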
|
||||||
|
- Uploads new files to the knowledge base
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The script respects domain boundaries and will not crawl external links
|
||||||
|
- Filenames are generated from each URL's host and path, with `/` and `.` replaced by `_` and a `.md` or `.json` extension appended
|
||||||
|
- Add a delay between requests to be respectful of websites' resources
|
||||||
|
- File updates are performed by uploading a new file and removing the old one
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License - see the LICENSE file for details.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
- [MarkItDown](https://github.com/microsoft/markitdown) for HTML to Markdown conversion
|
||||||
|
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
|
||||||
|
- [Requests](https://requests.readthedocs.io/) for HTTP requests
|
||||||
|
- [Open WebUI](https://github.com/open-webui/open-webui) for the knowledge base API
|
@ -9,7 +9,6 @@ from bs4 import BeautifulSoup
|
|||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import contextlib
|
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logging.basicConfig(level=logging.INFO,
|
logging.basicConfig(level=logging.INFO,
|
||||||
@ -102,6 +101,51 @@ class OpenWebUIUploader:
|
|||||||
"Accept": "application/json"
|
"Accept": "application/json"
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def get_knowledge_bases(self):
|
||||||
|
"""Get a list of all knowledge bases."""
|
||||||
|
endpoint = f"{self.base_url}/api/v1/knowledge/list"
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = self.session.get(endpoint)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error getting knowledge bases: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def get_knowledge_base_by_name(self, name):
|
||||||
|
"""Check if a knowledge base with the given name exists, and return its details if it does."""
|
||||||
|
try:
|
||||||
|
kbs = self.get_knowledge_bases()
|
||||||
|
for kb in kbs:
|
||||||
|
if kb.get('name') == name:
|
||||||
|
return kb
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error checking for existing knowledge base: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_knowledge_base_files(self, kb_id):
|
||||||
|
"""Get all files in a knowledge base."""
|
||||||
|
endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = self.session.get(endpoint)
|
||||||
|
response.raise_for_status()
|
||||||
|
kb_data = response.json()
|
||||||
|
return kb_data.get('files', [])
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error getting knowledge base files: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def file_exists_in_kb(self, kb_id, filename):
|
||||||
|
"""Check if a file with the given name exists in the knowledge base."""
|
||||||
|
files = self.get_knowledge_base_files(kb_id)
|
||||||
|
for file in files:
|
||||||
|
if 'meta' in file and 'name' in file['meta'] and file['meta']['name'] == filename:
|
||||||
|
return file['id']
|
||||||
|
return None
|
||||||
|
|
||||||
def create_knowledge_base(self, name, purpose=None):
|
def create_knowledge_base(self, name, purpose=None):
|
||||||
"""Create a new knowledge base in OpenWebUI."""
|
"""Create a new knowledge base in OpenWebUI."""
|
||||||
endpoint = f"{self.base_url}/api/v1/knowledge/create"
|
endpoint = f"{self.base_url}/api/v1/knowledge/create"
|
||||||
@ -119,7 +163,7 @@ class OpenWebUIUploader:
|
|||||||
logger.error(f"Error creating knowledge base: {e}")
|
logger.error(f"Error creating knowledge base: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def upload_file(self, kb_id, content, filename):
|
def upload_file(self, kb_id, content, filename, content_type="text/markdown"):
|
||||||
"""Upload a file to the knowledge base."""
|
"""Upload a file to the knowledge base."""
|
||||||
upload_endpoint = f"{self.base_url}/api/v1/files/"
|
upload_endpoint = f"{self.base_url}/api/v1/files/"
|
||||||
|
|
||||||
@ -131,7 +175,7 @@ class OpenWebUIUploader:
|
|||||||
try:
|
try:
|
||||||
# Use context manager for file upload request
|
# Use context manager for file upload request
|
||||||
with open(temp_file_path, 'rb') as f:
|
with open(temp_file_path, 'rb') as f:
|
||||||
files = {'file': (filename, f, 'text/markdown')}
|
files = {'file': (filename, f, content_type)}
|
||||||
with self.session.post(
|
with self.session.post(
|
||||||
upload_endpoint,
|
upload_endpoint,
|
||||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||||
@ -161,6 +205,61 @@ class OpenWebUIUploader:
|
|||||||
if os.path.exists(temp_file_path):
|
if os.path.exists(temp_file_path):
|
||||||
os.unlink(temp_file_path)
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
def update_file(self, kb_id, existing_file_id, content, filename, content_type="text/markdown"):
|
||||||
|
"""Update an existing file in the knowledge base."""
|
||||||
|
# First upload the new version of the file
|
||||||
|
upload_endpoint = f"{self.base_url}/api/v1/files/"
|
||||||
|
|
||||||
|
# Create a temporary file for the upload
|
||||||
|
temp_file_path = f"/tmp/{filename}"
|
||||||
|
with open(temp_file_path, 'w') as f:
|
||||||
|
f.write(content)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Upload the new file
|
||||||
|
with open(temp_file_path, 'rb') as f:
|
||||||
|
files = {'file': (filename, f, content_type)}
|
||||||
|
with self.session.post(
|
||||||
|
upload_endpoint,
|
||||||
|
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||||
|
files=files
|
||||||
|
) as upload_response:
|
||||||
|
upload_response.raise_for_status()
|
||||||
|
new_file_id = upload_response.json().get('id')
|
||||||
|
|
||||||
|
# Remove the old file from the knowledge base
|
||||||
|
remove_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/remove"
|
||||||
|
with self.session.post(
|
||||||
|
remove_endpoint,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Bearer {self.api_token}",
|
||||||
|
"Content-Type": "application/json"
|
||||||
|
},
|
||||||
|
json={'file_id': existing_file_id}
|
||||||
|
) as remove_response:
|
||||||
|
remove_response.raise_for_status()
|
||||||
|
|
||||||
|
# Add the new file to the knowledge base
|
||||||
|
add_endpoint = f"{self.base_url}/api/v1/knowledge/{kb_id}/file/add"
|
||||||
|
with self.session.post(
|
||||||
|
add_endpoint,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Bearer {self.api_token}",
|
||||||
|
"Content-Type": "application/json"
|
||||||
|
},
|
||||||
|
json={'file_id': new_file_id}
|
||||||
|
) as add_response:
|
||||||
|
add_response.raise_for_status()
|
||||||
|
return add_response.json()
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error updating file: {e}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
# Clean up the temporary file
|
||||||
|
if os.path.exists(temp_file_path):
|
||||||
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
"""Close the requests session."""
|
"""Close the requests session."""
|
||||||
if hasattr(self, 'session') and self.session:
|
if hasattr(self, 'session') and self.session:
|
||||||
@ -187,6 +286,15 @@ def convert_to_markdown(html_content, url):
|
|||||||
return f"# {url}\n\nError converting content: {str(e)}"
|
return f"# {url}\n\nError converting content: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_json(content):
|
||||||
|
"""Check if content is valid JSON."""
|
||||||
|
try:
|
||||||
|
json.loads(content)
|
||||||
|
return True
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
|
parser = argparse.ArgumentParser(description='Scrape a website and create an Open WebUI knowledge base')
|
||||||
parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
|
parser.add_argument('--token', '-t', required=True, help='Your OpenWebUI API token')
|
||||||
@ -197,9 +305,17 @@ def main():
|
|||||||
parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
|
parser.add_argument('--depth', '-d', type=int, default=2, help='Maximum depth to crawl (default: 2)')
|
||||||
parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
|
parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
|
||||||
parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
|
parser.add_argument('--exclude', '-e', action='append', help='URL patterns to exclude from crawling (can be specified multiple times)')
|
||||||
|
parser.add_argument('--include-json', '-j', action='store_true', help='Include JSON files and API endpoints')
|
||||||
|
parser.add_argument('--update', action='store_true', help='Update existing files in the knowledge base')
|
||||||
|
parser.add_argument('--skip-existing', action='store_true', help='Skip existing files in the knowledge base')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Check for conflicting options
|
||||||
|
if args.update and args.skip_existing:
|
||||||
|
logger.error("Cannot use both --update and --skip-existing flags at the same time")
|
||||||
|
return 1
|
||||||
|
|
||||||
# Initialize resources that need to be closed
|
# Initialize resources that need to be closed
|
||||||
scraper = None
|
scraper = None
|
||||||
uploader = None
|
uploader = None
|
||||||
@ -222,46 +338,128 @@ def main():
|
|||||||
logger.error("No pages were crawled. Exiting.")
|
logger.error("No pages were crawled. Exiting.")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# 2. Convert HTML pages to Markdown
|
# 2. Process content (convert HTML to Markdown or handle JSON)
|
||||||
logger.info("Converting HTML pages to Markdown")
|
logger.info("Processing crawled content")
|
||||||
markdown_pages = {}
|
processed_files = []
|
||||||
for url, html in crawled_pages.items():
|
|
||||||
markdown_content = convert_to_markdown(html, url)
|
for url, html_content in crawled_pages.items():
|
||||||
# Create a safe filename from the URL
|
# For JSON content, preserve it as JSON
|
||||||
|
if url.endswith('.json') or (is_valid_json(html_content) and args.include_json):
|
||||||
|
if is_valid_json(html_content):
|
||||||
|
try:
|
||||||
|
json_obj = json.loads(html_content)
|
||||||
|
pretty_json = json.dumps(json_obj, indent=2)
|
||||||
|
|
||||||
|
# Create filename for JSON file
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
filename = f"{parsed_url.netloc}{parsed_url.path}"
|
||||||
|
filename = filename.replace('/', '_').replace('.', '_')
|
||||||
|
if not filename.endswith('.json'):
|
||||||
|
filename = f"{filename}.json"
|
||||||
|
|
||||||
|
processed_files.append({
|
||||||
|
'content': pretty_json,
|
||||||
|
'content_type': 'application/json',
|
||||||
|
'filename': filename,
|
||||||
|
'url': url
|
||||||
|
})
|
||||||
|
logger.info(f"Processed JSON content from {url}")
|
||||||
|
continue
|
||||||
|
except ValueError:
|
||||||
|
# Not valid JSON despite the extension, fall back to Markdown
|
||||||
|
pass
|
||||||
|
|
||||||
|
# For all other content, convert to Markdown
|
||||||
|
markdown_content = convert_to_markdown(html_content, url)
|
||||||
|
|
||||||
|
# Create a safe filename
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
|
filename = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_').replace('.', '_')
|
||||||
if not filename.endswith('.md'):
|
if not filename.endswith('.md'):
|
||||||
filename = f"{filename}.md"
|
filename = f"{filename}.md"
|
||||||
markdown_pages[filename] = markdown_content
|
|
||||||
|
processed_files.append({
|
||||||
|
'content': markdown_content,
|
||||||
|
'content_type': 'text/markdown',
|
||||||
|
'filename': filename,
|
||||||
|
'url': url
|
||||||
|
})
|
||||||
|
|
||||||
|
logger.info(f"Processed {len(processed_files)} files")
|
||||||
|
|
||||||
# 3. Upload to Open WebUI
|
# 3. Upload to Open WebUI
|
||||||
logger.info(f"Creating knowledge base '{args.kb_name}' in Open WebUI")
|
# First check if a knowledge base with the specified name already exists
|
||||||
uploader = OpenWebUIUploader(args.base_url, args.token)
|
uploader = OpenWebUIUploader(args.base_url, args.token)
|
||||||
kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
|
|
||||||
|
|
||||||
kb_id = kb.get('id')
|
existing_kb = uploader.get_knowledge_base_by_name(args.kb_name)
|
||||||
if not kb_id:
|
if existing_kb:
|
||||||
logger.error("Failed to get knowledge base ID")
|
kb_id = existing_kb.get('id')
|
||||||
return 1
|
logger.info(f"Found existing knowledge base '{args.kb_name}' with ID: {kb_id}")
|
||||||
|
else:
|
||||||
|
# Create a new knowledge base if none exists with that name
|
||||||
|
logger.info(f"Creating new knowledge base '{args.kb_name}' in Open WebUI")
|
||||||
|
kb = uploader.create_knowledge_base(args.kb_name, args.kb_purpose)
|
||||||
|
kb_id = kb.get('id')
|
||||||
|
if not kb_id:
|
||||||
|
logger.error("Failed to get knowledge base ID")
|
||||||
|
return 1
|
||||||
|
logger.info(f"Created knowledge base with ID: {kb_id}")
|
||||||
|
|
||||||
logger.info(f"Created knowledge base with ID: {kb_id}")
|
# 4. Upload each file
|
||||||
|
|
||||||
# 4. Upload each markdown page
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
|
skip_count = 0
|
||||||
|
update_count = 0
|
||||||
error_count = 0
|
error_count = 0
|
||||||
|
|
||||||
for filename, content in markdown_pages.items():
|
for file_info in processed_files:
|
||||||
try:
|
try:
|
||||||
logger.info(f"Uploading {filename}")
|
filename = file_info['filename']
|
||||||
uploader.upload_file(kb_id, content, filename)
|
existing_file_id = uploader.file_exists_in_kb(kb_id, filename)
|
||||||
success_count += 1
|
|
||||||
|
# Handle existing files based on options
|
||||||
|
if existing_file_id:
|
||||||
|
if args.skip_existing:
|
||||||
|
logger.info(f"Skipping existing file: {filename}")
|
||||||
|
skip_count += 1
|
||||||
|
continue
|
||||||
|
elif args.update:
|
||||||
|
logger.info(f"Updating existing file: {filename}")
|
||||||
|
uploader.update_file(
|
||||||
|
kb_id,
|
||||||
|
existing_file_id,
|
||||||
|
file_info['content'],
|
||||||
|
filename,
|
||||||
|
file_info['content_type']
|
||||||
|
)
|
||||||
|
update_count += 1
|
||||||
|
else:
|
||||||
|
# Default behavior: add as new file
|
||||||
|
logger.info(f"Adding duplicate file (existing file will remain): {filename}")
|
||||||
|
uploader.upload_file(
|
||||||
|
kb_id,
|
||||||
|
file_info['content'],
|
||||||
|
filename,
|
||||||
|
file_info['content_type']
|
||||||
|
)
|
||||||
|
success_count += 1
|
||||||
|
else:
|
||||||
|
# New file
|
||||||
|
logger.info(f"Uploading new file: {filename}")
|
||||||
|
uploader.upload_file(
|
||||||
|
kb_id,
|
||||||
|
file_info['content'],
|
||||||
|
filename,
|
||||||
|
file_info['content_type']
|
||||||
|
)
|
||||||
|
success_count += 1
|
||||||
|
|
||||||
# Add a small delay between uploads
|
# Add a small delay between uploads
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to upload {filename}: {e}")
|
logger.error(f"Failed to process {file_info['filename']}: {e}")
|
||||||
error_count += 1
|
error_count += 1
|
||||||
|
|
||||||
logger.info(f"Upload complete: {success_count} files uploaded successfully, {error_count} errors")
|
logger.info(f"Upload complete: {success_count} files uploaded, {update_count} files updated, {skip_count} files skipped, {error_count} errors")
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user