Add S3/MinIO ingestion support
- S3Ingestor class for S3-compatible storage - Supports: AWS S3, MinIO, DigitalOcean Spaces, Wasabi - CLI: opus ingest-s3 --bucket my-bucket --prefix notes/ - Features: list objects, download, upload, text extraction Environment: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_REGION - S3_ENDPOINT_URL (for MinIO/non-AWS)
This commit is contained in:
@@ -33,6 +33,7 @@ from opus_orchestrator.state import OpusState, create_initial_state
|
||||
from opus_orchestrator.langgraph_workflow import OpusGraph, run_opus, OpusGraphState
|
||||
from opus_orchestrator.autogen_critique import CritiqueCrew, create_critique_crew
|
||||
from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_ingestor
|
||||
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
|
||||
from opus_orchestrator.frameworks import StoryFramework
|
||||
from opus_orchestrator.crews import (
|
||||
OpusCrew,
|
||||
@@ -112,6 +113,9 @@ __all__ = [
|
||||
"create_critique_crew",
|
||||
"GitHubIngestor",
|
||||
"create_github_ingestor",
|
||||
# S3/MinIO (NEW!)
|
||||
"S3Ingestor",
|
||||
"create_s3_ingestor",
|
||||
]
|
||||
|
||||
# Import legacy orchestrator for backward compatibility
|
||||
|
||||
@@ -150,7 +150,7 @@ Examples:
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# INGEST COMMAND
|
||||
# INGEST COMMAND (GitHub)
|
||||
# -------------------------------------------------------------------------
|
||||
ingest_parser = subparsers.add_parser(
|
||||
"ingest",
|
||||
@@ -178,6 +178,43 @@ Examples:
|
||||
help="Show preview of ingested content",
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# INGEST-S3 COMMAND
|
||||
# -------------------------------------------------------------------------
|
||||
s3_parser = subparsers.add_parser(
|
||||
"ingest-s3",
|
||||
help="Ingest content from S3/MinIO",
|
||||
description="Fetch and analyze content from S3-compatible storage",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--bucket", "-b",
|
||||
required=True,
|
||||
help="S3 bucket name",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--prefix", "-p",
|
||||
default="",
|
||||
help="Object key prefix",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--endpoint", "-e",
|
||||
help="S3 endpoint URL (for MinIO, DO Spaces, etc.)",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--output", "-o",
|
||||
help="Output file for ingested content",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--preview",
|
||||
action="store_true",
|
||||
help="Show preview of ingested content",
|
||||
)
|
||||
s3_parser.add_argument(
|
||||
"--list-objects",
|
||||
action="store_true",
|
||||
help="List objects instead of downloading",
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# FRAMEWORKS COMMAND
|
||||
# -------------------------------------------------------------------------
|
||||
@@ -409,6 +446,54 @@ def run_ingest(args: argparse.Namespace) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def run_s3_ingest(args: argparse.Namespace) -> int:
|
||||
"""Ingest content from S3/MinIO."""
|
||||
from opus_orchestrator import S3Ingestor
|
||||
|
||||
print(f"\n🪣 Ingesting from S3: {args.bucket}/{args.prefix}\n")
|
||||
|
||||
if args.endpoint:
|
||||
print(f" Endpoint: {args.endpoint}")
|
||||
|
||||
ingestor = S3Ingestor(
|
||||
endpoint_url=args.endpoint,
|
||||
bucket=args.bucket,
|
||||
)
|
||||
|
||||
if args.list_objects:
|
||||
# Just list objects
|
||||
objects = ingestor.list_objects(bucket=args.bucket, prefix=args.prefix)
|
||||
print(f"📦 Objects ({len(objects)}):")
|
||||
for obj in objects[:20]:
|
||||
print(f" {obj['key']} ({obj['size']:,} bytes)")
|
||||
if len(objects) > 20:
|
||||
print(f" ... and {len(objects) - 20} more")
|
||||
return 0
|
||||
|
||||
# Ingest content
|
||||
result = ingestor.ingest_bucket(
|
||||
bucket=args.bucket,
|
||||
prefix=args.prefix,
|
||||
)
|
||||
|
||||
print(f"✅ Loaded {result['total_chars']:,} characters")
|
||||
print(f" Files: {result['file_count']}")
|
||||
print(f" File list: {', '.join(result['files'].keys())}\n")
|
||||
|
||||
if args.preview:
|
||||
print("📄 PREVIEW (first 2000 chars):")
|
||||
print("-" * 40)
|
||||
print(result["combined_text"][:2000])
|
||||
print("-" * 40)
|
||||
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(result["combined_text"])
|
||||
print(f"\n💾 Saved to: {args.output}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def run_frameworks(args: argparse.Namespace) -> int:
|
||||
"""List available frameworks."""
|
||||
from opus_orchestrator.frameworks import FRAMEWORKS
|
||||
@@ -527,6 +612,7 @@ async def main_async(args: argparse.Namespace) -> int:
|
||||
"generate": run_generate,
|
||||
"serve": run_serve,
|
||||
"ingest": run_ingest,
|
||||
"ingest-s3": run_s3_ingest,
|
||||
"frameworks": run_frameworks,
|
||||
"config": run_config,
|
||||
"docs": run_docs,
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
|
||||
from opus_orchestrator.utils.docs import generate_docs
|
||||
from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_ingestor
|
||||
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
|
||||
from opus_orchestrator.utils.llm import get_llm_client
|
||||
|
||||
__all__ = [
|
||||
"generate_docs",
|
||||
"GitHubIngestor",
|
||||
"create_github_ingestor",
|
||||
"S3Ingestor",
|
||||
"create_s3_ingestor",
|
||||
"get_llm_client",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,333 @@
|
||||
"""S3/MinIO ingestion for Opus Orchestrator.
|
||||
|
||||
Fetches content from S3-compatible object storage.
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
from typing import Any, Optional
|
||||
from pathlib import Path
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError, NoCredentialsError
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv("/home/solaria/.openclaw/workspace/opus-orchestrator-ai/.env")
|
||||
|
||||
|
||||
class S3Ingestor:
|
||||
"""Fetch and parse content from S3-compatible storage.
|
||||
|
||||
Supports:
|
||||
- Amazon S3
|
||||
- MinIO
|
||||
- DigitalOcean Spaces
|
||||
- Wasabi
|
||||
- Other S3-compatible APIs
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
endpoint_url: Optional[str] = None,
|
||||
access_key: Optional[str] = None,
|
||||
secret_key: Optional[str] = None,
|
||||
region_name: str = "us-east-1",
|
||||
bucket: Optional[str] = None,
|
||||
):
|
||||
"""Initialize S3 ingestor.
|
||||
|
||||
Args:
|
||||
endpoint_url: S3 endpoint URL (for MinIO, DO Spaces, etc.)
|
||||
access_key: AWS access key
|
||||
secret_key: AWS secret key
|
||||
region_name: AWS region
|
||||
bucket: Default bucket name
|
||||
"""
|
||||
self.access_key = access_key or os.environ.get("AWS_ACCESS_KEY_ID")
|
||||
self.secret_key = secret_key or os.environ.get("AWS_SECRET_ACCESS_KEY")
|
||||
self.region_name = region_name or os.environ.get("AWS_REGION", "us-east-1")
|
||||
self.endpoint_url = endpoint_url or os.environ.get("S3_ENDPOINT_URL")
|
||||
self.bucket = bucket
|
||||
|
||||
if not self.access_key or not self.secret_key:
|
||||
raise ValueError(
|
||||
"S3 credentials required. Set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY "
|
||||
"or pass access_key/secret_key."
|
||||
)
|
||||
|
||||
# Determine if using custom endpoint
|
||||
use_ssl = self.endpoint_url and not self.endpoint_url.startswith("http://")
|
||||
|
||||
self.s3_client = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=self.endpoint_url,
|
||||
aws_access_key_id=self.access_key,
|
||||
aws_secret_access_key=self.secret_key,
|
||||
region_name=self.region_name,
|
||||
)
|
||||
|
||||
self._buckets_cache: Optional[list[str]] = None
|
||||
|
||||
def list_buckets(self) -> list[str]:
|
||||
"""List available buckets.
|
||||
|
||||
Returns:
|
||||
List of bucket names
|
||||
"""
|
||||
try:
|
||||
response = self.s3_client.list_buckets()
|
||||
return [b["Name"] for b in response.get("Buckets", [])]
|
||||
except (ClientError, NoCredentialsError) as e:
|
||||
raise RuntimeError(f"Failed to list buckets: {e}")
|
||||
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: Optional[str] = None,
|
||||
prefix: str = "",
|
||||
max_keys: int = 1000,
|
||||
) -> list[dict]:
|
||||
"""List objects in a bucket.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name (uses default if not provided)
|
||||
prefix: Object key prefix filter
|
||||
max_keys: Maximum number of keys to return
|
||||
|
||||
Returns:
|
||||
List of object metadata dicts
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
try:
|
||||
response = self.s3_client.list_objects_v2(
|
||||
Bucket=bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=max_keys,
|
||||
)
|
||||
|
||||
return [
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"last_modified": obj.get("LastModified"),
|
||||
"etag": obj.get("ETag", "").strip('"'),
|
||||
}
|
||||
for obj in response.get("Contents", [])
|
||||
]
|
||||
except ClientError as e:
|
||||
raise RuntimeError(f"Failed to list objects: {e}")
|
||||
|
||||
def get_object(
|
||||
self,
|
||||
key: str,
|
||||
bucket: Optional[str] = None,
|
||||
encoding: str = "utf-8",
|
||||
) -> str:
|
||||
"""Get content of a single object.
|
||||
|
||||
Args:
|
||||
key: Object key
|
||||
bucket: Bucket name (uses default if not provided)
|
||||
encoding: Text encoding
|
||||
|
||||
Returns:
|
||||
Object content as string
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
try:
|
||||
response = self.s3_client.get_object(Bucket=bucket, Key=key)
|
||||
body = response["Body"]
|
||||
|
||||
# Try to read as text
|
||||
content = body.read()
|
||||
try:
|
||||
return content.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
# Return raw bytes as hex for binary files
|
||||
return f"[Binary file: {len(content)} bytes]"
|
||||
|
||||
except ClientError as e:
|
||||
raise RuntimeError(f"Failed to get object {key}: {e}")
|
||||
|
||||
def get_text_files(
|
||||
self,
|
||||
bucket: Optional[str] = None,
|
||||
prefix: str = "",
|
||||
extensions: Optional[list[str]] = None,
|
||||
) -> dict[str, str]:
|
||||
"""Get all text files from a prefix.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name
|
||||
prefix: Object key prefix
|
||||
extensions: File extensions to filter (e.g., ['.txt', '.md'])
|
||||
|
||||
Returns:
|
||||
Dict mapping keys to content
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
extensions = extensions or [".txt", ".md", ".markdown", ".notes", ".draft"]
|
||||
|
||||
objects = self.list_objects(bucket=bucket, prefix=prefix)
|
||||
|
||||
results = {}
|
||||
for obj in objects:
|
||||
key = obj["key"]
|
||||
|
||||
# Check extension
|
||||
if not any(key.lower().endswith(ext) for ext in extensions):
|
||||
continue
|
||||
|
||||
# Skip directories
|
||||
if key.endswith("/"):
|
||||
continue
|
||||
|
||||
try:
|
||||
content = self.get_object(key, bucket)
|
||||
results[key] = content
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to read {key}: {e}")
|
||||
continue
|
||||
|
||||
return results
|
||||
|
||||
def ingest_bucket(
|
||||
self,
|
||||
bucket: Optional[str] = None,
|
||||
prefix: str = "",
|
||||
extensions: Optional[list[str]] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Ingest all text content from a bucket/prefix.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name
|
||||
prefix: Object key prefix
|
||||
extensions: File extensions to include
|
||||
|
||||
Returns:
|
||||
Dict with combined text and metadata
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
files = self.get_text_files(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
extensions=extensions,
|
||||
)
|
||||
|
||||
# Combine all content
|
||||
combined_lines = []
|
||||
for key, content in files.items():
|
||||
combined_lines.append(f"=== {key} ===")
|
||||
combined_lines.append(content)
|
||||
combined_lines.append("")
|
||||
|
||||
combined_text = "\n".join(combined_lines)
|
||||
|
||||
return {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"files": files,
|
||||
"file_count": len(files),
|
||||
"total_chars": len(combined_text),
|
||||
"combined_text": combined_text,
|
||||
}
|
||||
|
||||
def download_file(
|
||||
self,
|
||||
key: str,
|
||||
local_path: str | Path,
|
||||
bucket: Optional[str] = None,
|
||||
) -> Path:
|
||||
"""Download a file from S3 to local storage.
|
||||
|
||||
Args:
|
||||
key: Object key
|
||||
local_path: Local destination path
|
||||
bucket: Bucket name
|
||||
|
||||
Returns:
|
||||
Path to downloaded file
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
local_path = Path(local_path)
|
||||
local_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.s3_client.download_file(bucket, key, str(local_path))
|
||||
|
||||
return local_path
|
||||
|
||||
def upload_file(
|
||||
self,
|
||||
local_path: str | Path,
|
||||
key: str,
|
||||
bucket: Optional[str] = None,
|
||||
content_type: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Upload a file to S3.
|
||||
|
||||
Args:
|
||||
local_path: Local file path
|
||||
key: Object key
|
||||
bucket: Bucket name
|
||||
content_type: Content MIME type
|
||||
|
||||
Returns:
|
||||
S3 URL of uploaded file
|
||||
"""
|
||||
bucket = bucket or self.bucket
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name required")
|
||||
|
||||
local_path = Path(local_path)
|
||||
|
||||
extra_args = {}
|
||||
if content_type:
|
||||
extra_args["ContentType"] = content_type
|
||||
|
||||
self.s3_client.upload_file(
|
||||
str(local_path),
|
||||
bucket,
|
||||
key,
|
||||
ExtraArgs=extra_args,
|
||||
)
|
||||
|
||||
return f"s3://{bucket}/{key}"
|
||||
|
||||
|
||||
def create_s3_ingestor(
|
||||
endpoint_url: Optional[str] = None,
|
||||
access_key: Optional[str] = None,
|
||||
secret_key: Optional[str] = None,
|
||||
bucket: Optional[str] = None,
|
||||
) -> S3Ingestor:
|
||||
"""Factory function to create an S3 ingestor.
|
||||
|
||||
Args:
|
||||
endpoint_url: S3 endpoint URL
|
||||
access_key: AWS access key
|
||||
secret_key: AWS secret key
|
||||
bucket: Default bucket
|
||||
|
||||
Returns:
|
||||
Configured S3Ingestor
|
||||
"""
|
||||
return S3Ingestor(
|
||||
endpoint_url=endpoint_url,
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
bucket=bucket,
|
||||
)
|
||||
Reference in New Issue
Block a user