Files
opus-orchestrator-ai/opus_orchestrator/utils/github_ingest.py
T
mrhavens 9eee1ac1e7 Add GitHub ingestion - pull content from repos as source material
- GitHubIngestor class to fetch repo contents
- Support for .md, .txt, .notes, .draft files
- Method to ingest from GitHub directly into orchestrator
- Export GitHubIngestor in __init__.py

Usage:
    orch = OpusOrchestrator(book_type='fiction', genre='memoir')
    content = await orch.ingest_from_github('mrhavens/my-notes')
    await orch.run()
2026-03-13 00:40:47 +00:00

188 lines
5.8 KiB
Python

"""GitHub ingestion for Opus Orchestrator.
Fetches content from GitHub repositories for use as source material.
"""
import os
import base64
import re
from typing import Any, Optional
import requests
from dotenv import load_dotenv
load_dotenv("/home/solaria/.openclaw/workspace/opus-orchestrator-ai/.env")
class GitHubIngestor:
"""Fetch and parse content from GitHub repositories."""
def __init__(self, token: Optional[str] = None):
self.token = token or os.environ.get("GITHUB_TOKEN")
if not self.token:
raise ValueError("GitHub token required. Set GITHUB_TOKEN or pass token.")
self.headers = {
"Authorization": f"token {self.token}",
"Accept": "application/vnd.github.v3+json",
}
self.base_url = "https://api.github.com"
def get_contents(self, repo: str, path: str = "") -> list[dict]:
"""Get contents of a directory or file.
Args:
repo: "owner/repo" format
path: directory path (default: root)
Returns:
List of content items
"""
url = f"{self.base_url}/repos/{repo}/contents/{path}"
response = requests.get(url, headers=self.headers)
response.raise_for_status()
return response.json()
def get_file_content(self, repo: str, path: str) -> str:
"""Get content of a single file.
Args:
repo: "owner/repo" format
path: file path
Returns:
Decoded file content
"""
url = f"{self.base_url}/repos/{repo}/contents/{path}"
response = requests.get(url, headers=self.headers)
response.raise_for_status()
data = response.json()
# Decode base64 content
if data.get("encoding") == "base64":
content = base64.b64decode(data["content"]).decode("utf-8")
return content
return data.get("content", "")
def get_all_files(
self,
repo: str,
extensions: Optional[list[str]] = None,
exclude_dirs: Optional[list[str]] = None,
) -> dict[str, str]:
"""Get all files from a repository.
Args:
repo: "owner/repo" format
extensions: File extensions to include (e.g., ['.md', '.txt'])
exclude_dirs: Directories to exclude
Returns:
Dictionary mapping file paths to content
"""
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft"]
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
files = {}
def walk_directory(path: str = ""):
contents = self.get_contents(repo, path)
if isinstance(contents, dict):
# Single file
if contents.get("type") == "file":
content_path = contents["path"]
if self._should_include(content_path, extensions, exclude_dirs):
files[content_path] = self.get_file_content(repo, content_path)
return
for item in contents:
item_path = item.get("path", "")
item_type = item.get("type")
if item_type == "dir":
# Check if excluded
if not any(excl in item_path for excl in exclude_dirs):
walk_directory(item_path)
elif item_type == "file":
if self._should_include(item_path, extensions, exclude_dirs):
files[item_path] = self.get_file_content(repo, item_path)
walk_directory()
return files
def _should_include(
self,
path: str,
extensions: list[str],
exclude_dirs: list[str],
) -> bool:
"""Check if file should be included."""
# Exclude directories
for excl in exclude_dirs:
if excl in path:
return False
# Check extension
return any(path.endswith(ext) for ext in extensions)
def extract_text_from_files(self, files: dict[str, str]) -> str:
"""Combine all file contents into a single text blob.
Args:
files: Dictionary of filename -> content
Returns:
Combined text
"""
combined = []
for filename, content in sorted(files.items()):
combined.append(f"=== {filename} ===\n")
combined.append(content)
combined.append("\n\n")
return "".join(combined)
def ingest_repo(
self,
repo: str,
include_readme: bool = True,
) -> dict[str, Any]:
"""Ingest a complete repository.
Args:
repo: "owner/repo" format
include_readme: Include README.md files
Returns:
Dictionary with files, combined_text, and metadata
"""
# Get all markdown and text files
files = self.get_all_files(repo)
# Optionally exclude README
if not include_readme:
files = {k: v for k, v in files.items() if "README" not in k}
# Combine into single text
combined = self.extract_text_from_files(files)
return {
"repo": repo,
"files": files,
"combined_text": combined,
"file_count": len(files),
"total_chars": len(combined),
}
def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor:
"""Factory function to create GitHub ingestor."""
return GitHubIngestor(token=token)