Files
opus-orchestrator-ai/opus_orchestrator/utils/github_ingest.py
T
Gemini AI 13bce7500c feat: Full code review, bug fixes, and philosophy book generation
This commit includes:
- A full code review and bug fixes for language drift, package loading, and CLI crashes.
- The generated 15,000-word philosophy manuscript.
- CODE_REVIEW.md and CHANGELOG.md documenting the process.
2026-05-20 21:15:11 +00:00

166 lines
5.7 KiB
Python

"""GitHub ingestion for Opus Orchestrator.
Fetches content from GitHub repositories for use as source material.
"""
import base64
import os
import re
from typing import Any, Optional
from urllib.parse import quote

import requests
# Note: dotenv loading removed - set GITHUB_TOKEN environment variable directly
class GitHubIngestor:
    """Fetch and parse content from GitHub repositories.

    All network access goes through the ``/repos/{repo}/contents/{path}``
    endpoint of the GitHub REST API using ``requests``.  A token is optional;
    when none is supplied (and ``GITHUB_TOKEN`` is not set) requests are sent
    without an ``Authorization`` header.

    Attributes:
        token: The token in use, or ``None`` for unauthenticated access.
        headers: HTTP headers sent with every request.
        base_url: Root URL of the GitHub REST API.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Class-level default so that every request is bounded even when the
    # constructor is not given an explicit timeout.
    timeout: float = 30.0

    def __init__(self, token: Optional[str] = None, timeout: Optional[float] = None):
        """Create an ingestor.

        Args:
            token: GitHub token. Falls back to the ``GITHUB_TOKEN``
                environment variable when omitted.
            timeout: Per-request timeout in seconds. ``None`` keeps the
                class default.
        """
        self.token = token or os.environ.get("GITHUB_TOKEN")
        # Token is optional - only required for private repos
        if self.token:
            self.headers = {
                "Authorization": f"token {self.token}",
                "Accept": "application/vnd.github.v3+json",
            }
        else:
            self.headers = {
                "Accept": "application/vnd.github.v3+json",
            }
            print("⚠️ No GitHub token provided. Using unauthenticated requests (rate limited).")
        if timeout is not None:
            self.timeout = timeout
        self.base_url = "https://api.github.com"

    def _fetch_contents(self, repo: str, path: str, branch: Optional[str]) -> Any:
        """Perform one GET against the contents endpoint and return its JSON.

        The path is percent-quoted (slashes preserved) and the branch is sent
        as the ``ref`` query parameter via ``params`` so that names containing
        characters such as ``#``, ``?`` or ``&`` do not corrupt the URL.

        Raises:
            requests.HTTPError: If the response status is an error
                (via ``raise_for_status``).
        """
        url = f"{self.base_url}/repos/{repo}/contents/{quote(path)}"
        params = {"ref": branch} if branch else None
        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
        """Get contents of a directory or file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path inside the repository; empty string for the root.
            branch: Optional ref to read from; omitted from the request when
                falsy.

        Returns:
            The decoded JSON response. ``get_all_files`` handles both a list
            (directory listing) and a single dict (file) from this method.
        """
        return self._fetch_contents(repo, path, branch)

    def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
        """Get content of a single file.

        Returns:
            The file text. When the response declares ``encoding == "base64"``
            the payload is base64-decoded and then decoded as UTF-8;
            otherwise the raw ``content`` field (or ``""``) is returned.

        Raises:
            requests.HTTPError: On an error status.
            UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
        """
        data = self._fetch_contents(repo, path, branch)
        # Decode base64 content
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8")
        return data.get("content", "")

    def get_all_files(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        extensions: Optional[list[str]] = None,
        exclude_dirs: Optional[list[str]] = None,
        include_all: bool = True,
    ) -> dict[str, str]:
        """Get all files from a repository.

        Walks ``path`` recursively, one contents request per directory and
        one per file. Failures are printed and skipped so a single bad entry
        does not abort the walk.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Directory (or single file) to start from.
            extensions: File suffixes to keep. NOTE: ignored whenever
                ``include_all`` is true, which is the default.
            exclude_dirs: Path segments to skip. Defaults depend on
                ``include_all``.
            include_all: When true, keep every non-excluded file regardless
                of extension.

        Returns:
            Mapping of repository-relative file path to file text.
        """
        if include_all:
            extensions = None
            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
        else:
            extensions = extensions or [".md", ".txt"]
            exclude_dirs = exclude_dirs or [".git", "node_modules"]

        files: dict[str, str] = {}

        def read_file(file_path: str) -> None:
            # Shared by the single-file and directory-listing cases so both
            # get the same best-effort error handling.
            if not self._should_include(file_path, extensions, exclude_dirs, include_all):
                return
            try:
                files[file_path] = self.get_file_content(repo, file_path, branch)
            except Exception as e:  # deliberate best-effort: report and continue
                print(f"Error reading {file_path}: {e}")

        def walk_directory(current_path: str = "") -> None:
            try:
                contents = self.get_contents(repo, current_path, branch)
            except Exception as e:  # deliberate best-effort: report and continue
                print(f"Error walking {current_path}: {e}")
                return

            # A dict (rather than a list) means the path named a single entry.
            if isinstance(contents, dict):
                if contents.get("type") == "file":
                    read_file(contents["path"])
                return

            for item in contents:
                item_path = item.get("path", "")
                item_type = item.get("type")
                if item_type == "dir":
                    if not self._is_excluded(item_path, exclude_dirs):
                        walk_directory(item_path)
                elif item_type == "file":
                    read_file(item_path)

        walk_directory(path)
        return files

    @staticmethod
    def _is_excluded(path: str, exclude_dirs: list[str]) -> bool:
        """Return True if any exclusion matches whole segments of ``path``.

        Matching is done on ``/``-delimited segments, so ``"build"`` excludes
        ``build/x`` and ``a/build/x`` but not ``src/rebuild.py``. Entries
        with several segments (``"docs/internal"``) match as a contiguous
        run. Empty entries are ignored.
        """
        anchored = f"/{path.strip('/')}/"
        for excl in exclude_dirs:
            segment = excl.strip("/")
            if segment and f"/{segment}/" in anchored:
                return True
        return False

    def _should_include(
        self,
        path: str,
        extensions: Optional[list[str]],
        exclude_dirs: list[str],
        include_all: bool = True,
    ) -> bool:
        """Check if file should be included.

        Exclusions win over everything else. After that, ``include_all``
        accepts the file outright; otherwise the path must end with one of
        ``extensions`` (an empty/``None`` list accepts everything).
        """
        if self._is_excluded(path, exclude_dirs):
            return False
        if include_all or not extensions:
            return True
        return path.endswith(tuple(extensions))

    def extract_text_from_files(self, files: dict[str, str]) -> str:
        """Combine all file contents.

        Files are emitted in sorted path order, each preceded by a
        ``=== path ===`` header line and followed by a blank line.
        """
        return "".join(
            f"=== {filename} ===\n{content}\n\n"
            for filename, content in sorted(files.items())
        )

    def ingest_repo(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        include_readme: bool = True,
    ) -> dict[str, Any]:
        """Ingest a complete repository.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Sub-path to start from; empty string for the root.
            include_readme: When false, drop every file whose path contains
                the substring ``"README"`` (case-sensitive).

        Returns:
            A dict with the request parameters (``repo``, ``branch``,
            ``path``), the ``files`` mapping, the ``combined_text``, and the
            ``file_count`` / ``total_chars`` summary numbers.
        """
        files = self.get_all_files(repo, branch, path)
        if not include_readme:
            files = {k: v for k, v in files.items() if "README" not in k}
        combined = self.extract_text_from_files(files)
        return {
            "repo": repo,
            "branch": branch,
            "path": path,
            "files": files,
            "combined_text": combined,
            "file_count": len(files),
            "total_chars": len(combined),
        }


def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor:
    """Build and return a ``GitHubIngestor``.

    Args:
        token: Optional GitHub token, forwarded unchanged to the
            ``GitHubIngestor`` constructor.

    Returns:
        A newly constructed ``GitHubIngestor``.
    """
    ingestor = GitHubIngestor(token=token)
    return ingestor