"""GitHub ingestion for Opus Orchestrator. Fetches content from GitHub repositories for use as source material. """ import os import base64 import re from typing import Any, Optional import requests # Note: dotenv loading removed - set GITHUB_TOKEN environment variable directly class GitHubIngestor: """Fetch and parse content from GitHub repositories.""" def __init__(self, token: Optional[str] = None): self.token = token or os.environ.get("GITHUB_TOKEN") # Token is optional - only required for private repos # Public repos can be accessed without authentication if self.token: self.headers = { "Authorization": f"token {self.token}", "Accept": "application/vnd.github.v3+json", } else: # No token - use unauthenticated requests (rate limited) self.headers = { "Accept": "application/vnd.github.v3+json", } print("⚠️ No GitHub token provided. Using unauthenticated requests (rate limited).") self.base_url = "https://api.github.com" def get_contents(self, repo: str, path: str = "") -> list[dict]: """Get contents of a directory or file. Args: repo: "owner/repo" format path: directory path (default: root) Returns: List of content items """ url = f"{self.base_url}/repos/{repo}/contents/{path}" response = requests.get(url, headers=self.headers) response.raise_for_status() return response.json() def get_file_content(self, repo: str, path: str) -> str: """Get content of a single file. Args: repo: "owner/repo" format path: file path Returns: Decoded file content """ url = f"{self.base_url}/repos/{repo}/contents/{path}" response = requests.get(url, headers=self.headers) response.raise_for_status() data = response.json() # Decode base64 content if data.get("encoding") == "base64": content = base64.b64decode(data["content"]).decode("utf-8") return content return data.get("content", "") def get_all_files( self, repo: str, extensions: Optional[list[str]] = None, exclude_dirs: Optional[list[str]] = None, include_all: bool = True, ) -> dict[str, str]: """Get all files from a repository - INCLUDING SOURCE CODE. The AI witnesses EVERYTHING and transforms it into documentation. Don't filter what the AI can see - let it decide what's relevant. Args: repo: "owner/repo" format extensions: File extensions to include (None = ALL files!) exclude_dirs: Directories to exclude (build artifacts, etc.) include_all: If True, include ALL files (default True!) Returns: Dictionary mapping file paths to content """ # Default: include ALL files - the AI will witness everything! if include_all: extensions = None # No extension filter exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"] else: extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"] exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"] files = {} def walk_directory(path: str = ""): contents = self.get_contents(repo, path) if isinstance(contents, dict): # Single file if contents.get("type") == "file": content_path = contents["path"] if self._should_include(content_path, extensions, exclude_dirs, include_all): files[content_path] = self.get_file_content(repo, content_path) return for item in contents: item_path = item.get("path", "") item_type = item.get("type") if item_type == "dir": # Check if excluded if not any(excl in item_path for excl in exclude_dirs): walk_directory(item_path) elif item_type == "file": if self._should_include(item_path, extensions, exclude_dirs, include_all): files[item_path] = self.get_file_content(repo, item_path) walk_directory() return files def _should_include( self, path: str, extensions: Optional[list[str]], exclude_dirs: list[str], include_all: bool = True, ) -> bool: """Check if file should be included. Args: path: File path to check extensions: List of extensions (None if include_all=True) exclude_dirs: Directories to exclude include_all: Include ALL files (ignore extensions) """ # Exclude directories for excl in exclude_dirs: if excl in path: return False # If include_all, include everything if include_all: return True # Otherwise check extensions if extensions: return any(path.endswith(ext) for ext in extensions) return True def extract_text_from_files(self, files: dict[str, str]) -> str: """Combine all file contents into a single text blob. Args: files: Dictionary of filename -> content Returns: Combined text """ combined = [] for filename, content in sorted(files.items()): combined.append(f"=== {filename} ===\n") combined.append(content) combined.append("\n\n") return "".join(combined) def ingest_repo( self, repo: str, include_readme: bool = True, ) -> dict[str, Any]: """Ingest a complete repository. Args: repo: "owner/repo" format include_readme: Include README.md files Returns: Dictionary with files, combined_text, and metadata """ # Get all markdown and text files files = self.get_all_files(repo) # Optionally exclude README if not include_readme: files = {k: v for k, v in files.items() if "README" not in k} # Combine into single text combined = self.extract_text_from_files(files) return { "repo": repo, "files": files, "combined_text": combined, "file_count": len(files), "total_chars": len(combined), } def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor: """Factory function to create GitHub ingestor.""" return GitHubIngestor(token=token)