"""GitHub ingestion for Opus Orchestrator.

Fetches content from GitHub repositories for use as source material.
"""

import base64
import os
import re
from typing import Any, Optional, Union
from urllib.parse import quote

import requests

# Note: dotenv loading removed - set GITHUB_TOKEN environment variable directly


class GitHubIngestor:
    """Fetch and parse content from GitHub repositories.

    All network access goes through the repository "contents" endpoint of
    the REST API rooted at ``base_url``. A token is optional; without one
    the requests are sent unauthenticated.
    """

    # Seconds to wait for the API before giving up. Kept as a class
    # attribute so the value resolves even if a subclass skips __init__.
    timeout: float = 30.0

    # Directories skipped by get_all_files() when the caller gives none.
    _EXCLUDE_WHEN_INCLUDE_ALL = (
        ".git",
        "node_modules",
        "__pycache__",
        ".github",
        "dist",
        "build",
    )
    _EXCLUDE_WHEN_FILTERED = (".git", "node_modules")

    def __init__(self, token: Optional[str] = None, timeout: float = 30.0):
        """Create an ingestor.

        Args:
            token: GitHub token. Falls back to the ``GITHUB_TOKEN``
                environment variable when omitted.
            timeout: Seconds passed as ``timeout=`` to every HTTP request,
                so a stalled connection cannot hang the caller forever.
        """
        self.token = token or os.environ.get("GITHUB_TOKEN")
        self.timeout = timeout

        # Token is optional - only required for private repos
        self.headers = {"Accept": "application/vnd.github.v3+json"}
        if self.token:
            self.headers["Authorization"] = f"token {self.token}"
        else:
            print("⚠️ No GitHub token provided. Using unauthenticated requests (rate limited).")

        self.base_url = "https://api.github.com"

    def _request_contents(self, repo: str, path: str, branch: Optional[str]) -> Any:
        """GET the contents endpoint for ``path`` and return the decoded JSON.

        Raises:
            requests.HTTPError: if the response status is an error.
            requests.RequestException: on connection failure or timeout.
        """
        # Quote the path so characters such as '#', '?' or spaces stay part
        # of the path instead of being parsed as a fragment or query.
        url = f"{self.base_url}/repos/{repo}/contents/{quote(path, safe='/')}"
        # Let requests encode the ref; branch names may contain '&', '#', etc.
        params = {"ref": branch} if branch else None
        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_contents(
        self,
        repo: str,
        path: str = "",
        branch: Optional[str] = None,
    ) -> Union[list[dict], dict]:
        """Get contents of a directory or file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path inside the repository; ``""`` is the root.
            branch: Ref to read from; the ``ref`` parameter is omitted
                when this is falsy.

        Returns:
            The decoded JSON payload unchanged. get_all_files() handles
            both shapes: a list of entries (directory) or a single dict
            (file).

        Raises:
            requests.HTTPError: if the response status is an error.
        """
        return self._request_contents(repo, path, branch)

    def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
        """Get content of a single file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path of the file inside the repository.
            branch: Ref to read from; omitted from the request when falsy.

        Returns:
            The file text. Base64 payloads are decoded as UTF-8; any other
            payload returns its ``content`` field as-is (``""`` if absent).

        Raises:
            requests.HTTPError: if the response status is an error.
            ValueError: if ``path`` resolves to a directory listing.
            UnicodeDecodeError: if the decoded bytes are not valid UTF-8.
        """
        data = self._request_contents(repo, path, branch)
        if not isinstance(data, dict):
            # A list payload means the path is a directory; fail with a
            # clear message rather than an AttributeError on list.get.
            raise ValueError(f"{path!r} is a directory, not a file")

        # Decode base64 content
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8")
        return data.get("content", "")

    def get_all_files(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        extensions: Optional[list[str]] = None,
        exclude_dirs: Optional[list[str]] = None,
        include_all: bool = True,
    ) -> dict[str, str]:
        """Get all files from a repository.

        Walks ``path`` recursively, issuing one request per directory and
        one per file. The walk is best-effort: a directory or file that
        cannot be read is reported with ``print`` and skipped.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Ref to read from.
            path: Directory (or single file) to start from.
            extensions: File suffixes to keep. Only honoured when
                ``include_all`` is False; defaults to ``.md`` and ``.txt``.
            exclude_dirs: Directory names (or ``a/b`` sub-paths) to skip.
                A falsy value selects the built-in defaults.
            include_all: When True (the default) every file is fetched and
                ``extensions`` is ignored.

        Returns:
            Mapping of repository path to file text.
        """
        if include_all:
            extensions = None
            exclude_dirs = exclude_dirs or list(self._EXCLUDE_WHEN_INCLUDE_ALL)
        else:
            extensions = extensions or [".md", ".txt"]
            exclude_dirs = exclude_dirs or list(self._EXCLUDE_WHEN_FILTERED)

        files: dict[str, str] = {}

        def fetch_file(file_path: str) -> None:
            """Store one file's text in ``files`` if it passes the filters."""
            if not self._should_include(file_path, extensions, exclude_dirs, include_all):
                return
            try:
                files[file_path] = self.get_file_content(repo, file_path, branch)
            except Exception as e:
                # Deliberate best-effort boundary: HTTP errors, binary
                # (non-UTF-8) files, etc. must not abort the whole ingest.
                print(f"Error reading {file_path}: {e}")

        def walk_directory(current_path: str = "") -> None:
            """Recurse through ``current_path`` collecting files."""
            try:
                contents = self.get_contents(repo, current_path, branch)
            except Exception as e:
                print(f"Error walking {current_path}: {e}")
                return

            # A dict payload means current_path is a single entry, not a
            # directory listing.
            if isinstance(contents, dict):
                if contents.get("type") == "file":
                    fetch_file(contents["path"])
                return

            for item in contents:
                item_path = item.get("path", "")
                item_type = item.get("type")
                if item_type == "dir":
                    if not self._is_excluded(item_path, exclude_dirs):
                        walk_directory(item_path)
                elif item_type == "file":
                    fetch_file(item_path)

        walk_directory(path)
        return files

    @staticmethod
    def _is_excluded(dir_path: str, exclude_dirs: list[str]) -> bool:
        """Return True if ``dir_path`` lies in or under an excluded directory.

        Entries are matched against whole path segments, so ``"build"``
        excludes ``build/`` and ``src/build/x`` but not ``rebuilds/``.
        Multi-segment entries such as ``"src/vendor"`` are supported.
        Empty entries are ignored.
        """
        haystack = f"/{dir_path.strip('/')}/"
        for excl in exclude_dirs:
            needle = excl.strip("/")
            if needle and f"/{needle}/" in haystack:
                return True
        return False

    def _should_include(
        self,
        path: str,
        extensions: Optional[list[str]],
        exclude_dirs: list[str],
        include_all: bool = True,
    ) -> bool:
        """Check if file should be included.

        Args:
            path: Repository path of the file.
            extensions: Allowed suffixes; a falsy value allows any suffix.
            exclude_dirs: Directories whose files are rejected.
            include_all: When True the suffix filter is skipped.

        Returns:
            True if the file should be fetched.
        """
        # Only the directory part is tested, so a file such as
        # ".gitignore" is not rejected merely because ".git" is excluded.
        parent_dir = path.rpartition("/")[0]
        if self._is_excluded(parent_dir, exclude_dirs):
            return False

        if include_all or not extensions:
            return True
        return path.endswith(tuple(extensions))

    def extract_text_from_files(self, files: dict[str, str]) -> str:
        """Combine all file contents.

        Files are emitted in sorted path order, each preceded by a
        ``=== path ===`` header line and followed by a blank line.
        """
        return "".join(
            f"=== {filename} ===\n{content}\n\n"
            for filename, content in sorted(files.items())
        )

    def ingest_repo(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        include_readme: bool = True,
    ) -> dict[str, Any]:
        """Ingest a complete repository.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Ref to read from.
            path: Directory to start from.
            include_readme: When False, drop every file whose path
                contains the (case-sensitive) substring ``README``.

        Returns:
            A dict with the keys ``repo``, ``branch``, ``path``, ``files``
            (path -> text), ``combined_text``, ``file_count`` and
            ``total_chars`` (length of ``combined_text``).
        """
        files = self.get_all_files(repo, branch, path)

        if not include_readme:
            files = {k: v for k, v in files.items() if "README" not in k}

        combined = self.extract_text_from_files(files)

        return {
            "repo": repo,
            "branch": branch,
            "path": path,
            "files": files,
            "combined_text": combined,
            "file_count": len(files),
            "total_chars": len(combined),
        }


def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor:
    """Factory function to create GitHub ingestor."""
    return GitHubIngestor(token=token)