# opus-orchestrator-ai/opus_orchestrator/utils/github_ingest.py

"""GitHub ingestion for Opus Orchestrator.

Fetches content from GitHub repositories for use as source material.
"""

import base64
import os
import re
from typing import Any, Optional
from urllib.parse import quote

import requests
# Note: dotenv loading removed - set GITHUB_TOKEN environment variable directly


class GitHubIngestor:
    """Fetch and parse content from GitHub repositories.

    All network access goes through the GitHub REST "contents" endpoint
    (``/repos/{repo}/contents/{path}``). Requests are authenticated when a
    token is available and anonymous otherwise.
    """

    # Seconds to wait on each GitHub request before giving up. Without a
    # timeout, ``requests`` blocks indefinitely on a stalled connection.
    timeout: float = 30.0

    def __init__(self, token: Optional[str] = None, timeout: Optional[float] = None):
        """Create an ingestor.

        Args:
            token: GitHub token. Falls back to the ``GITHUB_TOKEN``
                environment variable when omitted or empty.
            timeout: Per-request timeout in seconds. ``None`` keeps the
                class default.
        """
        self.token = token or os.environ.get("GITHUB_TOKEN")
        if timeout is not None:
            self.timeout = timeout

        # Token is optional - only required for private repos
        self.headers = {"Accept": "application/vnd.github.v3+json"}
        if self.token:
            self.headers["Authorization"] = f"token {self.token}"
        else:
            print("⚠️  No GitHub token provided. Using unauthenticated requests (rate limited).")

        self.base_url = "https://api.github.com"

    def _fetch_contents(self, repo: str, path: str, branch: Optional[str]) -> Any:
        """Call the contents endpoint and return the decoded JSON payload.

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
        """
        # Quote the path so characters such as '#', '?' or spaces are sent as
        # part of the path instead of being parsed as a fragment or query.
        url = f"{self.base_url}/repos/{repo}/contents/{quote(path)}"
        # Let requests build the query string so the ref is URL-encoded.
        params = {"ref": branch} if branch else None

        response = requests.get(
            url, headers=self.headers, params=params, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
        """Get contents of a directory or file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path inside the repository; empty for the root.
            branch: Optional ref to read from; the request carries no
                ``ref`` when omitted.

        Returns:
            The JSON payload as returned by GitHub. ``get_all_files`` handles
            both a list of entries and a single dict, so callers should not
            assume a list despite the annotation.

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
        """
        return self._fetch_contents(repo, path, branch)

    def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
        """Get content of a single file.

        Returns:
            The file text. Base64 payloads are decoded as UTF-8; for any other
            encoding the ``content`` field is returned as-is (empty string if
            absent).

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
            ValueError: If ``path`` does not resolve to a single file.
            UnicodeDecodeError: If the file is not valid UTF-8 (e.g. binary).
        """
        data = self._fetch_contents(repo, path, branch)
        if not isinstance(data, dict):
            # A non-dict payload (e.g. a directory listing) has no content.
            raise ValueError(f"{path!r} in {repo} is not a single file")

        # Decode base64 content
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8")

        return data.get("content", "")

    def get_all_files(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        extensions: Optional[list[str]] = None,
        exclude_dirs: Optional[list[str]] = None,
        include_all: bool = True,
    ) -> dict[str, str]:
        """Get all files from a repository.

        Walks the tree below ``path`` recursively. The walk is best-effort:
        directories or files that fail to load are reported on stdout and
        skipped.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Directory (or single file) to start from.
            extensions: File suffixes to keep. Only honoured when
                ``include_all`` is False; defaults to ``.md`` and ``.txt``.
            exclude_dirs: Directory names (or ``a/b`` sub-paths) to skip.
                ``None`` or an empty list selects the built-in defaults.
            include_all: When True (the default) every file is kept and
                ``extensions`` is ignored.

        Returns:
            Mapping of repository-relative file path to decoded text.
        """
        if include_all:
            extensions = None
            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
        else:
            extensions = extensions or [".md", ".txt"]
            exclude_dirs = exclude_dirs or [".git", "node_modules"]

        files: dict[str, str] = {}

        def read_file(file_path: str) -> None:
            """Fetch one file into ``files`` if it passes the filters."""
            if not self._should_include(file_path, extensions, exclude_dirs, include_all):
                return
            try:
                files[file_path] = self.get_file_content(repo, file_path, branch)
            except Exception as e:
                # Best-effort: one unreadable (e.g. binary) file must not
                # abort the whole ingest.
                print(f"Error reading {file_path}: {e}")

        def walk_directory(current_path: str = "") -> None:
            """Recursively collect files below ``current_path``."""
            try:
                contents = self.get_contents(repo, current_path, branch)
            except Exception as e:
                print(f"Error walking {current_path}: {e}")
                return

            # A single dict payload means ``current_path`` is not a listing.
            if isinstance(contents, dict):
                if contents.get("type") == "file":
                    read_file(contents["path"])
                return

            for item in contents:
                item_path = item.get("path", "")
                item_type = item.get("type")

                if item_type == "dir":
                    if not self._is_excluded(item_path, exclude_dirs):
                        walk_directory(item_path)
                elif item_type == "file":
                    read_file(item_path)

        walk_directory(path)
        return files

    @staticmethod
    def _is_excluded(path: str, exclude_dirs: list[str]) -> bool:
        """Return True if any entry of ``exclude_dirs`` is a component of ``path``.

        Matching is done on whole ``/``-separated components, so ``build``
        matches ``src/build/x`` but not ``rebuild``. Multi-segment entries
        such as ``docs/internal`` must match consecutive components.
        """
        parts = [part for part in path.split("/") if part]
        for excl in exclude_dirs:
            wanted = [part for part in excl.split("/") if part]
            if not wanted:
                # An empty entry would otherwise match every path.
                continue
            width = len(wanted)
            for start in range(len(parts) - width + 1):
                if parts[start:start + width] == wanted:
                    return True
        return False

    def _should_include(
        self,
        path: str,
        extensions: Optional[list[str]],
        exclude_dirs: list[str],
        include_all: bool = True,
    ) -> bool:
        """Check if file should be included.

        Args:
            path: Repository-relative file path.
            extensions: Allowed suffixes; consulted only when ``include_all``
                is False. ``None`` or empty allows every suffix.
            exclude_dirs: Directory names whose contents are rejected.
            include_all: When True, every non-excluded file is accepted.
        """
        # Only the directory part is tested: a file merely named like an
        # excluded directory (or sharing a prefix, e.g. ".gitignore" vs
        # ".git") is not inside that directory.
        parent_dir = path.rpartition("/")[0]
        if self._is_excluded(parent_dir, exclude_dirs):
            return False
        if include_all:
            return True
        if extensions:
            return any(path.endswith(ext) for ext in extensions)
        return True

    def extract_text_from_files(self, files: dict[str, str]) -> str:
        """Combine all file contents.

        Files are emitted in sorted path order, each preceded by a
        ``=== path ===`` header line and followed by a blank line.
        """
        combined = []
        for filename, content in sorted(files.items()):
            combined.append(f"=== {filename} ===\n")
            combined.append(content)
            combined.append("\n\n")
        return "".join(combined)

    def ingest_repo(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        include_readme: bool = True,
    ) -> dict[str, Any]:
        """Ingest a complete repository.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Sub-path to start from; empty for the repository root.
            include_readme: When False, every file whose path contains the
                text ``README`` (case-sensitive) is dropped.

        Returns:
            Dict with the request parameters (``repo``, ``branch``, ``path``),
            the ``files`` mapping, the concatenated ``combined_text``, and the
            ``file_count`` / ``total_chars`` summary numbers.
        """
        files = self.get_all_files(repo, branch, path)
        if not include_readme:
            files = {k: v for k, v in files.items() if "README" not in k}
        combined = self.extract_text_from_files(files)
        return {
            "repo": repo,
            "branch": branch,
            "path": path,
            "files": files,
            "combined_text": combined,
            "file_count": len(files),
            "total_chars": len(combined),
        }


def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor:
    """Build a ``GitHubIngestor`` for the given token.

    Args:
        token: GitHub token forwarded unchanged to ``GitHubIngestor``;
            may be ``None``.

    Returns:
        A newly constructed ``GitHubIngestor``.
    """
    ingestor = GitHubIngestor(token=token)
    return ingestor