13bce7500c
This commit includes: - A full code review and bug fixes for language drift, package loading, and CLI crashes. - The generated 15,000-word philosophy manuscript. - CODE_REVIEW.md and CHANGELOG.md documenting the process.
166 lines
5.7 KiB
Python
166 lines
5.7 KiB
Python
"""GitHub ingestion for Opus Orchestrator.

Fetches content from GitHub repositories for use as source material.
"""
|
|
|
|
import os
|
|
import base64
|
|
import re
|
|
from typing import Any, Optional
|
|
|
|
import requests
|
|
# Note: dotenv loading removed - set GITHUB_TOKEN environment variable directly
|
|
|
|
|
|
class GitHubIngestor:
    """Fetch and parse content from GitHub repositories.

    Talks to the GitHub REST "contents" endpoint using ``requests``. A token
    is optional: when one is supplied (explicitly or through the
    ``GITHUB_TOKEN`` environment variable) it is sent as an ``Authorization``
    header; otherwise requests are sent unauthenticated.
    """

    # Seconds to wait on a single API call. Without a timeout ``requests``
    # blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, token: Optional[str] = None):
        """Create an ingestor.

        Args:
            token: GitHub access token. Falls back to the ``GITHUB_TOKEN``
                environment variable when omitted or empty.
        """
        self.token = token or os.environ.get("GITHUB_TOKEN")

        # Token is optional - only required for private repos
        self.headers = {"Accept": "application/vnd.github.v3+json"}
        if self.token:
            self.headers["Authorization"] = f"token {self.token}"
        else:
            print("⚠️ No GitHub token provided. Using unauthenticated requests (rate limited).")

        self.base_url = "https://api.github.com"

    def _request_contents(self, repo: str, path: str, branch: Optional[str]) -> Any:
        """Call the contents endpoint for *path* and return the decoded JSON."""
        # Local import keeps this block self-contained; quoting stops
        # characters such as '#' or '?' in a path from being parsed as a URL
        # fragment or query string. '/' stays unescaped (quote's default).
        from urllib.parse import quote

        url = f"{self.base_url}/repos/{repo}/contents/{quote(path)}"
        # Let requests encode the ref so branch names with special characters
        # survive intact.
        params = {"ref": branch} if branch else None

        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
        """Get contents of a directory or file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path inside the repository; empty string for the root.
            branch: Optional ref sent as the ``ref`` query parameter.

        Returns:
            The decoded JSON response. ``get_all_files`` handles both a list
            of entries and a single dict, so callers should not assume a list.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        return self._request_contents(repo, path, branch)

    def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
        """Get content of a single file.

        Args:
            repo: Repository in ``owner/name`` form.
            path: Path of the file inside the repository.
            branch: Optional ref sent as the ``ref`` query parameter.

        Returns:
            The file text. Base64 payloads are decoded as UTF-8; any other
            payload is returned as-is, or ``""`` when there is none.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            ValueError: If *path* resolves to a listing rather than a file.
            UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
        """
        data = self._request_contents(repo, path, branch)

        if isinstance(data, list):
            # A list response means a directory listing; fail clearly instead
            # of with an AttributeError on ``list.get``.
            raise ValueError(f"{path!r} is a directory, not a file")

        # Decode base64 content
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8")

        # ``or ""`` guards against an explicit null content field.
        return data.get("content") or ""

    def get_all_files(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        extensions: Optional[list[str]] = None,
        exclude_dirs: Optional[list[str]] = None,
        include_all: bool = True,
    ) -> dict[str, str]:
        """Get all files from a repository.

        Walks the tree recursively starting at *path*. Failures while listing
        a directory or reading a file are printed and skipped so one bad
        entry does not abort the whole walk.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Directory (or single file) to start from.
            extensions: File suffixes to keep. Only honoured when
                *include_all* is False; defaults to ``[".md", ".txt"]``.
            exclude_dirs: Directory names to skip. Defaults depend on
                *include_all*.
            include_all: When True (the default) every file outside the
                excluded directories is fetched and *extensions* is ignored.

        Returns:
            Mapping of repository path to file text.
        """
        if include_all:
            extensions = None
            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
        else:
            extensions = extensions or [".md", ".txt"]
            exclude_dirs = exclude_dirs or [".git", "node_modules"]

        files: dict[str, str] = {}

        def read_file(file_path: str) -> None:
            # Best-effort read shared by both branches below: a file that
            # cannot be fetched or decoded is reported and skipped.
            try:
                files[file_path] = self.get_file_content(repo, file_path, branch)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

        def walk_directory(current_path: str = "") -> None:
            try:
                contents = self.get_contents(repo, current_path, branch)
            except Exception as e:
                print(f"Error walking {current_path}: {e}")
                return

            # A dict response means the starting path was a single entry
            # rather than a directory listing.
            if isinstance(contents, dict):
                if contents.get("type") == "file":
                    content_path = contents["path"]
                    if self._should_include(content_path, extensions, exclude_dirs, include_all):
                        read_file(content_path)
                return

            for item in contents:
                item_path = item.get("path", "")
                item_type = item.get("type")

                if item_type == "dir":
                    if not self._dir_is_excluded(item_path.split("/"), exclude_dirs):
                        walk_directory(item_path)
                elif item_type == "file":
                    if self._should_include(item_path, extensions, exclude_dirs, include_all):
                        read_file(item_path)

        walk_directory(path)
        return files

    @staticmethod
    def _dir_is_excluded(dir_parts: list[str], exclude_dirs: list[str]) -> bool:
        """Return True if *dir_parts* contains any excluded directory.

        Matching is done on whole path components, so ``build`` excludes
        ``build/x`` but not ``rebuild/x``. An exclusion containing slashes
        (``docs/internal``) must match a contiguous run of components.
        """
        for excl in exclude_dirs:
            excl_parts = [part for part in excl.split("/") if part]
            if not excl_parts:
                continue
            width = len(excl_parts)
            for start in range(len(dir_parts) - width + 1):
                if dir_parts[start:start + width] == excl_parts:
                    return True
        return False

    def _should_include(
        self,
        path: str,
        extensions: Optional[list[str]],
        exclude_dirs: list[str],
        include_all: bool = True,
    ) -> bool:
        """Check if file should be included.

        Args:
            path: Repository path of the file, ``/``-separated.
            extensions: Suffixes to accept when *include_all* is False;
                ``None`` or empty accepts every suffix.
            exclude_dirs: Directory names whose contents are rejected.
            include_all: When True, skip the suffix check entirely.

        Returns:
            True if the file lies outside every excluded directory and passes
            the suffix filter.
        """
        # Only the directory part is tested, so a file such as ``.gitignore``
        # is not rejected just because ``.git`` is an excluded directory.
        if self._dir_is_excluded(path.split("/")[:-1], exclude_dirs):
            return False
        if include_all:
            return True
        if extensions:
            return path.endswith(tuple(extensions))
        return True

    def extract_text_from_files(self, files: dict[str, str]) -> str:
        """Combine all file contents.

        Files are emitted in sorted path order, each preceded by a
        ``=== path ===`` header line and followed by a blank line.
        """
        combined = []
        for filename, content in sorted(files.items()):
            combined.append(f"=== {filename} ===\n")
            combined.append(content)
            combined.append("\n\n")
        return "".join(combined)

    def ingest_repo(
        self,
        repo: str,
        branch: Optional[str] = None,
        path: str = "",
        include_readme: bool = True,
    ) -> dict[str, Any]:
        """Ingest a complete repository.

        Args:
            repo: Repository in ``owner/name`` form.
            branch: Optional ref to read from.
            path: Directory to start from; empty string for the root.
            include_readme: When False, drop every file whose path contains
                the exact (case-sensitive) substring ``README``.

        Returns:
            A dict with the request parameters (``repo``, ``branch``,
            ``path``), the ``files`` mapping, their ``combined_text``, and
            the ``file_count`` / ``total_chars`` totals.
        """
        files = self.get_all_files(repo, branch, path)
        if not include_readme:
            files = {k: v for k, v in files.items() if "README" not in k}
        combined = self.extract_text_from_files(files)
        return {
            "repo": repo,
            "branch": branch,
            "path": path,
            "files": files,
            "combined_text": combined,
            "file_count": len(files),
            "total_chars": len(combined),
        }
|
|
|
|
|
|
def create_github_ingestor(token: Optional[str] = None) -> GitHubIngestor:
    """Build and return a ``GitHubIngestor``.

    Args:
        token: Optional GitHub token, forwarded unchanged to the constructor.

    Returns:
        A newly constructed ``GitHubIngestor``.
    """
    ingestor = GitHubIngestor(token=token)
    return ingestor
|