feat: Full code review, bug fixes, and philosophy book generation
This commit includes: (1) a full code review and bug fixes for language drift, package loading, and CLI crashes; (2) the generated 15,000-word philosophy manuscript; and (3) CODE_REVIEW.md and CHANGELOG.md, which document the process.
This commit is contained in:
@@ -19,14 +19,12 @@ class GitHubIngestor:
|
||||
self.token = token or os.environ.get("GITHUB_TOKEN")
|
||||
|
||||
# Token is optional - only required for private repos
|
||||
# Public repos can be accessed without authentication
|
||||
if self.token:
|
||||
self.headers = {
|
||||
"Authorization": f"token {self.token}",
|
||||
"Accept": "application/vnd.github.v3+json",
|
||||
}
|
||||
else:
|
||||
# No token - use unauthenticated requests (rate limited)
|
||||
self.headers = {
|
||||
"Accept": "application/vnd.github.v3+json",
|
||||
}
|
||||
@@ -34,34 +32,22 @@ class GitHubIngestor:
|
||||
|
||||
self.base_url = "https://api.github.com"
|
||||
|
||||
def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
    """Get contents of a directory or file.

    Args:
        repo: "owner/repo" format
        path: directory path (default: root)
        branch: optional branch name, sent to GitHub as the ``ref`` query
            parameter. When omitted (or empty) no ``ref`` is sent.

    Returns:
        The parsed JSON response: a list of content items for a
        directory. For a path that names a single file the API answers
        with one object instead of a list, which ``get_all_files``
        handles via its ``isinstance(contents, dict)`` check.

    Raises:
        requests.HTTPError: if the API responds with an error status
            (raised by ``response.raise_for_status()``).
    """
    url = f"{self.base_url}/repos/{repo}/contents/{path}"

    # Hand the ref to requests via `params` instead of concatenating it
    # onto the URL, so characters such as '&', '#', '+' or spaces in a
    # branch name are percent-encoded rather than corrupting the query.
    query = {"ref": branch} if branch else None

    response = requests.get(url, headers=self.headers, params=query)
    response.raise_for_status()

    return response.json()
|
||||
|
||||
def get_file_content(self, repo: str, path: str) -> str:
|
||||
"""Get content of a single file.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
path: file path
|
||||
|
||||
Returns:
|
||||
Decoded file content
|
||||
"""
|
||||
def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
|
||||
"""Get content of a single file."""
|
||||
url = f"{self.base_url}/repos/{repo}/contents/{path}"
|
||||
if branch:
|
||||
url += f"?ref={branch}"
|
||||
|
||||
response = requests.get(url, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
@@ -78,43 +64,34 @@ class GitHubIngestor:
|
||||
def get_all_files(
|
||||
self,
|
||||
repo: str,
|
||||
branch: Optional[str] = None,
|
||||
path: str = "",
|
||||
extensions: Optional[list[str]] = None,
|
||||
exclude_dirs: Optional[list[str]] = None,
|
||||
include_all: bool = True,
|
||||
) -> dict[str, str]:
|
||||
"""Get all files from a repository - INCLUDING SOURCE CODE.
|
||||
|
||||
The AI witnesses EVERYTHING and transforms it into documentation.
|
||||
Don't filter what the AI can see - let it decide what's relevant.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
extensions: File extensions to include (None = ALL files!)
|
||||
exclude_dirs: Directories to exclude (build artifacts, etc.)
|
||||
include_all: If True, include ALL files (default True!)
|
||||
|
||||
Returns:
|
||||
Dictionary mapping file paths to content
|
||||
"""
|
||||
# Default: include ALL files - the AI will witness everything!
|
||||
"""Get all files from a repository."""
|
||||
if include_all:
|
||||
extensions = None # No extension filter
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
|
||||
extensions = None
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
|
||||
else:
|
||||
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
||||
extensions = extensions or [".md", ".txt"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules"]
|
||||
|
||||
files = {}
|
||||
|
||||
def walk_directory(path: str = ""):
|
||||
contents = self.get_contents(repo, path)
|
||||
def walk_directory(current_path: str = ""):
|
||||
try:
|
||||
contents = self.get_contents(repo, current_path, branch)
|
||||
except Exception as e:
|
||||
print(f"Error walking {current_path}: {e}")
|
||||
return
|
||||
|
||||
if isinstance(contents, dict):
|
||||
# Single file
|
||||
if contents.get("type") == "file":
|
||||
content_path = contents["path"]
|
||||
if self._should_include(content_path, extensions, exclude_dirs, include_all):
|
||||
files[content_path] = self.get_file_content(repo, content_path)
|
||||
files[content_path] = self.get_file_content(repo, content_path, branch)
|
||||
return
|
||||
|
||||
for item in contents:
|
||||
@@ -122,14 +99,16 @@ class GitHubIngestor:
|
||||
item_type = item.get("type")
|
||||
|
||||
if item_type == "dir":
|
||||
# Check if excluded
|
||||
if not any(excl in item_path for excl in exclude_dirs):
|
||||
walk_directory(item_path)
|
||||
elif item_type == "file":
|
||||
if self._should_include(item_path, extensions, exclude_dirs, include_all):
|
||||
files[item_path] = self.get_file_content(repo, item_path)
|
||||
try:
|
||||
files[item_path] = self.get_file_content(repo, item_path, branch)
|
||||
except Exception as e:
|
||||
print(f"Error reading {item_path}: {e}")
|
||||
|
||||
walk_directory()
|
||||
walk_directory(path)
|
||||
return files
|
||||
|
||||
def _should_include(
|
||||
@@ -139,73 +118,41 @@ class GitHubIngestor:
|
||||
exclude_dirs: list[str],
|
||||
include_all: bool = True,
|
||||
) -> bool:
|
||||
"""Check if file should be included.
|
||||
|
||||
Args:
|
||||
path: File path to check
|
||||
extensions: List of extensions (None if include_all=True)
|
||||
exclude_dirs: Directories to exclude
|
||||
include_all: Include ALL files (ignore extensions)
|
||||
"""
|
||||
# Exclude directories
|
||||
"""Check if file should be included."""
|
||||
for excl in exclude_dirs:
|
||||
if excl in path:
|
||||
return False
|
||||
|
||||
# If include_all, include everything
|
||||
if include_all:
|
||||
return True
|
||||
|
||||
# Otherwise check extensions
|
||||
if extensions:
|
||||
return any(path.endswith(ext) for ext in extensions)
|
||||
|
||||
return True
|
||||
|
||||
def extract_text_from_files(self, files: dict[str, str]) -> str:
    """Combine all file contents into a single text blob.

    Files are emitted in sorted order. Each one contributes a
    ``=== <filename> ===`` header line, then its content, then two
    newlines as a separator.

    Args:
        files: Dictionary of filename -> content

    Returns:
        Combined text (an empty string when ``files`` is empty)
    """
    # Flatten (header, content, separator) for every file straight into
    # one join instead of growing an intermediate list.
    return "".join(
        piece
        for filename, content in sorted(files.items())
        for piece in (f"=== {filename} ===\n", content, "\n\n")
    )
|
||||
|
||||
def ingest_repo(
|
||||
self,
|
||||
repo: str,
|
||||
branch: Optional[str] = None,
|
||||
path: str = "",
|
||||
include_readme: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
"""Ingest a complete repository.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
include_readme: Include README.md files
|
||||
|
||||
Returns:
|
||||
Dictionary with files, combined_text, and metadata
|
||||
"""
|
||||
# Get all markdown and text files
|
||||
files = self.get_all_files(repo)
|
||||
|
||||
# Optionally exclude README
|
||||
"""Ingest a complete repository."""
|
||||
files = self.get_all_files(repo, branch, path)
|
||||
if not include_readme:
|
||||
files = {k: v for k, v in files.items() if "README" not in k}
|
||||
|
||||
# Combine into single text
|
||||
combined = self.extract_text_from_files(files)
|
||||
|
||||
return {
|
||||
"repo": repo,
|
||||
"branch": branch,
|
||||
"path": path,
|
||||
"files": files,
|
||||
"combined_text": combined,
|
||||
"file_count": len(files),
|
||||
|
||||
Reference in New Issue
Block a user