feat: Full code review, bug fixes, and philosophy book generation

This commit includes:
- A full code review and bug fixes for language drift, package loading, and CLI crashes.
- The generated 15,000-word philosophy manuscript.
- CODE_REVIEW.md and CHANGELOG.md documenting the process.
2026-05-20 21:15:11 +00:00
parent dddf5c4a80
commit 13bce7500c
13 changed files with 1160 additions and 198 deletions
@@ -19,14 +19,12 @@ class GitHubIngestor:
        self.token = token or os.environ.get("GITHUB_TOKEN")
        
        # Token is optional - only required for private repos
-        # Public repos can be accessed without authentication
        if self.token:
            self.headers = {
                "Authorization": f"token {self.token}",
                "Accept": "application/vnd.github.v3+json",
            }
        else:
-            # No token - use unauthenticated requests (rate limited)
            self.headers = {
                "Accept": "application/vnd.github.v3+json",
            }
@@ -34,34 +32,22 @@ class GitHubIngestor:
        
        self.base_url = "https://api.github.com"
    
-    def get_contents(self, repo: str, path: str = "") -> list[dict]:
-        """Get contents of a directory or file.
-        
-        Args:
-            repo: "owner/repo" format
-            path: directory path (default: root)
-            
-        Returns:
-            List of content items
-        """
+    def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
+        """Get contents of a directory or file."""
        url = f"{self.base_url}/repos/{repo}/contents/{path}"
+        if branch:
+            url += f"?ref={branch}"
        
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        
        return response.json()
    
-    def get_file_content(self, repo: str, path: str) -> str:
-        """Get content of a single file.
-        
-        Args:
-            repo: "owner/repo" format
-            path: file path
-            
-        Returns:
-            Decoded file content
-        """
+    def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
+        """Get content of a single file."""
        url = f"{self.base_url}/repos/{repo}/contents/{path}"
+        if branch:
+            url += f"?ref={branch}"
        
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
@@ -78,43 +64,34 @@ class GitHubIngestor:
    def get_all_files(
        self,
        repo: str,
+        branch: Optional[str] = None,
+        path: str = "",
        extensions: Optional[list[str]] = None,
        exclude_dirs: Optional[list[str]] = None,
        include_all: bool = True,
    ) -> dict[str, str]:
-        """Get all files from a repository - INCLUDING SOURCE CODE.
-        
-        The AI witnesses EVERYTHING and transforms it into documentation.
-        Don't filter what the AI can see - let it decide what's relevant.
-        
-        Args:
-            repo: "owner/repo" format
-            extensions: File extensions to include (None = ALL files!)
-            exclude_dirs: Directories to exclude (build artifacts, etc.)
-            include_all: If True, include ALL files (default True!)
-            
-        Returns:
-            Dictionary mapping file paths to content
-        """
-        # Default: include ALL files - the AI will witness everything!
+        """Get all files from a repository."""
        if include_all:
-            extensions = None  # No extension filter
-            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
+            extensions = None
+            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
        else:
-            extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
-            exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
+            extensions = extensions or [".md", ".txt"]
+            exclude_dirs = exclude_dirs or [".git", "node_modules"]
        
        files = {}
        
-        def walk_directory(path: str = ""):
-            contents = self.get_contents(repo, path)
+        def walk_directory(current_path: str = ""):
+            try:
+                contents = self.get_contents(repo, current_path, branch)
+            except Exception as e:
+                print(f"Error walking {current_path}: {e}")
+                return
            
            if isinstance(contents, dict):
-                # Single file
                if contents.get("type") == "file":
                    content_path = contents["path"]
                    if self._should_include(content_path, extensions, exclude_dirs, include_all):
-                        files[content_path] = self.get_file_content(repo, content_path)
+                        files[content_path] = self.get_file_content(repo, content_path, branch)
                return
            
            for item in contents:
@@ -122,14 +99,16 @@ class GitHubIngestor:
                item_type = item.get("type")
                
                if item_type == "dir":
-                    # Check if excluded
                    if not any(excl in item_path for excl in exclude_dirs):
                        walk_directory(item_path)
                elif item_type == "file":
                    if self._should_include(item_path, extensions, exclude_dirs, include_all):
-                        files[item_path] = self.get_file_content(repo, item_path)
+                        try:
+                            files[item_path] = self.get_file_content(repo, item_path, branch)
+                        except Exception as e:
+                            print(f"Error reading {item_path}: {e}")
        
-        walk_directory()
+        walk_directory(path)
        return files
    
    def _should_include(
@@ -139,73 +118,41 @@ class GitHubIngestor:
        exclude_dirs: list[str],
        include_all: bool = True,
    ) -> bool:
-        """Check if file should be included.
-        
-        Args:
-            path: File path to check
-            extensions: List of extensions (None if include_all=True)
-            exclude_dirs: Directories to exclude
-            include_all: Include ALL files (ignore extensions)
-        """
-        # Exclude directories
+        """Check if file should be included."""
        for excl in exclude_dirs:
            if excl in path:
                return False
-        
-        # If include_all, include everything
        if include_all:
            return True
-        
-        # Otherwise check extensions
        if extensions:
            return any(path.endswith(ext) for ext in extensions)
-        
        return True
    
    def extract_text_from_files(self, files: dict[str, str]) -> str:
-        """Combine all file contents into a single text blob.
-        
-        Args:
-            files: Dictionary of filename -> content
-            
-        Returns:
-            Combined text
-        """
+        """Combine all file contents."""
        combined = []
-        
        for filename, content in sorted(files.items()):
            combined.append(f"=== {filename} ===\n")
            combined.append(content)
            combined.append("\n\n")
-        
        return "".join(combined)
    
    def ingest_repo(
        self,
        repo: str,
+        branch: Optional[str] = None,
+        path: str = "",
        include_readme: bool = True,
    ) -> dict[str, Any]:
-        """Ingest a complete repository.
-        
-        Args:
-            repo: "owner/repo" format
-            include_readme: Include README.md files
-            
-        Returns:
-            Dictionary with files, combined_text, and metadata
-        """
-        # Get all markdown and text files
-        files = self.get_all_files(repo)
-        
-        # Optionally exclude README
+        """Ingest a complete repository."""
+        files = self.get_all_files(repo, branch, path)
        if not include_readme:
            files = {k: v for k, v in files.items() if "README" not in k}
-        
-        # Combine into single text
        combined = self.extract_text_from_files(files)
-        
        return {
            "repo": repo,
+            "branch": branch,
+            "path": path,
            "files": files,
            "combined_text": combined,
            "file_count": len(files),