Witness ALL data: Include source code in ingestion

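- LocalIngestor: Include ALL files by default (source code, configs, etc.)
  via a new `include_all=True` parameter
- GitHubIngestor: Include ALL files by default via the same parameter
- Note: while `include_all` is True the `extensions` argument is ignored;
  pass `include_all=False` to restore extension-based filtering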
- AI witnesses everything and transforms it into documentation
- Filter only build artifacts, caches, and version-control metadata
  (e.g. .git, *.pyc, *.so, dist, build, *.egg-info)

Philosophy: Don't filter what the AI can see - let it decide
what's relevant. The AI can document code directly!
This commit is contained in:
2026-03-13 05:59:03 +00:00
parent 8cb29889cc
commit 81bfe8994a
2 changed files with 75 additions and 27 deletions
+36 -11
View File
@@ -74,19 +74,29 @@ class GitHubIngestor:
repo: str, repo: str,
extensions: Optional[list[str]] = None, extensions: Optional[list[str]] = None,
exclude_dirs: Optional[list[str]] = None, exclude_dirs: Optional[list[str]] = None,
include_all: bool = True,
) -> dict[str, str]: ) -> dict[str, str]:
"""Get all files from a repository. """Get all files from a repository - INCLUDING SOURCE CODE.
The AI witnesses EVERYTHING and transforms it into documentation.
Don't filter what the AI can see - let it decide what's relevant.
Args: Args:
repo: "owner/repo" format repo: "owner/repo" format
extensions: File extensions to include (e.g., ['.md', '.txt']) extensions: File extensions to include (None = ALL files!)
exclude_dirs: Directories to exclude exclude_dirs: Directories to exclude (build artifacts, etc.)
include_all: If True, include ALL files (default True!)
Returns: Returns:
Dictionary mapping file paths to content Dictionary mapping file paths to content
""" """
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft"] # Default: include ALL files - the AI will witness everything!
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"] if include_all:
extensions = None # No extension filter
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
else:
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
files = {} files = {}
@@ -97,7 +107,7 @@ class GitHubIngestor:
# Single file # Single file
if contents.get("type") == "file": if contents.get("type") == "file":
content_path = contents["path"] content_path = contents["path"]
if self._should_include(content_path, extensions, exclude_dirs): if self._should_include(content_path, extensions, exclude_dirs, include_all):
files[content_path] = self.get_file_content(repo, content_path) files[content_path] = self.get_file_content(repo, content_path)
return return
@@ -110,7 +120,7 @@ class GitHubIngestor:
if not any(excl in item_path for excl in exclude_dirs): if not any(excl in item_path for excl in exclude_dirs):
walk_directory(item_path) walk_directory(item_path)
elif item_type == "file": elif item_type == "file":
if self._should_include(item_path, extensions, exclude_dirs): if self._should_include(item_path, extensions, exclude_dirs, include_all):
files[item_path] = self.get_file_content(repo, item_path) files[item_path] = self.get_file_content(repo, item_path)
walk_directory() walk_directory()
@@ -119,17 +129,32 @@ class GitHubIngestor:
def _should_include( def _should_include(
self, self,
path: str, path: str,
extensions: list[str], extensions: Optional[list[str]],
exclude_dirs: list[str], exclude_dirs: list[str],
include_all: bool = True,
) -> bool: ) -> bool:
"""Check if file should be included.""" """Check if file should be included.
Args:
path: File path to check
extensions: List of extensions (None if include_all=True)
exclude_dirs: Directories to exclude
include_all: Include ALL files (ignore extensions)
"""
# Exclude directories # Exclude directories
for excl in exclude_dirs: for excl in exclude_dirs:
if excl in path: if excl in path:
return False return False
# Check extension # If include_all, include everything
return any(path.endswith(ext) for ext in extensions) if include_all:
return True
# Otherwise check extensions
if extensions:
return any(path.endswith(ext) for ext in extensions)
return True
def extract_text_from_files(self, files: dict[str, str]) -> str: def extract_text_from_files(self, files: dict[str, str]) -> str:
"""Combine all file contents into a single text blob. """Combine all file contents into a single text blob.
+39 -16
View File
@@ -1,6 +1,7 @@
"""Local file ingestion for Opus Orchestrator. """Local file ingestion for Opus Orchestrator.
Fetches content from local files and directories. Fetches content from local files and directories - INCLUDING SOURCE CODE.
The AI witnesses ALL data and transforms it into documentation.
""" """
import os import os
@@ -13,10 +14,12 @@ class LocalIngestor:
"""Fetch and parse content from local files and directories. """Fetch and parse content from local files and directories.
Supports: Supports:
- Individual files - ALL file types (including source code!)
- Directories (recursive) - Directories (recursive)
- File pattern matching - File pattern matching
- Multiple formats (txt, md, markdown, etc.)
Philosophy: The AI witnesses everything and transforms it.
Don't filter what the AI can see - let it decide what's relevant.
""" """
def __init__( def __init__(
@@ -33,10 +36,8 @@ class LocalIngestor:
self.root_path = Path(root_path) if root_path else Path.cwd() self.root_path = Path(root_path) if root_path else Path.cwd()
self.encoding = encoding self.encoding = encoding
# Default file extensions to include # INCLUDE ALL FILES - let the AI witness everything!
self.default_extensions = [".txt", ".md", ".markdown", ".notes", ".draft", ".rst", ".org"] # Only exclude build artifacts and version control
# Files/dirs to exclude
self.exclude_patterns = [ self.exclude_patterns = [
".git", ".git",
".svn", ".svn",
@@ -46,10 +47,20 @@ class LocalIngestor:
"venv", "venv",
".env", ".env",
"*.pyc", "*.pyc",
"*.so",
"*.o",
"*.a",
"*.dylib",
".DS_Store", ".DS_Store",
"*.swp", "*.swp",
"*.tmp", "*.tmp",
".cache", ".cache",
"dist",
"build",
"*.egg-info",
".pytest_cache",
".mypy_cache",
".tox",
] ]
def is_excluded(self, path: Path) -> bool: def is_excluded(self, path: Path) -> bool:
@@ -75,14 +86,18 @@ class LocalIngestor:
extensions: Optional[list[str]] = None, extensions: Optional[list[str]] = None,
recursive: bool = True, recursive: bool = True,
max_files: int = 1000, max_files: int = 1000,
include_all: bool = True,
) -> dict[Path, str]: ) -> dict[Path, str]:
"""Get all text files from a path. """Get all files from a path - INCLUDING SOURCE CODE.
The AI witnesses everything and transforms it into documentation.
Args: Args:
path: File or directory path path: File or directory path
extensions: File extensions to include (default: common text formats) extensions: File extensions to include (None = ALL files!)
recursive: Recursively scan directories recursive: Recursively scan directories
max_files: Maximum number of files to read max_files: Maximum number of files to read
include_all: If True, include ALL files (default True!)
Returns: Returns:
Dict mapping file paths to content Dict mapping file paths to content
@@ -92,14 +107,19 @@ class LocalIngestor:
if not path.exists(): if not path.exists():
raise FileNotFoundError(f"Path does not exist: {path}") raise FileNotFoundError(f"Path does not exist: {path}")
extensions = extensions or self.default_extensions # Default: include ALL files (source code, configs, everything!)
extensions = [ext.lower() for ext in extensions] # The AI will witness and transform everything
if include_all:
extensions = None # Include everything
else:
extensions = extensions or self.default_extensions
extensions = [ext.lower() for ext in extensions]
results = {} results = {}
if path.is_file(): if path.is_file():
# Single file # Single file
if self._has_valid_extension(path, extensions): if include_all or self._has_valid_extension(path, extensions or []):
results[path] = self._read_file(path) results[path] = self._read_file(path)
else: else:
# Directory # Directory
@@ -107,7 +127,8 @@ class LocalIngestor:
path, path,
extensions, extensions,
recursive, recursive,
max_files - len(results) max_files - len(results),
include_all=include_all,
) )
for f in files: for f in files:
@@ -126,9 +147,10 @@ class LocalIngestor:
def _scan_directory( def _scan_directory(
self, self,
directory: Path, directory: Path,
extensions: list[str], extensions: Optional[list[str]],
recursive: bool, recursive: bool,
max_files: int, max_files: int,
include_all: bool = True,
) -> list[Path]: ) -> list[Path]:
"""Scan directory for matching files.""" """Scan directory for matching files."""
files = [] files = []
@@ -145,7 +167,8 @@ class LocalIngestor:
if self.is_excluded(filepath): if self.is_excluded(filepath):
continue continue
if self._has_valid_extension(filepath, extensions): # Include ALL files (including source code!)
if include_all or self._has_valid_extension(filepath, extensions or []):
files.append(filepath) files.append(filepath)
if len(files) >= max_files: if len(files) >= max_files:
@@ -154,7 +177,7 @@ class LocalIngestor:
# Non-recursive # Non-recursive
for item in directory.iterdir(): for item in directory.iterdir():
if item.is_file() and not self.is_excluded(item): if item.is_file() and not self.is_excluded(item):
if self._has_valid_extension(item, extensions): if include_all or self._has_valid_extension(item, extensions or []):
files.append(item) files.append(item)
if len(files) >= max_files: if len(files) >= max_files: