Witness ALL data: Include source code in ingestion
- LocalIngestor: Include ALL files by default (source code, configs, etc.)
- GitHubIngestor: Include ALL files by default
- AI witnesses everything and transforms it into documentation
- Filter only build artifacts (.pyc, .so, dist, build)

Philosophy: Don't filter what the AI can see - let it decide what's relevant. The AI can document code directly!
This commit is contained in:
@@ -74,19 +74,29 @@ class GitHubIngestor:
|
||||
repo: str,
|
||||
extensions: Optional[list[str]] = None,
|
||||
exclude_dirs: Optional[list[str]] = None,
|
||||
include_all: bool = True,
|
||||
) -> dict[str, str]:
|
||||
"""Get all files from a repository.
|
||||
"""Get all files from a repository - INCLUDING SOURCE CODE.
|
||||
|
||||
The AI witnesses EVERYTHING and transforms it into documentation.
|
||||
Don't filter what the AI can see - let it decide what's relevant.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
extensions: File extensions to include (e.g., ['.md', '.txt'])
|
||||
exclude_dirs: Directories to exclude
|
||||
extensions: File extensions to include (None = ALL files!)
|
||||
exclude_dirs: Directories to exclude (build artifacts, etc.)
|
||||
include_all: If True, include ALL files (default True!)
|
||||
|
||||
Returns:
|
||||
Dictionary mapping file paths to content
|
||||
"""
|
||||
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
||||
# Default: include ALL files - the AI will witness everything!
|
||||
if include_all:
|
||||
extensions = None # No extension filter
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
|
||||
else:
|
||||
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
||||
|
||||
files = {}
|
||||
|
||||
@@ -97,7 +107,7 @@ class GitHubIngestor:
|
||||
# Single file
|
||||
if contents.get("type") == "file":
|
||||
content_path = contents["path"]
|
||||
if self._should_include(content_path, extensions, exclude_dirs):
|
||||
if self._should_include(content_path, extensions, exclude_dirs, include_all):
|
||||
files[content_path] = self.get_file_content(repo, content_path)
|
||||
return
|
||||
|
||||
@@ -110,7 +120,7 @@ class GitHubIngestor:
|
||||
if not any(excl in item_path for excl in exclude_dirs):
|
||||
walk_directory(item_path)
|
||||
elif item_type == "file":
|
||||
if self._should_include(item_path, extensions, exclude_dirs):
|
||||
if self._should_include(item_path, extensions, exclude_dirs, include_all):
|
||||
files[item_path] = self.get_file_content(repo, item_path)
|
||||
|
||||
walk_directory()
|
||||
@@ -119,17 +129,32 @@ class GitHubIngestor:
|
||||
def _should_include(
|
||||
self,
|
||||
path: str,
|
||||
extensions: list[str],
|
||||
extensions: Optional[list[str]],
|
||||
exclude_dirs: list[str],
|
||||
include_all: bool = True,
|
||||
) -> bool:
|
||||
"""Check if file should be included."""
|
||||
"""Check if file should be included.
|
||||
|
||||
Args:
|
||||
path: File path to check
|
||||
extensions: List of extensions (None if include_all=True)
|
||||
exclude_dirs: Directories to exclude
|
||||
include_all: Include ALL files (ignore extensions)
|
||||
"""
|
||||
# Exclude directories
|
||||
for excl in exclude_dirs:
|
||||
if excl in path:
|
||||
return False
|
||||
|
||||
# Check extension
|
||||
return any(path.endswith(ext) for ext in extensions)
|
||||
# If include_all, include everything
|
||||
if include_all:
|
||||
return True
|
||||
|
||||
# Otherwise check extensions
|
||||
if extensions:
|
||||
return any(path.endswith(ext) for ext in extensions)
|
||||
|
||||
return True
|
||||
|
||||
def extract_text_from_files(self, files: dict[str, str]) -> str:
|
||||
"""Combine all file contents into a single text blob.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Local file ingestion for Opus Orchestrator.
|
||||
|
||||
Fetches content from local files and directories.
|
||||
Fetches content from local files and directories - INCLUDING SOURCE CODE.
|
||||
The AI witnesses ALL data and transforms it into documentation.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -13,10 +14,12 @@ class LocalIngestor:
|
||||
"""Fetch and parse content from local files and directories.
|
||||
|
||||
Supports:
|
||||
- Individual files
|
||||
- ALL file types (including source code!)
|
||||
- Directories (recursive)
|
||||
- File pattern matching
|
||||
- Multiple formats (txt, md, markdown, etc.)
|
||||
|
||||
Philosophy: The AI witnesses everything and transforms it.
|
||||
Don't filter what the AI can see - let it decide what's relevant.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -33,10 +36,8 @@ class LocalIngestor:
|
||||
self.root_path = Path(root_path) if root_path else Path.cwd()
|
||||
self.encoding = encoding
|
||||
|
||||
# Default file extensions to include
|
||||
self.default_extensions = [".txt", ".md", ".markdown", ".notes", ".draft", ".rst", ".org"]
|
||||
|
||||
# Files/dirs to exclude
|
||||
# INCLUDE ALL FILES - let the AI witness everything!
|
||||
# Only exclude build artifacts and version control
|
||||
self.exclude_patterns = [
|
||||
".git",
|
||||
".svn",
|
||||
@@ -46,10 +47,20 @@ class LocalIngestor:
|
||||
"venv",
|
||||
".env",
|
||||
"*.pyc",
|
||||
"*.so",
|
||||
"*.o",
|
||||
"*.a",
|
||||
"*.dylib",
|
||||
".DS_Store",
|
||||
"*.swp",
|
||||
"*.tmp",
|
||||
".cache",
|
||||
"dist",
|
||||
"build",
|
||||
"*.egg-info",
|
||||
".pytest_cache",
|
||||
".mypy_cache",
|
||||
".tox",
|
||||
]
|
||||
|
||||
def is_excluded(self, path: Path) -> bool:
|
||||
@@ -75,14 +86,18 @@ class LocalIngestor:
|
||||
extensions: Optional[list[str]] = None,
|
||||
recursive: bool = True,
|
||||
max_files: int = 1000,
|
||||
include_all: bool = True,
|
||||
) -> dict[Path, str]:
|
||||
"""Get all text files from a path.
|
||||
"""Get all files from a path - INCLUDING SOURCE CODE.
|
||||
|
||||
The AI witnesses everything and transforms it into documentation.
|
||||
|
||||
Args:
|
||||
path: File or directory path
|
||||
extensions: File extensions to include (default: common text formats)
|
||||
extensions: File extensions to include (None = ALL files!)
|
||||
recursive: Recursively scan directories
|
||||
max_files: Maximum number of files to read
|
||||
include_all: If True, include ALL files (default True!)
|
||||
|
||||
Returns:
|
||||
Dict mapping file paths to content
|
||||
@@ -92,14 +107,19 @@ class LocalIngestor:
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"Path does not exist: {path}")
|
||||
|
||||
extensions = extensions or self.default_extensions
|
||||
extensions = [ext.lower() for ext in extensions]
|
||||
# Default: include ALL files (source code, configs, everything!)
|
||||
# The AI will witness and transform everything
|
||||
if include_all:
|
||||
extensions = None # Include everything
|
||||
else:
|
||||
extensions = extensions or self.default_extensions
|
||||
extensions = [ext.lower() for ext in extensions]
|
||||
|
||||
results = {}
|
||||
|
||||
if path.is_file():
|
||||
# Single file
|
||||
if self._has_valid_extension(path, extensions):
|
||||
if include_all or self._has_valid_extension(path, extensions or []):
|
||||
results[path] = self._read_file(path)
|
||||
else:
|
||||
# Directory
|
||||
@@ -107,7 +127,8 @@ class LocalIngestor:
|
||||
path,
|
||||
extensions,
|
||||
recursive,
|
||||
max_files - len(results)
|
||||
max_files - len(results),
|
||||
include_all=include_all,
|
||||
)
|
||||
|
||||
for f in files:
|
||||
@@ -126,9 +147,10 @@ class LocalIngestor:
|
||||
def _scan_directory(
|
||||
self,
|
||||
directory: Path,
|
||||
extensions: list[str],
|
||||
extensions: Optional[list[str]],
|
||||
recursive: bool,
|
||||
max_files: int,
|
||||
include_all: bool = True,
|
||||
) -> list[Path]:
|
||||
"""Scan directory for matching files."""
|
||||
files = []
|
||||
@@ -145,7 +167,8 @@ class LocalIngestor:
|
||||
if self.is_excluded(filepath):
|
||||
continue
|
||||
|
||||
if self._has_valid_extension(filepath, extensions):
|
||||
# Include ALL files (including source code!)
|
||||
if include_all or self._has_valid_extension(filepath, extensions or []):
|
||||
files.append(filepath)
|
||||
|
||||
if len(files) >= max_files:
|
||||
@@ -154,7 +177,7 @@ class LocalIngestor:
|
||||
# Non-recursive
|
||||
for item in directory.iterdir():
|
||||
if item.is_file() and not self.is_excluded(item):
|
||||
if self._has_valid_extension(item, extensions):
|
||||
if include_all or self._has_valid_extension(item, extensions or []):
|
||||
files.append(item)
|
||||
|
||||
if len(files) >= max_files:
|
||||
|
||||
Reference in New Issue
Block a user