Witness ALL data: Include source code in ingestion
- LocalIngestor: Include ALL files by default (source code, configs, etc.)
- GitHubIngestor: Include ALL files by default
- AI witnesses everything and transforms it into documentation
- Filter only build artifacts (.pyc, .so, dist, build)

Philosophy: Don't filter what the AI can see - let it decide what's relevant. The AI can document code directly!
This commit is contained in:
@@ -74,19 +74,29 @@ class GitHubIngestor:
|
|||||||
repo: str,
|
repo: str,
|
||||||
extensions: Optional[list[str]] = None,
|
extensions: Optional[list[str]] = None,
|
||||||
exclude_dirs: Optional[list[str]] = None,
|
exclude_dirs: Optional[list[str]] = None,
|
||||||
|
include_all: bool = True,
|
||||||
) -> dict[str, str]:
|
) -> dict[str, str]:
|
||||||
"""Get all files from a repository.
|
"""Get all files from a repository - INCLUDING SOURCE CODE.
|
||||||
|
|
||||||
|
The AI witnesses EVERYTHING and transforms it into documentation.
|
||||||
|
Don't filter what the AI can see - let it decide what's relevant.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
repo: "owner/repo" format
|
repo: "owner/repo" format
|
||||||
extensions: File extensions to include (e.g., ['.md', '.txt'])
|
extensions: File extensions to include (None = ALL files!)
|
||||||
exclude_dirs: Directories to exclude
|
exclude_dirs: Directories to exclude (build artifacts, etc.)
|
||||||
|
include_all: If True, include ALL files (default True!)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary mapping file paths to content
|
Dictionary mapping file paths to content
|
||||||
"""
|
"""
|
||||||
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft"]
|
# Default: include ALL files - the AI will witness everything!
|
||||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
if include_all:
|
||||||
|
extensions = None # No extension filter
|
||||||
|
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
|
||||||
|
else:
|
||||||
|
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
|
||||||
|
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
||||||
|
|
||||||
files = {}
|
files = {}
|
||||||
|
|
||||||
@@ -97,7 +107,7 @@ class GitHubIngestor:
|
|||||||
# Single file
|
# Single file
|
||||||
if contents.get("type") == "file":
|
if contents.get("type") == "file":
|
||||||
content_path = contents["path"]
|
content_path = contents["path"]
|
||||||
if self._should_include(content_path, extensions, exclude_dirs):
|
if self._should_include(content_path, extensions, exclude_dirs, include_all):
|
||||||
files[content_path] = self.get_file_content(repo, content_path)
|
files[content_path] = self.get_file_content(repo, content_path)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -110,7 +120,7 @@ class GitHubIngestor:
|
|||||||
if not any(excl in item_path for excl in exclude_dirs):
|
if not any(excl in item_path for excl in exclude_dirs):
|
||||||
walk_directory(item_path)
|
walk_directory(item_path)
|
||||||
elif item_type == "file":
|
elif item_type == "file":
|
||||||
if self._should_include(item_path, extensions, exclude_dirs):
|
if self._should_include(item_path, extensions, exclude_dirs, include_all):
|
||||||
files[item_path] = self.get_file_content(repo, item_path)
|
files[item_path] = self.get_file_content(repo, item_path)
|
||||||
|
|
||||||
walk_directory()
|
walk_directory()
|
||||||
@@ -119,17 +129,32 @@ class GitHubIngestor:
|
|||||||
def _should_include(
|
def _should_include(
|
||||||
self,
|
self,
|
||||||
path: str,
|
path: str,
|
||||||
extensions: list[str],
|
extensions: Optional[list[str]],
|
||||||
exclude_dirs: list[str],
|
exclude_dirs: list[str],
|
||||||
|
include_all: bool = True,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Check if file should be included."""
|
"""Check if file should be included.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: File path to check
|
||||||
|
extensions: List of extensions (None if include_all=True)
|
||||||
|
exclude_dirs: Directories to exclude
|
||||||
|
include_all: Include ALL files (ignore extensions)
|
||||||
|
"""
|
||||||
# Exclude directories
|
# Exclude directories
|
||||||
for excl in exclude_dirs:
|
for excl in exclude_dirs:
|
||||||
if excl in path:
|
if excl in path:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Check extension
|
# If include_all, include everything
|
||||||
return any(path.endswith(ext) for ext in extensions)
|
if include_all:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Otherwise check extensions
|
||||||
|
if extensions:
|
||||||
|
return any(path.endswith(ext) for ext in extensions)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
def extract_text_from_files(self, files: dict[str, str]) -> str:
|
def extract_text_from_files(self, files: dict[str, str]) -> str:
|
||||||
"""Combine all file contents into a single text blob.
|
"""Combine all file contents into a single text blob.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Local file ingestion for Opus Orchestrator.
|
"""Local file ingestion for Opus Orchestrator.
|
||||||
|
|
||||||
Fetches content from local files and directories.
|
Fetches content from local files and directories - INCLUDING SOURCE CODE.
|
||||||
|
The AI witnesses ALL data and transforms it into documentation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -13,10 +14,12 @@ class LocalIngestor:
|
|||||||
"""Fetch and parse content from local files and directories.
|
"""Fetch and parse content from local files and directories.
|
||||||
|
|
||||||
Supports:
|
Supports:
|
||||||
- Individual files
|
- ALL file types (including source code!)
|
||||||
- Directories (recursive)
|
- Directories (recursive)
|
||||||
- File pattern matching
|
- File pattern matching
|
||||||
- Multiple formats (txt, md, markdown, etc.)
|
|
||||||
|
Philosophy: The AI witnesses everything and transforms it.
|
||||||
|
Don't filter what the AI can see - let it decide what's relevant.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -33,10 +36,8 @@ class LocalIngestor:
|
|||||||
self.root_path = Path(root_path) if root_path else Path.cwd()
|
self.root_path = Path(root_path) if root_path else Path.cwd()
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
|
||||||
# Default file extensions to include
|
# INCLUDE ALL FILES - let the AI witness everything!
|
||||||
self.default_extensions = [".txt", ".md", ".markdown", ".notes", ".draft", ".rst", ".org"]
|
# Only exclude build artifacts and version control
|
||||||
|
|
||||||
# Files/dirs to exclude
|
|
||||||
self.exclude_patterns = [
|
self.exclude_patterns = [
|
||||||
".git",
|
".git",
|
||||||
".svn",
|
".svn",
|
||||||
@@ -46,10 +47,20 @@ class LocalIngestor:
|
|||||||
"venv",
|
"venv",
|
||||||
".env",
|
".env",
|
||||||
"*.pyc",
|
"*.pyc",
|
||||||
|
"*.so",
|
||||||
|
"*.o",
|
||||||
|
"*.a",
|
||||||
|
"*.dylib",
|
||||||
".DS_Store",
|
".DS_Store",
|
||||||
"*.swp",
|
"*.swp",
|
||||||
"*.tmp",
|
"*.tmp",
|
||||||
".cache",
|
".cache",
|
||||||
|
"dist",
|
||||||
|
"build",
|
||||||
|
"*.egg-info",
|
||||||
|
".pytest_cache",
|
||||||
|
".mypy_cache",
|
||||||
|
".tox",
|
||||||
]
|
]
|
||||||
|
|
||||||
def is_excluded(self, path: Path) -> bool:
|
def is_excluded(self, path: Path) -> bool:
|
||||||
@@ -75,14 +86,18 @@ class LocalIngestor:
|
|||||||
extensions: Optional[list[str]] = None,
|
extensions: Optional[list[str]] = None,
|
||||||
recursive: bool = True,
|
recursive: bool = True,
|
||||||
max_files: int = 1000,
|
max_files: int = 1000,
|
||||||
|
include_all: bool = True,
|
||||||
) -> dict[Path, str]:
|
) -> dict[Path, str]:
|
||||||
"""Get all text files from a path.
|
"""Get all files from a path - INCLUDING SOURCE CODE.
|
||||||
|
|
||||||
|
The AI witnesses everything and transforms it into documentation.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: File or directory path
|
path: File or directory path
|
||||||
extensions: File extensions to include (default: common text formats)
|
extensions: File extensions to include (None = ALL files!)
|
||||||
recursive: Recursively scan directories
|
recursive: Recursively scan directories
|
||||||
max_files: Maximum number of files to read
|
max_files: Maximum number of files to read
|
||||||
|
include_all: If True, include ALL files (default True!)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict mapping file paths to content
|
Dict mapping file paths to content
|
||||||
@@ -92,14 +107,19 @@ class LocalIngestor:
|
|||||||
if not path.exists():
|
if not path.exists():
|
||||||
raise FileNotFoundError(f"Path does not exist: {path}")
|
raise FileNotFoundError(f"Path does not exist: {path}")
|
||||||
|
|
||||||
extensions = extensions or self.default_extensions
|
# Default: include ALL files (source code, configs, everything!)
|
||||||
extensions = [ext.lower() for ext in extensions]
|
# The AI will witness and transform everything
|
||||||
|
if include_all:
|
||||||
|
extensions = None # Include everything
|
||||||
|
else:
|
||||||
|
extensions = extensions or self.default_extensions
|
||||||
|
extensions = [ext.lower() for ext in extensions]
|
||||||
|
|
||||||
results = {}
|
results = {}
|
||||||
|
|
||||||
if path.is_file():
|
if path.is_file():
|
||||||
# Single file
|
# Single file
|
||||||
if self._has_valid_extension(path, extensions):
|
if include_all or self._has_valid_extension(path, extensions or []):
|
||||||
results[path] = self._read_file(path)
|
results[path] = self._read_file(path)
|
||||||
else:
|
else:
|
||||||
# Directory
|
# Directory
|
||||||
@@ -107,7 +127,8 @@ class LocalIngestor:
|
|||||||
path,
|
path,
|
||||||
extensions,
|
extensions,
|
||||||
recursive,
|
recursive,
|
||||||
max_files - len(results)
|
max_files - len(results),
|
||||||
|
include_all=include_all,
|
||||||
)
|
)
|
||||||
|
|
||||||
for f in files:
|
for f in files:
|
||||||
@@ -126,9 +147,10 @@ class LocalIngestor:
|
|||||||
def _scan_directory(
|
def _scan_directory(
|
||||||
self,
|
self,
|
||||||
directory: Path,
|
directory: Path,
|
||||||
extensions: list[str],
|
extensions: Optional[list[str]],
|
||||||
recursive: bool,
|
recursive: bool,
|
||||||
max_files: int,
|
max_files: int,
|
||||||
|
include_all: bool = True,
|
||||||
) -> list[Path]:
|
) -> list[Path]:
|
||||||
"""Scan directory for matching files."""
|
"""Scan directory for matching files."""
|
||||||
files = []
|
files = []
|
||||||
@@ -145,7 +167,8 @@ class LocalIngestor:
|
|||||||
if self.is_excluded(filepath):
|
if self.is_excluded(filepath):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self._has_valid_extension(filepath, extensions):
|
# Include ALL files (including source code!)
|
||||||
|
if include_all or self._has_valid_extension(filepath, extensions or []):
|
||||||
files.append(filepath)
|
files.append(filepath)
|
||||||
|
|
||||||
if len(files) >= max_files:
|
if len(files) >= max_files:
|
||||||
@@ -154,7 +177,7 @@ class LocalIngestor:
|
|||||||
# Non-recursive
|
# Non-recursive
|
||||||
for item in directory.iterdir():
|
for item in directory.iterdir():
|
||||||
if item.is_file() and not self.is_excluded(item):
|
if item.is_file() and not self.is_excluded(item):
|
||||||
if self._has_valid_extension(item, extensions):
|
if include_all or self._has_valid_extension(item, extensions or []):
|
||||||
files.append(item)
|
files.append(item)
|
||||||
|
|
||||||
if len(files) >= max_files:
|
if len(files) >= max_files:
|
||||||
|
|||||||
Reference in New Issue
Block a user