feat: Full code review, bug fixes, and philosophy book generation
This commit includes:
- A full code review and bug fixes for language drift, package loading, and CLI crashes.
- The generated 15,000-word philosophy manuscript.
- CODE_REVIEW.md and CHANGELOG.md documenting the process.
This commit is contained in:
@@ -113,7 +113,32 @@ def __getattr__(name: str):
|
||||
if name == "ExportOptions":
|
||||
from opus_orchestrator.scrivener_export import ExportOptions
|
||||
return ExportOptions
|
||||
|
||||
|
||||
# LaTeX Export
|
||||
if name == "LaTeXExporter":
|
||||
from opus_orchestrator.latex_compile import LaTeXExporter
|
||||
return LaTeXExporter
|
||||
if name == "CompileOptions":
|
||||
from opus_orchestrator.latex_compile import CompileOptions
|
||||
return CompileOptions
|
||||
if name == "export_to_latex":
|
||||
from opus_orchestrator.latex_compile import export_to_latex
|
||||
return export_to_latex
|
||||
if name == "compile_pdf":
|
||||
from opus_orchestrator.latex_compile import compile_pdf
|
||||
return compile_pdf
|
||||
|
||||
# HTML Export
|
||||
if name == "export_to_html":
|
||||
from opus_orchestrator.html_export import export_to_html
|
||||
return export_to_html
|
||||
if name == "export_to_pdf":
|
||||
from opus_orchestrator.html_export import export_to_pdf
|
||||
return export_to_pdf
|
||||
if name == "HTMLExporter":
|
||||
from opus_orchestrator.html_export import HTMLExporter
|
||||
return HTMLExporter
|
||||
|
||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
|
||||
|
||||
@@ -157,33 +182,13 @@ __all__ = [
|
||||
"ScrivenerExporter",
|
||||
"export_to_scrivener",
|
||||
"ExportOptions",
|
||||
"ExportOptions",
|
||||
# LaTeX Export
|
||||
"LaTeXExporter",
|
||||
"CompileOptions",
|
||||
"export_to_latex",
|
||||
"compile_pdf",
|
||||
# HTML Export
|
||||
"export_to_html",
|
||||
"export_to_pdf",
|
||||
"HTMLExporter",
|
||||
]
|
||||
|
||||
def __getattr__(name):
|
||||
if name == "LaTeXExporter":
|
||||
from opus_orchestrator.latex_compile import LaTeXExporter
|
||||
return LaTeXExporter
|
||||
if name == "CompileOptions":
|
||||
from opus_orchestrator.latex_compile import CompileOptions
|
||||
return CompileOptions
|
||||
if name == "export_to_latex":
|
||||
from opus_orchestrator.latex_compile import export_to_latex
|
||||
return export_to_latex
|
||||
if name == "compile_pdf":
|
||||
from opus_orchestrator.latex_compile import compile_pdf
|
||||
return compile_pdf
|
||||
raise AttributeError(f"module has no attribute {name!r}")
|
||||
|
||||
# HTML Export
|
||||
def __getattr__(name):
|
||||
if name == "export_to_html":
|
||||
from opus_orchestrator.html_export import export_to_html
|
||||
return export_to_html
|
||||
if name == "export_to_pdf":
|
||||
from opus_orchestrator.html_export import export_to_pdf
|
||||
return export_to_pdf
|
||||
if name == "HTMLExporter":
|
||||
from opus_orchestrator.html_export import HTMLExporter
|
||||
return HTMLExporter
|
||||
raise AttributeError(f"module has no attribute {name!r}")
|
||||
|
||||
@@ -148,6 +148,9 @@ class BaseAgent(ABC, Generic[T]):
|
||||
Complete system prompt
|
||||
"""
|
||||
base = self.system_prompt
|
||||
|
||||
# Add universal language constraint
|
||||
base += "\n\nIMPORTANT: You must respond ONLY in English. Do not use any other language."
|
||||
|
||||
if context:
|
||||
context_str = "\n\n## Context\n"
|
||||
|
||||
@@ -105,11 +105,12 @@ Return your critique as a JSON with: {"score": 0.0-1.0, "strengths": [], "weakne
|
||||
system_message="""You are a Professional Writer.
|
||||
|
||||
After receiving critique from the Literary Critic, Genre Expert, and Story Editor:
|
||||
1. Consider each feedback point
|
||||
2. Identify what to revise
|
||||
3. Output your revision plan
|
||||
1. Consider each feedback point.
|
||||
2. Rewrite the chapter to incorporate the suggestions while maintaining the original strengths.
|
||||
3. Ensure the prose is high-quality, engaging, and follows the story context.
|
||||
4. IMPORTANT: You must respond ONLY in English. Do not use Chinese characters.
|
||||
|
||||
You do NOT rewrite - you plan revisions. Return: {"revision_plan": [], "priorities": []}""",
|
||||
Output the complete revised chapter text.""",
|
||||
llm_config={
|
||||
"model": self.model,
|
||||
"api_key": self.api_key,
|
||||
@@ -264,23 +265,18 @@ End with a final verdict: APPROVED, MINOR_REVISIONS, or MAJOR_REVISIONS.
|
||||
## Your Task:
|
||||
Revise the chapter to address the weaknesses identified in the critique.
|
||||
Preserve the strengths. Improve the story, pacing, and prose.
|
||||
Output ONLY the full, revised chapter text.
|
||||
"""
|
||||
# Use the writer agent to revise
|
||||
revision_result = self.agents["writer"].initiate_chat(
|
||||
self.manager,
|
||||
message=revision_request,
|
||||
summary_method="reflection_with_llm",
|
||||
# Use the writer agent to generate the revision
|
||||
revised = self.agents["writer"].generate_reply(
|
||||
messages=[{"role": "user", "content": revision_request}],
|
||||
)
|
||||
|
||||
# Extract revised content from the chat
|
||||
if hasattr(revision_result, 'chat_history'):
|
||||
# Get the last response as revised content
|
||||
revised = revision_result.chat_history[-1].get('content', '') if revision_result.chat_history else current_content
|
||||
if revised and len(revised) > 100:
|
||||
current_content = revised
|
||||
print(f" ✏️ Revision applied, new length: {len(current_content)} chars")
|
||||
else:
|
||||
print(f" ⚠️ No valid revision received, keeping current content")
|
||||
if isinstance(revised, str) and len(revised) > 100:
|
||||
current_content = revised
|
||||
print(f" ✏️ Revision applied, new length: {len(current_content)} chars")
|
||||
else:
|
||||
print(f" ⚠️ No valid revision received, keeping current content. Response: {revised}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Revision failed: {e}, continuing with current content")
|
||||
|
||||
+77
-35
@@ -551,16 +551,19 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
from opus_orchestrator import run_opus, OpusOrchestrator
|
||||
from opus_orchestrator.crews import create_fiction_crew, create_nonfiction_crew
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"
|
||||
{'='*60}")
|
||||
print("📚 OPUS ORCHESTRATOR AI")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'='*60}
|
||||
")
|
||||
|
||||
# Check for API client mode
|
||||
if args.api_url:
|
||||
client = OpusAPIClient(args.api_url)
|
||||
|
||||
print(f"🌐 API Client Mode")
|
||||
print(f" Server: {args.api_url}\n")
|
||||
print(f" Server: {args.api_url}
|
||||
")
|
||||
|
||||
# Call API
|
||||
try:
|
||||
@@ -582,7 +585,8 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
print(f"✅ Generation complete!")
|
||||
print(f" Words: {result.get('word_count', 'N/A'):,}")
|
||||
print(f" Chapters: {result.get('chapters', 'N/A')}")
|
||||
print(f" Framework: {result.get('framework', 'N/A')}\n")
|
||||
print(f" Framework: {result.get('framework', 'N/A')}
|
||||
")
|
||||
|
||||
manuscript = result.get("manuscript", "")
|
||||
|
||||
@@ -627,7 +631,8 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
# Use full content as seed
|
||||
full_text = content.text
|
||||
print(f" ✅ Loaded {len(full_text):,} characters from {content.metadata['file_count']} files")
|
||||
print(f" 📄 Files: {', '.join(content.metadata['files'])}\n")
|
||||
print(f" 📄 Files: {', '.join(content.metadata['files'])}
|
||||
")
|
||||
|
||||
seed_concept = full_text
|
||||
|
||||
@@ -660,7 +665,8 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
from opus_orchestrator.nonfiction_generator import NonfictionGenerator
|
||||
from opus_orchestrator.nonfiction_frameworks import NonfictionFramework
|
||||
|
||||
print("📚 Using Nonfiction Framework...\n")
|
||||
print("📚 Using Nonfiction Framework...
|
||||
")
|
||||
|
||||
# Map framework string to enum
|
||||
framework_map = {
|
||||
@@ -687,7 +693,8 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
|
||||
elif args.use_crewai:
|
||||
# Use CrewAI crews
|
||||
print("🛠️ Using CrewAI crews...\n")
|
||||
print("🛠️ Using CrewAI crews...
|
||||
")
|
||||
|
||||
if args.book_type == "fiction":
|
||||
crew = create_fiction_crew(
|
||||
@@ -703,7 +710,11 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
num_chapters=args.chapters,
|
||||
)
|
||||
|
||||
manuscript = "\n\n---\n\n".join(story)
|
||||
manuscript = "
|
||||
|
||||
---
|
||||
|
||||
".join(story)
|
||||
else:
|
||||
crew = create_nonfiction_crew(
|
||||
topic=args.genre,
|
||||
@@ -723,11 +734,12 @@ async def run_generate(args: argparse.Namespace) -> int:
|
||||
|
||||
print(f"🧵 Thread ID: {thread_id}")
|
||||
if args.resume:
|
||||
print(f" ↪️ Resuming from checkpoint\n")
|
||||
print(f" ↪️ Resuming from checkpoint
|
||||
")
|
||||
else:
|
||||
print()
|
||||
|
||||
result = await run_opus(
|
||||
result = run_opus(
|
||||
seed_concept=seed_concept,
|
||||
framework=args.framework,
|
||||
genre=args.genre,
|
||||
@@ -847,22 +859,26 @@ Target Words: {args.words:,}
|
||||
else:
|
||||
print(f" ⚠️ GitHub save failed: {resp.status_code} - {resp.text}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"
|
||||
{'='*60}")
|
||||
print(f"✅ COMPLETE!")
|
||||
print(f" Words: {word_count:,}")
|
||||
if not args.output and not args.save_s3 and not args.save_repo:
|
||||
print(f" Output: {output_path}")
|
||||
print(f"{'='*60}\n")
|
||||
print(f"{'='*60}
|
||||
")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
async def run_serve(args: argparse.Namespace) -> int:
|
||||
"""Start the OpenAPI server."""
|
||||
print(f"\n🚀 Starting Opus API Server...")
|
||||
print(f"
|
||||
🚀 Starting Opus API Server...")
|
||||
print(f" Host: {args.host}")
|
||||
print(f" Port: {args.port}")
|
||||
print(f" Docs: http://{args.host}:{args.port}/docs\n")
|
||||
print(f" Docs: http://{args.host}:{args.port}/docs
|
||||
")
|
||||
|
||||
try:
|
||||
from opus_orchestrator.server import run_server
|
||||
@@ -876,10 +892,12 @@ async def run_serve(args: argparse.Namespace) -> int:
|
||||
|
||||
async def run_ui(args: argparse.Namespace) -> int:
|
||||
"""Start the web UI only."""
|
||||
print(f"\n🎨 Starting Opus Web UI...")
|
||||
print(f"
|
||||
🎨 Starting Opus Web UI...")
|
||||
print(f" Host: {args.host}")
|
||||
print(f" Port: {args.port}")
|
||||
print(f" UI: http://{args.host}:{args.port}/\n")
|
||||
print(f" UI: http://{args.host}:{args.port}/
|
||||
")
|
||||
|
||||
try:
|
||||
from opus_orchestrator.server import create_app
|
||||
@@ -901,12 +919,15 @@ def run_ingest(args: argparse.Namespace) -> int:
|
||||
"""Ingest content from GitHub."""
|
||||
from opus_orchestrator import OpusOrchestrator
|
||||
|
||||
print(f"\n📥 Ingesting from GitHub: {args.repo}\n")
|
||||
print(f"
|
||||
📥 Ingesting from GitHub: {args.repo}
|
||||
")
|
||||
|
||||
# Check for API client mode
|
||||
if args.api_url:
|
||||
client = OpusAPIClient(args.api_url)
|
||||
print(f"🌐 API Client Mode: {args.api_url}\n")
|
||||
print(f"🌐 API Client Mode: {args.api_url}
|
||||
")
|
||||
|
||||
try:
|
||||
result = client.ingest(args.repo, include_readme=args.include_readme)
|
||||
@@ -926,7 +947,8 @@ def run_ingest(args: argparse.Namespace) -> int:
|
||||
|
||||
print(f"✅ Loaded {len(content_text):,} characters")
|
||||
print(f" Files: {file_count}")
|
||||
print(f" File list: {', '.join(files)}\n")
|
||||
print(f" File list: {', '.join(files)}
|
||||
")
|
||||
|
||||
if args.preview:
|
||||
print("📄 PREVIEW (first 2000 chars):")
|
||||
@@ -937,7 +959,8 @@ def run_ingest(args: argparse.Namespace) -> int:
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(content_text)
|
||||
print(f"\n💾 Saved to: {args.output}")
|
||||
print(f"
|
||||
💾 Saved to: {args.output}")
|
||||
|
||||
return 0
|
||||
|
||||
@@ -946,7 +969,9 @@ def run_s3_ingest(args: argparse.Namespace) -> int:
|
||||
"""Ingest content from S3/MinIO."""
|
||||
from opus_orchestrator import S3Ingestor
|
||||
|
||||
print(f"\n🪣 Ingesting from S3: {args.bucket}/{args.prefix}\n")
|
||||
print(f"
|
||||
🪣 Ingesting from S3: {args.bucket}/{args.prefix}
|
||||
")
|
||||
|
||||
if args.endpoint:
|
||||
print(f" Endpoint: {args.endpoint}")
|
||||
@@ -974,7 +999,8 @@ def run_s3_ingest(args: argparse.Namespace) -> int:
|
||||
|
||||
print(f"✅ Loaded {result['total_chars']:,} characters")
|
||||
print(f" Files: {result['file_count']}")
|
||||
print(f" File list: {', '.join(result['files'].keys())}\n")
|
||||
print(f" File list: {', '.join(result['files'].keys())}
|
||||
")
|
||||
|
||||
if args.preview:
|
||||
print("📄 PREVIEW (first 2000 chars):")
|
||||
@@ -985,7 +1011,8 @@ def run_s3_ingest(args: argparse.Namespace) -> int:
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(result["combined_text"])
|
||||
print(f"\n💾 Saved to: {args.output}")
|
||||
print(f"
|
||||
💾 Saved to: {args.output}")
|
||||
|
||||
return 0
|
||||
|
||||
@@ -994,7 +1021,9 @@ def run_local_ingest(args: argparse.Namespace) -> int:
|
||||
"""Ingest content from local files/directories."""
|
||||
from opus_orchestrator import LocalIngestor
|
||||
|
||||
print(f"\n📂 Ingesting from local: {args.path}\n")
|
||||
print(f"
|
||||
📂 Ingesting from local: {args.path}
|
||||
")
|
||||
|
||||
# Parse extensions
|
||||
extensions = None
|
||||
@@ -1039,7 +1068,8 @@ def run_local_ingest(args: argparse.Namespace) -> int:
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(content)
|
||||
print(f"\n💾 Saved to: {args.output}")
|
||||
print(f"
|
||||
💾 Saved to: {args.output}")
|
||||
|
||||
return 0
|
||||
|
||||
@@ -1048,7 +1078,9 @@ def run_frameworks(args: argparse.Namespace) -> int:
|
||||
"""List available frameworks."""
|
||||
from opus_orchestrator.frameworks import FRAMEWORKS
|
||||
|
||||
print("\n📚 AVAILABLE STORY FRAMEWORKS\n")
|
||||
print("
|
||||
📚 AVAILABLE STORY FRAMEWORKS
|
||||
")
|
||||
print("=" * 50)
|
||||
|
||||
for framework, info in FRAMEWORKS.items():
|
||||
@@ -1057,7 +1089,8 @@ def run_frameworks(args: argparse.Namespace) -> int:
|
||||
stages = info.get("stages", [])
|
||||
beats = info.get("beats", [])
|
||||
|
||||
print(f"\n{name}")
|
||||
print(f"
|
||||
{name}")
|
||||
print(f" {desc}")
|
||||
|
||||
if stages:
|
||||
@@ -1074,7 +1107,8 @@ def run_frameworks(args: argparse.Namespace) -> int:
|
||||
if len(beats) > 3:
|
||||
print(f" ... and {len(beats) - 3} more")
|
||||
|
||||
print("\n" + "=" * 50)
|
||||
print("
|
||||
" + "=" * 50)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -1084,37 +1118,45 @@ def run_config(args: argparse.Namespace) -> int:
|
||||
|
||||
config = get_config()
|
||||
|
||||
print("\n⚙️ OPUS CONFIGURATION\n")
|
||||
print("
|
||||
⚙️ OPUS CONFIGURATION
|
||||
")
|
||||
print("=" * 40)
|
||||
|
||||
print(f"\n🔹 Agent")
|
||||
print(f"
|
||||
🔹 Agent")
|
||||
print(f" Provider: {config.agent.provider}")
|
||||
print(f" Model: {config.agent.model}")
|
||||
print(f" Temperature: {config.agent.temperature}")
|
||||
print(f" Max Tokens: {config.agent.max_tokens or 'None'}")
|
||||
|
||||
print(f"\n🔹 Iteration")
|
||||
print(f"
|
||||
🔹 Iteration")
|
||||
print(f" Min Critic Rounds: {config.iteration.min_critic_rounds}")
|
||||
print(f" Max Critic Rounds: {config.iteration.max_critic_rounds}")
|
||||
print(f" Approval Threshold: {config.iteration.approval_threshold}")
|
||||
|
||||
print(f"\n🔹 Output")
|
||||
print(f"
|
||||
🔹 Output")
|
||||
print(f" Format: {config.output.format}")
|
||||
print(f" Include TOC: {config.output.include_toc}")
|
||||
print(f" Output Dir: {config.output.output_dir}")
|
||||
|
||||
print(f"\n🔹 Integrations")
|
||||
print(f"
|
||||
🔹 Integrations")
|
||||
print(f" GitHub Token: {'✓ Set' if config.github_token else '✗ Not Set'}")
|
||||
print(f" API Key: {'✓ Set' if config.agent.api_key else '✗ Not Set'}")
|
||||
|
||||
if args.show_keys:
|
||||
print(f"\n🔹 API Keys (unmasked)")
|
||||
print(f"
|
||||
🔹 API Keys (unmasked)")
|
||||
print(f" OPENAI_API_KEY: {os.environ.get('OPENAI_API_KEY', 'Not Set')[:20]}...")
|
||||
print(f" MINIMAX_API_KEY: {os.environ.get('MINIMAX_API_KEY', 'Not Set')[:20]}...")
|
||||
print(f" GITHUB_TOKEN: {os.environ.get('GITHUB_TOKEN', 'Not Set')[:20]}...")
|
||||
|
||||
if args.env:
|
||||
print(f"\n📋 ENVIRONMENT VARIABLES NEEDED:")
|
||||
print(f"
|
||||
📋 ENVIRONMENT VARIABLES NEEDED:")
|
||||
print("-" * 40)
|
||||
print("OPENAI_API_KEY=sk-... # Required for LLM")
|
||||
print("GITHUB_TOKEN=ghp_... # For private repos")
|
||||
|
||||
@@ -321,12 +321,12 @@ Generate a detailed outline with:
|
||||
content: Optional[RawContent] = None,
|
||||
sources: Optional[list[dict]] = None,
|
||||
) -> OpusState:
|
||||
"""Ingest raw content from multiple sources.
|
||||
|
||||
Args:
|
||||
content: Pre-loaded raw content
|
||||
sources: List of source configurations (github, local, s3)
|
||||
"""
|
||||
"""Ingest raw content from multiple sources."""
|
||||
# Skip if we already have content and weren't given specific new sources/content
|
||||
if self.state and self.state.raw_content and not content and not sources:
|
||||
print("ℹ️ Using existing raw content.")
|
||||
return self.state
|
||||
|
||||
if sources:
|
||||
from opus_orchestrator.utils.multi_source_ingest import ingest_multiple
|
||||
|
||||
@@ -335,7 +335,6 @@ Generate a detailed outline with:
|
||||
result = await ingest_multiple(
|
||||
sources=sources,
|
||||
github_token=self.config.github_token,
|
||||
# AWS keys would come from environment
|
||||
)
|
||||
|
||||
content = RawContent(
|
||||
@@ -399,29 +398,26 @@ Generate a detailed outline with:
|
||||
# =========================================================================
|
||||
|
||||
async def snowflake_stage_1(self) -> str:
|
||||
"""Stage 1: One sentence summary.
|
||||
|
||||
Take your one-paragraph story summary and cut it down to one sentence.
|
||||
"""
|
||||
"""Stage 1: One sentence summary."""
|
||||
print("❄️ SNOWFLAKE STAGE 1: One sentence summary...")
|
||||
|
||||
raw_content = self.state.raw_content.text if self.state.raw_content else ""
|
||||
|
||||
user_prompt = f"""Create a ONE SENTENCE summary of this story concept.
|
||||
user_prompt = f"""You are analyzing a collection of source materials to synthesize a new story.
|
||||
|
||||
## SOURCE CONTENT:
|
||||
{raw_content}
|
||||
|
||||
The sentence should contain:
|
||||
- Protagonist's name (or descriptor)
|
||||
## TASK:
|
||||
Synthesize the core narrative conflict and outcome from the source content into ONE compelling sentence.
|
||||
|
||||
The sentence must contain:
|
||||
- Protagonist's name or descriptor
|
||||
- Their goal
|
||||
- The conflict/obstacle
|
||||
- The central conflict/obstacle
|
||||
- The stakes
|
||||
|
||||
Example: "In a world where magic is forbidden, a young mage must master forbidden arts to save her dying brother, even if it means sparking a war with the ruling theocracy."
|
||||
|
||||
## Your seed content:
|
||||
{raw_content}
|
||||
|
||||
## Task:
|
||||
Write ONE compelling sentence that captures the entire story.
|
||||
"""
|
||||
response = await self.agents["architect"].call_llm(
|
||||
system_prompt="You are an expert story architect. Create concise, compelling summaries.",
|
||||
|
||||
@@ -67,7 +67,8 @@ class OpusPydanticAgent:
|
||||
# Build system prompt
|
||||
system_prompt = self.system_prompt or """You are an expert writer and editor for Opus Orchestrator.
|
||||
You produce high-quality, structured output that conforms to the given schema.
|
||||
Always follow best practices for the content type you're creating."""
|
||||
Always follow best practices for the content type you're creating.
|
||||
IMPORTANT: You must respond ONLY in English. Do not use any other language."""
|
||||
|
||||
if self.result_type:
|
||||
self._agent = Agent(
|
||||
|
||||
@@ -19,14 +19,12 @@ class GitHubIngestor:
|
||||
self.token = token or os.environ.get("GITHUB_TOKEN")
|
||||
|
||||
# Token is optional - only required for private repos
|
||||
# Public repos can be accessed without authentication
|
||||
if self.token:
|
||||
self.headers = {
|
||||
"Authorization": f"token {self.token}",
|
||||
"Accept": "application/vnd.github.v3+json",
|
||||
}
|
||||
else:
|
||||
# No token - use unauthenticated requests (rate limited)
|
||||
self.headers = {
|
||||
"Accept": "application/vnd.github.v3+json",
|
||||
}
|
||||
@@ -34,34 +32,22 @@ class GitHubIngestor:
|
||||
|
||||
self.base_url = "https://api.github.com"
|
||||
|
||||
def get_contents(self, repo: str, path: str = "") -> list[dict]:
|
||||
"""Get contents of a directory or file.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
path: directory path (default: root)
|
||||
|
||||
Returns:
|
||||
List of content items
|
||||
"""
|
||||
def get_contents(self, repo: str, path: str = "", branch: Optional[str] = None) -> list[dict]:
|
||||
"""Get contents of a directory or file."""
|
||||
url = f"{self.base_url}/repos/{repo}/contents/{path}"
|
||||
if branch:
|
||||
url += f"?ref={branch}"
|
||||
|
||||
response = requests.get(url, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json()
|
||||
|
||||
def get_file_content(self, repo: str, path: str) -> str:
|
||||
"""Get content of a single file.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
path: file path
|
||||
|
||||
Returns:
|
||||
Decoded file content
|
||||
"""
|
||||
def get_file_content(self, repo: str, path: str, branch: Optional[str] = None) -> str:
|
||||
"""Get content of a single file."""
|
||||
url = f"{self.base_url}/repos/{repo}/contents/{path}"
|
||||
if branch:
|
||||
url += f"?ref={branch}"
|
||||
|
||||
response = requests.get(url, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
@@ -78,43 +64,34 @@ class GitHubIngestor:
|
||||
def get_all_files(
|
||||
self,
|
||||
repo: str,
|
||||
branch: Optional[str] = None,
|
||||
path: str = "",
|
||||
extensions: Optional[list[str]] = None,
|
||||
exclude_dirs: Optional[list[str]] = None,
|
||||
include_all: bool = True,
|
||||
) -> dict[str, str]:
|
||||
"""Get all files from a repository - INCLUDING SOURCE CODE.
|
||||
|
||||
The AI witnesses EVERYTHING and transforms it into documentation.
|
||||
Don't filter what the AI can see - let it decide what's relevant.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
extensions: File extensions to include (None = ALL files!)
|
||||
exclude_dirs: Directories to exclude (build artifacts, etc.)
|
||||
include_all: If True, include ALL files (default True!)
|
||||
|
||||
Returns:
|
||||
Dictionary mapping file paths to content
|
||||
"""
|
||||
# Default: include ALL files - the AI will witness everything!
|
||||
"""Get all files from a repository."""
|
||||
if include_all:
|
||||
extensions = None # No extension filter
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build", "*.egg-info"]
|
||||
extensions = None
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github", "dist", "build"]
|
||||
else:
|
||||
extensions = extensions or [".md", ".txt", ".text", ".notes", ".draft", ".rst"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules", "__pycache__", ".github"]
|
||||
extensions = extensions or [".md", ".txt"]
|
||||
exclude_dirs = exclude_dirs or [".git", "node_modules"]
|
||||
|
||||
files = {}
|
||||
|
||||
def walk_directory(path: str = ""):
|
||||
contents = self.get_contents(repo, path)
|
||||
def walk_directory(current_path: str = ""):
|
||||
try:
|
||||
contents = self.get_contents(repo, current_path, branch)
|
||||
except Exception as e:
|
||||
print(f"Error walking {current_path}: {e}")
|
||||
return
|
||||
|
||||
if isinstance(contents, dict):
|
||||
# Single file
|
||||
if contents.get("type") == "file":
|
||||
content_path = contents["path"]
|
||||
if self._should_include(content_path, extensions, exclude_dirs, include_all):
|
||||
files[content_path] = self.get_file_content(repo, content_path)
|
||||
files[content_path] = self.get_file_content(repo, content_path, branch)
|
||||
return
|
||||
|
||||
for item in contents:
|
||||
@@ -122,14 +99,16 @@ class GitHubIngestor:
|
||||
item_type = item.get("type")
|
||||
|
||||
if item_type == "dir":
|
||||
# Check if excluded
|
||||
if not any(excl in item_path for excl in exclude_dirs):
|
||||
walk_directory(item_path)
|
||||
elif item_type == "file":
|
||||
if self._should_include(item_path, extensions, exclude_dirs, include_all):
|
||||
files[item_path] = self.get_file_content(repo, item_path)
|
||||
try:
|
||||
files[item_path] = self.get_file_content(repo, item_path, branch)
|
||||
except Exception as e:
|
||||
print(f"Error reading {item_path}: {e}")
|
||||
|
||||
walk_directory()
|
||||
walk_directory(path)
|
||||
return files
|
||||
|
||||
def _should_include(
|
||||
@@ -139,73 +118,41 @@ class GitHubIngestor:
|
||||
exclude_dirs: list[str],
|
||||
include_all: bool = True,
|
||||
) -> bool:
|
||||
"""Check if file should be included.
|
||||
|
||||
Args:
|
||||
path: File path to check
|
||||
extensions: List of extensions (None if include_all=True)
|
||||
exclude_dirs: Directories to exclude
|
||||
include_all: Include ALL files (ignore extensions)
|
||||
"""
|
||||
# Exclude directories
|
||||
"""Check if file should be included."""
|
||||
for excl in exclude_dirs:
|
||||
if excl in path:
|
||||
return False
|
||||
|
||||
# If include_all, include everything
|
||||
if include_all:
|
||||
return True
|
||||
|
||||
# Otherwise check extensions
|
||||
if extensions:
|
||||
return any(path.endswith(ext) for ext in extensions)
|
||||
|
||||
return True
|
||||
|
||||
def extract_text_from_files(self, files: dict[str, str]) -> str:
|
||||
"""Combine all file contents into a single text blob.
|
||||
|
||||
Args:
|
||||
files: Dictionary of filename -> content
|
||||
|
||||
Returns:
|
||||
Combined text
|
||||
"""
|
||||
"""Combine all file contents."""
|
||||
combined = []
|
||||
|
||||
for filename, content in sorted(files.items()):
|
||||
combined.append(f"=== {filename} ===\n")
|
||||
combined.append(content)
|
||||
combined.append("\n\n")
|
||||
|
||||
return "".join(combined)
|
||||
|
||||
def ingest_repo(
|
||||
self,
|
||||
repo: str,
|
||||
branch: Optional[str] = None,
|
||||
path: str = "",
|
||||
include_readme: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
"""Ingest a complete repository.
|
||||
|
||||
Args:
|
||||
repo: "owner/repo" format
|
||||
include_readme: Include README.md files
|
||||
|
||||
Returns:
|
||||
Dictionary with files, combined_text, and metadata
|
||||
"""
|
||||
# Get all markdown and text files
|
||||
files = self.get_all_files(repo)
|
||||
|
||||
# Optionally exclude README
|
||||
"""Ingest a complete repository."""
|
||||
files = self.get_all_files(repo, branch, path)
|
||||
if not include_readme:
|
||||
files = {k: v for k, v in files.items() if "README" not in k}
|
||||
|
||||
# Combine into single text
|
||||
combined = self.extract_text_from_files(files)
|
||||
|
||||
return {
|
||||
"repo": repo,
|
||||
"branch": branch,
|
||||
"path": path,
|
||||
"files": files,
|
||||
"combined_text": combined,
|
||||
"file_count": len(files),
|
||||
|
||||
@@ -83,6 +83,9 @@ class LLMClient:
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
# STRICT ENGLISH ENFORCEMENT
|
||||
system_prompt += "\n\nIMPORTANT: You must respond ONLY in English. Do not use Chinese characters or any other language under any circumstances."
|
||||
|
||||
if self.provider == "minimax":
|
||||
return self._complete_minimax_sync(
|
||||
@@ -110,6 +113,10 @@ class LLMClient:
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
# STRICT ENGLISH ENFORCEMENT
|
||||
nonlocal system_prompt
|
||||
system_prompt += "\n\nIMPORTANT: You must respond ONLY in English. Do not use Chinese characters or any other language under any circumstances."
|
||||
|
||||
if self.provider == "minimax":
|
||||
return await self._complete_minimax_async(
|
||||
system_prompt, user_prompt, temperature, max_tokens, headers
|
||||
@@ -140,8 +147,8 @@ class LLMClient:
|
||||
# Anthropic-compatible format
|
||||
payload = {
|
||||
"model": self.minimax_model,
|
||||
"system": system_prompt,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"temperature": temperature,
|
||||
@@ -167,8 +174,15 @@ class LLMClient:
|
||||
|
||||
# Handle Anthropic-compatible response format
|
||||
if "content" in data:
|
||||
# Return the text content
|
||||
if isinstance(data["content"], list) and len(data["content"]) > 0:
|
||||
# Look for text content, skip thinking
|
||||
text_parts = []
|
||||
for item in data["content"]:
|
||||
if item.get("type") == "text":
|
||||
text_parts.append(item.get("text", ""))
|
||||
if text_parts:
|
||||
return "".join(text_parts)
|
||||
# If no text found, return first item's text or the item itself
|
||||
return data["content"][0].get("text", str(data["content"][0]))
|
||||
return str(data["content"])
|
||||
else:
|
||||
@@ -224,8 +238,8 @@ class LLMClient:
|
||||
"""Call MiniMax API (sync) using Anthropic-compatible endpoint."""
|
||||
payload = {
|
||||
"model": self.minimax_model,
|
||||
"system": system_prompt,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"temperature": temperature,
|
||||
|
||||
@@ -5,7 +5,7 @@ Merges and deduplicates content intelligently.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, list
|
||||
from typing import Optional, List
|
||||
from enum import Enum
|
||||
import hashlib
|
||||
|
||||
@@ -144,7 +144,7 @@ class MultiSourceIngestor:
|
||||
|
||||
ingestor = GitHubIngestor(token=self.github_token)
|
||||
|
||||
content = await ingestor.ingest_repo(
|
||||
content = ingestor.ingest_repo(
|
||||
repo=source.repo,
|
||||
branch=source.branch or "main",
|
||||
path=source.path or "",
|
||||
|
||||
Reference in New Issue
Block a user